recoverd: Use talloc_array_length() for simpler code
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value built by timeval_current_ofs() from the takeover_timeout
 * tunable.  Expands against a variable named "ctdb" in the calling scope.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* Seconds argument used when re-arming the gratuitous ARP timer. */
#define CTDB_ARP_INTERVAL 1
/* Number of ARP rounds after which the ARP state is freed. */
#define CTDB_ARP_REPEAT   3
35
/*
 * Flags used in IP allocation algorithms.
 *
 * NOTE(review): the exact meaning of each flag is not visible in this
 * part of the file; presumably "noiptakeover" forbids taking over
 * additional IPs and "noiphost" forbids hosting any IP - confirm
 * against the allocation code.
 */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
};
41
/* One network interface tracked by the daemon (element of ctdb->ifaces). */
struct ctdb_iface {
        /* doubly-linked list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name; lookups compare it with strcmp() */
        const char *name;
        /* only interfaces with link_up set are picked for public IPs */
        bool link_up;
        /* number of vnns currently assigned to this interface */
        uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;
285         uint32_t count;
286         ctdb_sock_addr addr;
287         struct ctdb_tcp_array *tcparray;
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;
307         uint32_t client_id;
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
400 struct takeover_callback_state {
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/* Private data handed to ctdb_do_takeip_callback(). */
struct ctdb_do_takeip_state {
        /* control request to reply to when the takeip event finishes */
        struct ctdb_req_control *c;
        /* vnn being taken over; update_in_flight is cleared on free */
        struct ctdb_vnn *vnn;
};
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          false,
514                                          CTDB_EVENT_TAKE_IP,
515                                          "%s %s %u",
516                                          ctdb_vnn_iface_string(vnn),
517                                          ctdb_addr_to_str(&vnn->public_address),
518                                          vnn->public_netmask_bits);
519
520         if (ret != 0) {
521                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
522                         ctdb_addr_to_str(&vnn->public_address),
523                         ctdb_vnn_iface_string(vnn)));
524                 talloc_free(state);
525                 return -1;
526         }
527
528         return 0;
529 }
530
/* Private data handed to ctdb_do_updateip_callback(). */
struct ctdb_do_updateip_state {
        /* control request to reply to when the updateip event finishes */
        struct ctdb_req_control *c;
        /* interface the address was on before the move; restored on failure */
        struct ctdb_iface *old;
        /* vnn being moved */
        struct ctdb_vnn *vnn;
};
536
537 /*
538   called when updateip event finishes
539  */
540 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541                                       void *private_data)
542 {
543         struct ctdb_do_updateip_state *state =
544                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545         int32_t ret;
546
547         if (status != 0) {
548                 if (status == -ETIME) {
549                         ctdb_ban_self(ctdb);
550                 }
551                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552                         ctdb_addr_to_str(&state->vnn->public_address),
553                         state->old->name,
554                         ctdb_vnn_iface_string(state->vnn)));
555
556                 /*
557                  * All we can do is reset the old interface
558                  * and let the next run fix it
559                  */
560                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561                 state->vnn->iface = state->old;
562                 state->vnn->iface->references++;
563
564                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565                 talloc_free(state);
566                 return;
567         }
568
569         if (ctdb->do_checkpublicip) {
570
571         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572         if (ret != 0) {
573                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574                 talloc_free(state);
575                 return;
576         }
577
578         }
579
580         /* the control succeeded */
581         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582         talloc_free(state);
583         return;
584 }
585
586 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
587 {
588         state->vnn->update_in_flight = false;
589         return 0;
590 }
591
592 /*
593   update (move) an ip address
594  */
595 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596                                 struct ctdb_req_control *c,
597                                 struct ctdb_vnn *vnn)
598 {
599         int ret;
600         struct ctdb_do_updateip_state *state;
601         struct ctdb_iface *old = vnn->iface;
602         const char *new_name;
603
604         if (vnn->update_in_flight) {
605                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606                                     "update for this IP already in flight\n",
607                                     ctdb_addr_to_str(&vnn->public_address),
608                                     vnn->public_netmask_bits));
609                 return -1;
610         }
611
612         ctdb_vnn_unassign_iface(ctdb, vnn);
613         ret = ctdb_vnn_assign_iface(ctdb, vnn);
614         if (ret != 0) {
615                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616                                  "assin a usable interface (old iface '%s')\n",
617                                  ctdb_addr_to_str(&vnn->public_address),
618                                  vnn->public_netmask_bits,
619                                  old->name));
620                 return -1;
621         }
622
623         new_name = ctdb_vnn_iface_string(vnn);
624         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625                 /* A benign update from one interface onto itself.
626                  * no need to run the eventscripts in this case, just return
627                  * success.
628                  */
629                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630                 return 0;
631         }
632
633         state = talloc(vnn, struct ctdb_do_updateip_state);
634         CTDB_NO_MEMORY(ctdb, state);
635
636         state->c = talloc_steal(ctdb, c);
637         state->old = old;
638         state->vnn = vnn;
639
640         vnn->update_in_flight = true;
641         talloc_set_destructor(state, ctdb_updateip_destructor);
642
643         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644                             "interface %s to %s\n",
645                             ctdb_addr_to_str(&vnn->public_address),
646                             vnn->public_netmask_bits,
647                             old->name,
648                             new_name));
649
650         ret = ctdb_event_script_callback(ctdb,
651                                          state,
652                                          ctdb_do_updateip_callback,
653                                          state,
654                                          false,
655                                          CTDB_EVENT_UPDATE_IP,
656                                          "%s %s %s %u",
657                                          state->old->name,
658                                          new_name,
659                                          ctdb_addr_to_str(&vnn->public_address),
660                                          vnn->public_netmask_bits);
661         if (ret != 0) {
662                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
663                                  ctdb_addr_to_str(&vnn->public_address),
664                                  old->name, new_name));
665                 talloc_free(state);
666                 return -1;
667         }
668
669         return 0;
670 }
671
672 /*
673   Find the vnn of the node that has a public ip address
674   returns -1 if the address is not known as a public address
675  */
676 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
677 {
678         struct ctdb_vnn *vnn;
679
680         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681                 if (ctdb_same_ip(&vnn->public_address, addr)) {
682                         return vnn;
683                 }
684         }
685
686         return NULL;
687 }
688
689 /*
690   take over an ip address
691  */
692 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693                                  struct ctdb_req_control *c,
694                                  TDB_DATA indata,
695                                  bool *async_reply)
696 {
697         int ret;
698         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699         struct ctdb_vnn *vnn;
700         bool have_ip = false;
701         bool do_updateip = false;
702         bool do_takeip = false;
703         struct ctdb_iface *best_iface = NULL;
704
705         if (pip->pnn != ctdb->pnn) {
706                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707                                  "with pnn %d, but we're node %d\n",
708                                  ctdb_addr_to_str(&pip->addr),
709                                  pip->pnn, ctdb->pnn));
710                 return -1;
711         }
712
713         /* update out vnn list */
714         vnn = find_public_ip_vnn(ctdb, &pip->addr);
715         if (vnn == NULL) {
716                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717                         ctdb_addr_to_str(&pip->addr)));
718                 return 0;
719         }
720
721         if (ctdb->do_checkpublicip) {
722                 have_ip = ctdb_sys_have_ip(&pip->addr);
723         }
724         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725         if (best_iface == NULL) {
726                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727                                  "a usable interface (old %s, have_ip %d)\n",
728                                  ctdb_addr_to_str(&vnn->public_address),
729                                  vnn->public_netmask_bits,
730                                  ctdb_vnn_iface_string(vnn),
731                                  have_ip));
732                 return -1;
733         }
734
735         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737                 have_ip = false;
738         }
739
740
741         if (vnn->iface == NULL && have_ip) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744                                  ctdb_addr_to_str(&vnn->public_address)));
745                 return 0;
746         }
747
748         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750                                   "and we have it on iface[%s], but it was assigned to node %d"
751                                   "and we are node %d, banning ourself\n",
752                                  ctdb_addr_to_str(&vnn->public_address),
753                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754                 ctdb_ban_self(ctdb);
755                 return -1;
756         }
757
758         if (vnn->pnn == -1 && have_ip) {
759                 vnn->pnn = ctdb->pnn;
760                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761                                   "and we already have it on iface[%s], update local daemon\n",
762                                  ctdb_addr_to_str(&vnn->public_address),
763                                   ctdb_vnn_iface_string(vnn)));
764                 return 0;
765         }
766
767         if (vnn->iface) {
768                 if (vnn->iface != best_iface) {
769                         if (!vnn->iface->link_up) {
770                                 do_updateip = true;
771                         } else if (vnn->iface->references > (best_iface->references + 1)) {
772                                 /* only move when the rebalance gains something */
773                                         do_updateip = true;
774                         }
775                 }
776         }
777
778         if (!have_ip) {
779                 if (do_updateip) {
780                         ctdb_vnn_unassign_iface(ctdb, vnn);
781                         do_updateip = false;
782                 }
783                 do_takeip = true;
784         }
785
786         if (do_takeip) {
787                 ret = ctdb_do_takeip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else if (do_updateip) {
792                 ret = ctdb_do_updateip(ctdb, c, vnn);
793                 if (ret != 0) {
794                         return -1;
795                 }
796         } else {
797                 /*
798                  * The interface is up and the kernel known the ip
799                  * => do nothing
800                  */
801                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802                         ctdb_addr_to_str(&pip->addr),
803                         vnn->public_netmask_bits,
804                         ctdb_vnn_iface_string(vnn)));
805                 return 0;
806         }
807
808         /* tell ctdb_control.c that we will be replying asynchronously */
809         *async_reply = true;
810
811         return 0;
812 }
813
814 /*
815   takeover an ip address old v4 style
816  */
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
818                                 struct ctdb_req_control *c,
819                                 TDB_DATA indata, 
820                                 bool *async_reply)
821 {
822         TDB_DATA data;
823         
824         data.dsize = sizeof(struct ctdb_public_ip);
825         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826         CTDB_NO_MEMORY(ctdb, data.dptr);
827         
828         memcpy(data.dptr, indata.dptr, indata.dsize);
829         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
830 }
831
832 /*
833   kill any clients that are registered with a IP that is being released
834  */
835 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
836 {
837         struct ctdb_client_ip *ip;
838
839         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
840                 ctdb_addr_to_str(addr)));
841
842         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
843                 ctdb_sock_addr tmp_addr;
844
845                 tmp_addr = ip->addr;
846                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
847                         ip->client_id,
848                         ctdb_addr_to_str(&ip->addr)));
849
850                 if (ctdb_same_ip(&tmp_addr, addr)) {
851                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
852                                                                      ip->client_id, 
853                                                                      struct ctdb_client);
854                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
855                                 ip->client_id,
856                                 ctdb_addr_to_str(&ip->addr),
857                                 client->pid));
858
859                         if (client->pid != 0) {
860                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
861                                         (unsigned)client->pid,
862                                         ctdb_addr_to_str(addr),
863                                         ip->client_id));
864                                 ctdb_kill(ctdb, client->pid, SIGKILL);
865                         }
866                 }
867         }
868 }
869
870 /*
871   called when releaseip event finishes
872  */
873 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
874                                 void *private_data)
875 {
876         struct takeover_callback_state *state = 
877                 talloc_get_type(private_data, struct takeover_callback_state);
878         TDB_DATA data;
879
880         if (status == -ETIME) {
881                 ctdb_ban_self(ctdb);
882         }
883
884         /* send a message to all clients of this node telling them
885            that the cluster has been reconfigured and they should
886            release any sockets on this IP */
887         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
888         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
889         data.dsize = strlen((char *)data.dptr)+1;
890
891         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
892
893         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
894
895         /* kill clients that have registered with this IP */
896         release_kill_clients(ctdb, state->addr);
897
898         ctdb_vnn_unassign_iface(ctdb, state->vnn);
899
900         /* the control succeeded */
901         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
902         talloc_free(state);
903 }
904
905 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
906 {
907         state->vnn->update_in_flight = false;
908         return 0;
909 }
910
911 /*
912   release an ip address
913  */
914 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
915                                 struct ctdb_req_control *c,
916                                 TDB_DATA indata, 
917                                 bool *async_reply)
918 {
919         int ret;
920         struct takeover_callback_state *state;
921         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
922         struct ctdb_vnn *vnn;
923         char *iface;
924
925         /* update our vnn list */
926         vnn = find_public_ip_vnn(ctdb, &pip->addr);
927         if (vnn == NULL) {
928                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
929                         ctdb_addr_to_str(&pip->addr)));
930                 return 0;
931         }
932         vnn->pnn = pip->pnn;
933
934         /* stop any previous arps */
935         talloc_free(vnn->takeover_ctx);
936         vnn->takeover_ctx = NULL;
937
938         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
939          * lazy multicast to drop an IP from any node that isn't the
940          * intended new node.  The following causes makes ctdbd ignore
941          * a release for any address it doesn't host.
942          */
943         if (ctdb->do_checkpublicip) {
944                 if (!ctdb_sys_have_ip(&pip->addr)) {
945                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
946                                 ctdb_addr_to_str(&pip->addr),
947                                 vnn->public_netmask_bits,
948                                 ctdb_vnn_iface_string(vnn)));
949                         ctdb_vnn_unassign_iface(ctdb, vnn);
950                         return 0;
951                 }
952         } else {
953                 if (vnn->iface == NULL) {
954                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
955                                            ctdb_addr_to_str(&pip->addr),
956                                            vnn->public_netmask_bits));
957                         return 0;
958                 }
959         }
960
961         /* There is a potential race between take_ip and us because we
962          * update the VNN via a callback that run when the
963          * eventscripts have been run.  Avoid the race by allowing one
964          * update to be in flight at a time.
965          */
966         if (vnn->update_in_flight) {
967                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
968                                     "update for this IP already in flight\n",
969                                     ctdb_addr_to_str(&vnn->public_address),
970                                     vnn->public_netmask_bits));
971                 return -1;
972         }
973
974         if (ctdb->do_checkpublicip) {
975                 iface = ctdb_sys_find_ifname(&pip->addr);
976                 if (iface == NULL) {
977                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
978                         return 0;
979                 }
980         } else {
981                 iface = strdup(ctdb_vnn_iface_string(vnn));
982         }
983
984         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
985                 ctdb_addr_to_str(&pip->addr),
986                 vnn->public_netmask_bits,
987                 iface,
988                 pip->pnn));
989
990         state = talloc(ctdb, struct takeover_callback_state);
991         CTDB_NO_MEMORY(ctdb, state);
992
993         state->c = talloc_steal(state, c);
994         state->addr = talloc(state, ctdb_sock_addr);       
995         CTDB_NO_MEMORY(ctdb, state->addr);
996         *state->addr = pip->addr;
997         state->vnn   = vnn;
998
999         vnn->update_in_flight = true;
1000         talloc_set_destructor(state, ctdb_releaseip_destructor);
1001
1002         ret = ctdb_event_script_callback(ctdb, 
1003                                          state, release_ip_callback, state,
1004                                          false,
1005                                          CTDB_EVENT_RELEASE_IP,
1006                                          "%s %s %u",
1007                                          iface,
1008                                          ctdb_addr_to_str(&pip->addr),
1009                                          vnn->public_netmask_bits);
1010         free(iface);
1011         if (ret != 0) {
1012                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1013                         ctdb_addr_to_str(&pip->addr),
1014                         ctdb_vnn_iface_string(vnn)));
1015                 talloc_free(state);
1016                 return -1;
1017         }
1018
1019         /* tell the control that we will be reply asynchronously */
1020         *async_reply = true;
1021         return 0;
1022 }
1023
1024 /*
1025   release an ip address old v4 style
1026  */
1027 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1028                                 struct ctdb_req_control *c,
1029                                 TDB_DATA indata, 
1030                                 bool *async_reply)
1031 {
1032         TDB_DATA data;
1033         
1034         data.dsize = sizeof(struct ctdb_public_ip);
1035         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1036         CTDB_NO_MEMORY(ctdb, data.dptr);
1037         
1038         memcpy(data.dptr, indata.dptr, indata.dsize);
1039         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1040 }
1041
1042
1043 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1044                                    ctdb_sock_addr *addr,
1045                                    unsigned mask, const char *ifaces,
1046                                    bool check_address)
1047 {
1048         struct ctdb_vnn      *vnn;
1049         uint32_t num = 0;
1050         char *tmp;
1051         const char *iface;
1052         int i;
1053         int ret;
1054
1055         tmp = strdup(ifaces);
1056         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1057                 if (!ctdb_sys_check_iface_exists(iface)) {
1058                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1059                         free(tmp);
1060                         return -1;
1061                 }
1062         }
1063         free(tmp);
1064
1065         /* Verify that we dont have an entry for this ip yet */
1066         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1067                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1068                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1069                                 ctdb_addr_to_str(addr)));
1070                         return -1;
1071                 }               
1072         }
1073
1074         /* create a new vnn structure for this ip address */
1075         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1076         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1077         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1078         tmp = talloc_strdup(vnn, ifaces);
1079         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1080         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1081                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1082                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1083                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1084                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1085                 num++;
1086         }
1087         talloc_free(tmp);
1088         vnn->ifaces[num] = NULL;
1089         vnn->public_address      = *addr;
1090         vnn->public_netmask_bits = mask;
1091         vnn->pnn                 = -1;
1092         if (check_address) {
1093                 if (ctdb_sys_have_ip(addr)) {
1094                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1095                         vnn->pnn = ctdb->pnn;
1096                 }
1097         }
1098
1099         for (i=0; vnn->ifaces[i]; i++) {
1100                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1101                 if (ret != 0) {
1102                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1103                                            "for public_address[%s]\n",
1104                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1105                         talloc_free(vnn);
1106                         return -1;
1107                 }
1108         }
1109
1110         DLIST_ADD(ctdb->vnn, vnn);
1111
1112         return 0;
1113 }
1114
1115 /*
1116   setup the event script directory
1117 */
1118 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1119 {
1120         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1121         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1122         return 0;
1123 }
1124
1125 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1126                                   struct timeval t, void *private_data)
1127 {
1128         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1129                                                         struct ctdb_context);
1130         struct ctdb_vnn *vnn;
1131
1132         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1133                 int i;
1134
1135                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1136                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1137                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1138                                         vnn->ifaces[i],
1139                                         ctdb_addr_to_str(&vnn->public_address)));
1140                         }
1141                 }
1142         }
1143
1144         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1145                 timeval_current_ofs(30, 0), 
1146                 ctdb_check_interfaces_event, ctdb);
1147 }
1148
1149
1150 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1151 {
1152         if (ctdb->check_public_ifaces_ctx != NULL) {
1153                 talloc_free(ctdb->check_public_ifaces_ctx);
1154                 ctdb->check_public_ifaces_ctx = NULL;
1155         }
1156
1157         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1158         if (ctdb->check_public_ifaces_ctx == NULL) {
1159                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1160         }
1161
1162         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1163                 timeval_current_ofs(30, 0), 
1164                 ctdb_check_interfaces_event, ctdb);
1165
1166         return 0;
1167 }
1168
1169
1170 /*
1171   setup the public address lists from a file
1172 */
1173 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1174 {
1175         char **lines;
1176         int nlines;
1177         int i;
1178
1179         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1180         if (lines == NULL) {
1181                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1182                 return -1;
1183         }
1184         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1185                 nlines--;
1186         }
1187
1188         for (i=0;i<nlines;i++) {
1189                 unsigned mask;
1190                 ctdb_sock_addr addr;
1191                 const char *addrstr;
1192                 const char *ifaces;
1193                 char *tok, *line;
1194
1195                 line = lines[i];
1196                 while ((*line == ' ') || (*line == '\t')) {
1197                         line++;
1198                 }
1199                 if (*line == '#') {
1200                         continue;
1201                 }
1202                 if (strcmp(line, "") == 0) {
1203                         continue;
1204                 }
1205                 tok = strtok(line, " \t");
1206                 addrstr = tok;
1207                 tok = strtok(NULL, " \t");
1208                 if (tok == NULL) {
1209                         if (NULL == ctdb->default_public_interface) {
1210                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1211                                          i+1));
1212                                 talloc_free(lines);
1213                                 return -1;
1214                         }
1215                         ifaces = ctdb->default_public_interface;
1216                 } else {
1217                         ifaces = tok;
1218                 }
1219
1220                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1221                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1226                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1227                         talloc_free(lines);
1228                         return -1;
1229                 }
1230         }
1231
1232
1233         talloc_free(lines);
1234         return 0;
1235 }
1236
1237 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1238                               const char *iface,
1239                               const char *ip)
1240 {
1241         struct ctdb_vnn *svnn;
1242         struct ctdb_iface *cur = NULL;
1243         bool ok;
1244         int ret;
1245
1246         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1247         CTDB_NO_MEMORY(ctdb, svnn);
1248
1249         svnn->ifaces = talloc_array(svnn, const char *, 2);
1250         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1251         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1252         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1253         svnn->ifaces[1] = NULL;
1254
1255         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1256         if (!ok) {
1257                 talloc_free(svnn);
1258                 return -1;
1259         }
1260
1261         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1262         if (ret != 0) {
1263                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1264                                    "for single_ip[%s]\n",
1265                                    svnn->ifaces[0],
1266                                    ctdb_addr_to_str(&svnn->public_address)));
1267                 talloc_free(svnn);
1268                 return -1;
1269         }
1270
1271         /* assume the single public ip interface is initially "good" */
1272         cur = ctdb_find_iface(ctdb, iface);
1273         if (cur == NULL) {
1274                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1275                 return -1;
1276         }
1277         cur->link_up = true;
1278
1279         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1280         if (ret != 0) {
1281                 talloc_free(svnn);
1282                 return -1;
1283         }
1284
1285         ctdb->single_ip_vnn = svnn;
1286         return 0;
1287 }
1288
1289 /* Given a physical node, return the number of
1290    public addresses that is currently assigned to this node.
1291 */
1292 static int node_ip_coverage(struct ctdb_context *ctdb, 
1293         int32_t pnn,
1294         struct ctdb_public_ip_list *ips)
1295 {
1296         int num=0;
1297
1298         for (;ips;ips=ips->next) {
1299                 if (ips->pnn == pnn) {
1300                         num++;
1301                 }
1302         }
1303         return num;
1304 }
1305
1306
1307 /* Can the given node host the given IP: is the public IP known to the
1308  * node and is NOIPHOST unset?
1309 */
1310 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1311                              struct ctdb_ipflags ipflags,
1312                              struct ctdb_public_ip_list *ip)
1313 {
1314         struct ctdb_all_public_ips *public_ips;
1315         int i;
1316
1317         if (ipflags.noiphost) {
1318                 return false;
1319         }
1320
1321         public_ips = ctdb->nodes[pnn]->available_public_ips;
1322
1323         if (public_ips == NULL) {
1324                 return false;
1325         }
1326
1327         for (i=0;i<public_ips->num;i++) {
1328                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1329                         /* yes, this node can serve this public ip */
1330                         return true;
1331                 }
1332         }
1333
1334         return false;
1335 }
1336
1337 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1338                                  struct ctdb_ipflags ipflags,
1339                                  struct ctdb_public_ip_list *ip)
1340 {
1341         if (ipflags.noiptakeover) {
1342                 return false;
1343         }
1344
1345         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1346 }
1347
1348 /* search the node lists list for a node to takeover this ip.
1349    pick the node that currently are serving the least number of ips
1350    so that the ips get spread out evenly.
1351 */
1352 static int find_takeover_node(struct ctdb_context *ctdb, 
1353                 struct ctdb_ipflags *ipflags,
1354                 struct ctdb_public_ip_list *ip,
1355                 struct ctdb_public_ip_list *all_ips)
1356 {
1357         int pnn, min=0, num;
1358         int i, numnodes;
1359
1360         numnodes = talloc_array_length(ipflags);
1361         pnn    = -1;
1362         for (i=0;i<numnodes;i++) {
1363                 /* verify that this node can serve this ip */
1364                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1365                         /* no it couldnt   so skip to the next node */
1366                         continue;
1367                 }
1368
1369                 num = node_ip_coverage(ctdb, i, all_ips);
1370                 /* was this the first node we checked ? */
1371                 if (pnn == -1) {
1372                         pnn = i;
1373                         min  = num;
1374                 } else {
1375                         if (num < min) {
1376                                 pnn = i;
1377                                 min  = num;
1378                         }
1379                 }
1380         }       
1381         if (pnn == -1) {
1382                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1383                         ctdb_addr_to_str(&ip->addr)));
1384
1385                 return -1;
1386         }
1387
1388         ip->pnn = pnn;
1389         return 0;
1390 }
1391
1392 #define IP_KEYLEN       4
1393 static uint32_t *ip_key(ctdb_sock_addr *ip)
1394 {
1395         static uint32_t key[IP_KEYLEN];
1396
1397         bzero(key, sizeof(key));
1398
1399         switch (ip->sa.sa_family) {
1400         case AF_INET:
1401                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1402                 break;
1403         case AF_INET6: {
1404                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1405                 key[0]  = htonl(s6_a32[0]);
1406                 key[1]  = htonl(s6_a32[1]);
1407                 key[2]  = htonl(s6_a32[2]);
1408                 key[3]  = htonl(s6_a32[3]);
1409                 break;
1410         }
1411         default:
1412                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1413                 return key;
1414         }
1415
1416         return key;
1417 }
1418
1419 static void *add_ip_callback(void *parm, void *data)
1420 {
1421         struct ctdb_public_ip_list *this_ip = parm; 
1422         struct ctdb_public_ip_list *prev_ip = data; 
1423
1424         if (prev_ip == NULL) {
1425                 return parm;
1426         }
1427         if (this_ip->pnn == -1) {
1428                 this_ip->pnn = prev_ip->pnn;
1429         }
1430
1431         return parm;
1432 }
1433
1434 static int getips_count_callback(void *param, void *data)
1435 {
1436         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1437         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1438
1439         new_ip->next = *ip_list;
1440         *ip_list     = new_ip;
1441         return 0;
1442 }
1443
1444 static struct ctdb_public_ip_list *
1445 create_merged_ip_list(struct ctdb_context *ctdb)
1446 {
1447         int i, j;
1448         struct ctdb_public_ip_list *ip_list;
1449         struct ctdb_all_public_ips *public_ips;
1450
1451         if (ctdb->ip_tree != NULL) {
1452                 talloc_free(ctdb->ip_tree);
1453                 ctdb->ip_tree = NULL;
1454         }
1455         ctdb->ip_tree = trbt_create(ctdb, 0);
1456
1457         for (i=0;i<ctdb->num_nodes;i++) {
1458                 public_ips = ctdb->nodes[i]->known_public_ips;
1459
1460                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1461                         continue;
1462                 }
1463
1464                 /* there were no public ips for this node */
1465                 if (public_ips == NULL) {
1466                         continue;
1467                 }               
1468
1469                 for (j=0;j<public_ips->num;j++) {
1470                         struct ctdb_public_ip_list *tmp_ip; 
1471
1472                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1473                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1474                         /* Do not use information about IP addresses hosted
1475                          * on other nodes, it may not be accurate */
1476                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1477                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1478                         } else {
1479                                 tmp_ip->pnn = -1;
1480                         }
1481                         tmp_ip->addr = public_ips->ips[j].addr;
1482                         tmp_ip->next = NULL;
1483
1484                         trbt_insertarray32_callback(ctdb->ip_tree,
1485                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1486                                 add_ip_callback,
1487                                 tmp_ip);
1488                 }
1489         }
1490
1491         ip_list = NULL;
1492         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1493
1494         return ip_list;
1495 }
1496
1497 /* 
1498  * This is the length of the longtest common prefix between the IPs.
1499  * It is calculated by XOR-ing the 2 IPs together and counting the
1500  * number of leading zeroes.  The implementation means that all
1501  * addresses end up being 128 bits long.
1502  *
1503  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1504  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1505  * lots of nodes and IP addresses?
1506  */
1507 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1508 {
1509         uint32_t ip1_k[IP_KEYLEN];
1510         uint32_t *t;
1511         int i;
1512         uint32_t x;
1513
1514         uint32_t distance = 0;
1515
1516         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1517         t = ip_key(ip2);
1518         for (i=0; i<IP_KEYLEN; i++) {
1519                 x = ip1_k[i] ^ t[i];
1520                 if (x == 0) {
1521                         distance += 32;
1522                 } else {
1523                         /* Count number of leading zeroes. 
1524                          * FIXME? This could be optimised...
1525                          */
1526                         while ((x & (1 << 31)) == 0) {
1527                                 x <<= 1;
1528                                 distance += 1;
1529                         }
1530                 }
1531         }
1532
1533         return distance;
1534 }
1535
1536 /* Calculate the IP distance for the given IP relative to IPs on the
1537    given node.  The ips argument is generally the all_ips variable
1538    used in the main part of the algorithm.
1539  */
1540 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1541                                   struct ctdb_public_ip_list *ips,
1542                                   int pnn)
1543 {
1544         struct ctdb_public_ip_list *t;
1545         uint32_t d;
1546
1547         uint32_t sum = 0;
1548
1549         for (t=ips; t != NULL; t=t->next) {
1550                 if (t->pnn != pnn) {
1551                         continue;
1552                 }
1553
1554                 /* Optimisation: We never calculate the distance
1555                  * between an address and itself.  This allows us to
1556                  * calculate the effect of removing an address from a
1557                  * node by simply calculating the distance between
1558                  * that address and all of the exitsing addresses.
1559                  * Moreover, we assume that we're only ever dealing
1560                  * with addresses from all_ips so we can identify an
1561                  * address via a pointer rather than doing a more
1562                  * expensive address comparison. */
1563                 if (&(t->addr) == ip) {
1564                         continue;
1565                 }
1566
1567                 d = ip_distance(ip, &(t->addr));
1568                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1569         }
1570
1571         return sum;
1572 }
1573
1574 /* Return the LCP2 imbalance metric for addresses currently assigned
1575    to the given node.
1576  */
1577 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1578 {
1579         struct ctdb_public_ip_list *t;
1580
1581         uint32_t imbalance = 0;
1582
1583         for (t=all_ips; t!=NULL; t=t->next) {
1584                 if (t->pnn != pnn) {
1585                         continue;
1586                 }
1587                 /* Pass the rest of the IPs rather than the whole
1588                    all_ips input list.
1589                 */
1590                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1591         }
1592
1593         return imbalance;
1594 }
1595
1596 /* Allocate any unassigned IPs just by looping through the IPs and
1597  * finding the best node for each.
1598  */
1599 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1600                                       struct ctdb_ipflags *ipflags,
1601                                       struct ctdb_public_ip_list *all_ips)
1602 {
1603         struct ctdb_public_ip_list *tmp_ip;
1604
1605         /* loop over all ip's and find a physical node to cover for 
1606            each unassigned ip.
1607         */
1608         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1609                 if (tmp_ip->pnn == -1) {
1610                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1611                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1612                                         ctdb_addr_to_str(&tmp_ip->addr)));
1613                         }
1614                 }
1615         }
1616 }
1617
1618 /* Basic non-deterministic rebalancing algorithm.
1619  */
1620 static void basic_failback(struct ctdb_context *ctdb,
1621                            struct ctdb_ipflags *ipflags,
1622                            struct ctdb_public_ip_list *all_ips,
1623                            int num_ips)
1624 {
1625         int i, numnodes;
1626         int maxnode, maxnum, minnode, minnum, num, retries;
1627         struct ctdb_public_ip_list *tmp_ip;
1628
1629         numnodes = talloc_array_length(ipflags);
1630         retries = 0;
1631
1632 try_again:
1633         maxnum=0;
1634         minnum=0;
1635
1636         /* for each ip address, loop over all nodes that can serve
1637            this ip and make sure that the difference between the node
1638            serving the most and the node serving the least ip's are
1639            not greater than 1.
1640         */
1641         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1642                 if (tmp_ip->pnn == -1) {
1643                         continue;
1644                 }
1645
1646                 /* Get the highest and lowest number of ips's served by any 
1647                    valid node which can serve this ip.
1648                 */
1649                 maxnode = -1;
1650                 minnode = -1;
1651                 for (i=0;i<numnodes;i++) {
1652                         /* only check nodes that can actually serve this ip */
1653                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1654                                 /* no it couldnt   so skip to the next node */
1655                                 continue;
1656                         }
1657
1658                         num = node_ip_coverage(ctdb, i, all_ips);
1659                         if (maxnode == -1) {
1660                                 maxnode = i;
1661                                 maxnum  = num;
1662                         } else {
1663                                 if (num > maxnum) {
1664                                         maxnode = i;
1665                                         maxnum  = num;
1666                                 }
1667                         }
1668                         if (minnode == -1) {
1669                                 minnode = i;
1670                                 minnum  = num;
1671                         } else {
1672                                 if (num < minnum) {
1673                                         minnode = i;
1674                                         minnum  = num;
1675                                 }
1676                         }
1677                 }
1678                 if (maxnode == -1) {
1679                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1680                                 ctdb_addr_to_str(&tmp_ip->addr)));
1681
1682                         continue;
1683                 }
1684
1685                 /* if the spread between the smallest and largest coverage by
1686                    a node is >=2 we steal one of the ips from the node with
1687                    most coverage to even things out a bit.
1688                    try to do this a limited number of times since we dont
1689                    want to spend too much time balancing the ip coverage.
1690                 */
1691                 if ( (maxnum > minnum+1)
1692                      && (retries < (num_ips + 5)) ){
1693                         struct ctdb_public_ip_list *tmp;
1694
1695                         /* Reassign one of maxnode's VNNs */
1696                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1697                                 if (tmp->pnn == maxnode) {
1698                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1699                                         retries++;
1700                                         goto try_again;;
1701                                 }
1702                         }
1703                 }
1704         }
1705 }
1706
1707 struct ctdb_rebalancenodes {
1708         struct ctdb_rebalancenodes *next;
1709         uint32_t pnn;
1710 };
1711 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1712
1713
1714 /* set this flag to force the node to be rebalanced even if it just didnt
1715    become healthy again.
1716 */
1717 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1718 {
1719         struct ctdb_rebalancenodes *rebalance;
1720
1721         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1722                 if (rebalance->pnn == pnn) {
1723                         return;
1724                 }
1725         }
1726
1727         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1728         rebalance->pnn = pnn;
1729         rebalance->next = force_rebalance_list;
1730         force_rebalance_list = rebalance;
1731 }
1732
1733 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1734  * that we can unit test it.
1735  */
1736 static void lcp2_init(struct ctdb_context *tmp_ctx,
1737                       struct ctdb_ipflags *ipflags,
1738                       struct ctdb_public_ip_list *all_ips,
1739                       uint32_t **lcp2_imbalances,
1740                       bool **rebalance_candidates)
1741 {
1742         int i, numnodes;
1743         struct ctdb_public_ip_list *tmp_ip;
1744
1745         numnodes = talloc_array_length(ipflags);
1746
1747         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1748         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1749         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1750         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1751
1752         for (i=0;i<numnodes;i++) {
1753                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1754                 /* First step: assume all nodes are candidates */
1755                 (*rebalance_candidates)[i] = true;
1756         }
1757
1758         /* 2nd step: if a node has IPs assigned then it must have been
1759          * healthy before, so we remove it from consideration.  This
1760          * is overkill but is all we have because we don't maintain
1761          * state between takeover runs.  An alternative would be to
1762          * keep state and invalidate it every time the recovery master
1763          * changes.
1764          */
1765         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1766                 if (tmp_ip->pnn != -1) {
1767                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1768                 }
1769         }
1770
1771         /* 3rd step: if a node is forced to re-balance then
1772            we allow failback onto the node */
1773         while (force_rebalance_list != NULL) {
1774                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1775
1776                 if (force_rebalance_list->pnn <= numnodes) {
1777                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1778                 }
1779
1780                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1781                 talloc_free(force_rebalance_list);
1782                 force_rebalance_list = next;
1783         }
1784 }
1785
1786 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1787  * the IP/node combination that will cost the least.
1788  */
1789 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1790                                      struct ctdb_ipflags *ipflags,
1791                                      struct ctdb_public_ip_list *all_ips,
1792                                      uint32_t *lcp2_imbalances)
1793 {
1794         struct ctdb_public_ip_list *tmp_ip;
1795         int dstnode, numnodes;
1796
1797         int minnode;
1798         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1799         struct ctdb_public_ip_list *minip;
1800
1801         bool should_loop = true;
1802         bool have_unassigned = true;
1803
1804         numnodes = talloc_array_length(ipflags);
1805
1806         while (have_unassigned && should_loop) {
1807                 should_loop = false;
1808
1809                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1810                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1811
1812                 minnode = -1;
1813                 mindsum = 0;
1814                 minip = NULL;
1815
1816                 /* loop over each unassigned ip. */
1817                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1818                         if (tmp_ip->pnn != -1) {
1819                                 continue;
1820                         }
1821
1822                         for (dstnode=0; dstnode < numnodes; dstnode++) {
1823                                 /* only check nodes that can actually takeover this ip */
1824                                 if (!can_node_takeover_ip(ctdb, dstnode,
1825                                                           ipflags[dstnode],
1826                                                           tmp_ip)) {
1827                                         /* no it couldnt   so skip to the next node */
1828                                         continue;
1829                                 }
1830
1831                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1832                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1833                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1834                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1835                                                    dstnode,
1836                                                    dstimbl - lcp2_imbalances[dstnode]));
1837
1838
1839                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1840                                         minnode = dstnode;
1841                                         minimbl = dstimbl;
1842                                         mindsum = dstdsum;
1843                                         minip = tmp_ip;
1844                                         should_loop = true;
1845                                 }
1846                         }
1847                 }
1848
1849                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1850
1851                 /* If we found one then assign it to the given node. */
1852                 if (minnode != -1) {
1853                         minip->pnn = minnode;
1854                         lcp2_imbalances[minnode] = minimbl;
1855                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1856                                           ctdb_addr_to_str(&(minip->addr)),
1857                                           minnode,
1858                                           mindsum));
1859                 }
1860
1861                 /* There might be a better way but at least this is clear. */
1862                 have_unassigned = false;
1863                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1864                         if (tmp_ip->pnn == -1) {
1865                                 have_unassigned = true;
1866                         }
1867                 }
1868         }
1869
1870         /* We know if we have an unassigned addresses so we might as
1871          * well optimise.
1872          */
1873         if (have_unassigned) {
1874                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1875                         if (tmp_ip->pnn == -1) {
1876                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1877                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1878                         }
1879                 }
1880         }
1881 }
1882
1883 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1884  * to move IPs from, determines the best IP/destination node
1885  * combination to move from the source node.
1886  */
1887 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1888                                     struct ctdb_ipflags *ipflags,
1889                                     struct ctdb_public_ip_list *all_ips,
1890                                     int srcnode,
1891                                     uint32_t candimbl,
1892                                     uint32_t *lcp2_imbalances,
1893                                     bool *rebalance_candidates)
1894 {
1895         int dstnode, mindstnode, numnodes;
1896         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1897         uint32_t minsrcimbl, mindstimbl;
1898         struct ctdb_public_ip_list *minip;
1899         struct ctdb_public_ip_list *tmp_ip;
1900
1901         /* Find an IP and destination node that best reduces imbalance. */
1902         minip = NULL;
1903         minsrcimbl = 0;
1904         mindstnode = -1;
1905         mindstimbl = 0;
1906
1907         numnodes = talloc_array_length(ipflags);
1908
1909         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1910         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1911
1912         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1913                 /* Only consider addresses on srcnode. */
1914                 if (tmp_ip->pnn != srcnode) {
1915                         continue;
1916                 }
1917
1918                 /* What is this IP address costing the source node? */
1919                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1920                 srcimbl = candimbl - srcdsum;
1921
1922                 /* Consider this IP address would cost each potential
1923                  * destination node.  Destination nodes are limited to
1924                  * those that are newly healthy, since we don't want
1925                  * to do gratuitous failover of IPs just to make minor
1926                  * balance improvements.
1927                  */
1928                 for (dstnode=0; dstnode < numnodes; dstnode++) {
1929                         if (!rebalance_candidates[dstnode]) {
1930                                 continue;
1931                         }
1932
1933                         /* only check nodes that can actually takeover this ip */
1934                         if (!can_node_takeover_ip(ctdb, dstnode,
1935                                                   ipflags[dstnode], tmp_ip)) {
1936                                 /* no it couldnt   so skip to the next node */
1937                                 continue;
1938                         }
1939
1940                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1941                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1942                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1943                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1944                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1945                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1946
1947                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1948                             ((mindstnode == -1) ||                              \
1949                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1950
1951                                 minip = tmp_ip;
1952                                 minsrcimbl = srcimbl;
1953                                 mindstnode = dstnode;
1954                                 mindstimbl = dstimbl;
1955                         }
1956                 }
1957         }
1958         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1959
1960         if (mindstnode != -1) {
1961                 /* We found a move that makes things better... */
1962                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1963                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1964                                   ctdb_addr_to_str(&(minip->addr)),
1965                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1966
1967
1968                 lcp2_imbalances[srcnode] = srcimbl;
1969                 lcp2_imbalances[mindstnode] = mindstimbl;
1970                 minip->pnn = mindstnode;
1971
1972                 return true;
1973         }
1974
1975         return false;
1976         
1977 }
1978
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders by
 * DESCENDING imbalance: returns -1 when a has the larger imbalance,
 * 1 when b has the larger imbalance and 0 when they are equal.  The
 * pnn field does not take part in the comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	/* Each comparison yields 0 or 1, so the difference is exactly
	 * -1, 0 or 1 and can not overflow.
	 */
	return (lhs->imbalance < rhs->imbalance) -
	       (lhs->imbalance > rhs->imbalance);
}
1997
1998 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1999  * node with the highest LCP2 imbalance, and then determines the best
2000  * IP/destination node combination to move from the source node.
2001  */
2002 static void lcp2_failback(struct ctdb_context *ctdb,
2003                           struct ctdb_ipflags *ipflags,
2004                           struct ctdb_public_ip_list *all_ips,
2005                           uint32_t *lcp2_imbalances,
2006                           bool *rebalance_candidates)
2007 {
2008         int i, num_rebalance_candidates, numnodes;
2009         struct lcp2_imbalance_pnn * lips;
2010         bool again;
2011
2012         numnodes = talloc_array_length(ipflags);
2013
2014 try_again:
2015
2016         /* It is only worth continuing if we have suitable target
2017          * nodes to transfer IPs to.  This check is much cheaper than
2018          * continuing on...
2019          */
2020         num_rebalance_candidates = 0;
2021         for (i = 0; i < numnodes; i++) {
2022                 if (rebalance_candidates[i]) {
2023                         num_rebalance_candidates++;
2024                 }
2025         }
2026         if (num_rebalance_candidates == 0) {
2027                 return;
2028         }
2029
2030         /* Put the imbalances and nodes into an array, sort them and
2031          * iterate through candidates.  Usually the 1st one will be
2032          * used, so this doesn't cost much...
2033          */
2034         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2035         for (i = 0; i < numnodes; i++) {
2036                 lips[i].imbalance = lcp2_imbalances[i];
2037                 lips[i].pnn = i;
2038         }
2039         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2040               lcp2_cmp_imbalance_pnn);
2041
2042         again = false;
2043         for (i = 0; i < numnodes; i++) {
2044                 /* This means that all nodes had 0 or 1 addresses, so
2045                  * can't be imbalanced.
2046                  */
2047                 if (lips[i].imbalance == 0) {
2048                         break;
2049                 }
2050
2051                 if (lcp2_failback_candidate(ctdb,
2052                                             ipflags,
2053                                             all_ips,
2054                                             lips[i].pnn,
2055                                             lips[i].imbalance,
2056                                             lcp2_imbalances,
2057                                             rebalance_candidates)) {
2058                         again = true;
2059                         break;
2060                 }
2061         }
2062
2063         talloc_free(lips);
2064         if (again) {
2065                 goto try_again;
2066         }
2067 }
2068
2069 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2070                                     struct ctdb_ipflags *ipflags,
2071                                     struct ctdb_public_ip_list *all_ips)
2072 {
2073         struct ctdb_public_ip_list *tmp_ip;
2074
2075         /* verify that the assigned nodes can serve that public ip
2076            and set it to -1 if not
2077         */
2078         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2079                 if (tmp_ip->pnn == -1) {
2080                         continue;
2081                 }
2082                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2083                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2084                         /* this node can not serve this ip. */
2085                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2086                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2087                                            tmp_ip->pnn));
2088                         tmp_ip->pnn = -1;
2089                 }
2090         }
2091 }
2092
2093 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2094                                        struct ctdb_ipflags *ipflags,
2095                                        struct ctdb_public_ip_list *all_ips)
2096 {
2097         struct ctdb_public_ip_list *tmp_ip;
2098         int i, numnodes;
2099
2100         numnodes = talloc_array_length(ipflags);
2101
2102         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2103        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2104         *  always be allocated the same way for a specific set of
2105         *  available/unavailable nodes.
2106         */
2107
2108         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2109                 tmp_ip->pnn = i%numnodes;
2110         }
2111
2112         /* IP failback doesn't make sense with deterministic
2113          * IPs, since the modulo step above implicitly fails
2114          * back IPs to their "home" node.
2115          */
2116         if (1 == ctdb->tunable.no_ip_failback) {
2117                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2118         }
2119
2120         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2121
2122         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2123
2124         /* No failback here! */
2125 }
2126
2127 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2128                                           struct ctdb_ipflags *ipflags,
2129                                           struct ctdb_public_ip_list *all_ips)
2130 {
2131         /* This should be pushed down into basic_failback. */
2132         struct ctdb_public_ip_list *tmp_ip;
2133         int num_ips = 0;
2134         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2135                 num_ips++;
2136         }
2137
2138         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2139
2140         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2141
2142         /* If we don't want IPs to fail back then don't rebalance IPs. */
2143         if (1 == ctdb->tunable.no_ip_failback) {
2144                 return;
2145         }
2146
2147         /* Now, try to make sure the ip adresses are evenly distributed
2148            across the nodes.
2149         */
2150         basic_failback(ctdb, ipflags, all_ips, num_ips);
2151 }
2152
2153 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2154                           struct ctdb_ipflags *ipflags,
2155                           struct ctdb_public_ip_list *all_ips)
2156 {
2157         uint32_t *lcp2_imbalances;
2158         bool *rebalance_candidates;
2159
2160         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2161
2162         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2163
2164         lcp2_init(tmp_ctx, ipflags, all_ips,
2165                   &lcp2_imbalances, &rebalance_candidates);
2166
2167         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2168
2169         /* If we don't want IPs to fail back then don't rebalance IPs. */
2170         if (1 == ctdb->tunable.no_ip_failback) {
2171                 goto finished;
2172         }
2173
2174         /* Now, try to make sure the ip adresses are evenly distributed
2175            across the nodes.
2176         */
2177         lcp2_failback(ctdb, ipflags, all_ips,
2178                       lcp2_imbalances, rebalance_candidates);
2179
2180 finished:
2181         talloc_free(tmp_ctx);
2182 }
2183
2184 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2185 {
2186         int i, num_healthy;
2187
2188         /* Count how many completely healthy nodes we have */
2189         num_healthy = 0;
2190         for (i=0;i<nodemap->num;i++) {
2191                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2192                         num_healthy++;
2193                 }
2194         }
2195
2196         return num_healthy == 0;
2197 }
2198
2199 /* The calculation part of the IP allocation algorithm. */
2200 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2201                                    struct ctdb_ipflags *ipflags,
2202                                    struct ctdb_public_ip_list **all_ips_p)
2203 {
2204         /* since nodes only know about those public addresses that
2205            can be served by that particular node, no single node has
2206            a full list of all public addresses that exist in the cluster.
2207            Walk over all node structures and create a merged list of
2208            all public addresses that exist in the cluster.
2209
2210            keep the tree of ips around as ctdb->ip_tree
2211         */
2212         *all_ips_p = create_merged_ip_list(ctdb);
2213
2214         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2215                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p);
2216         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2217                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2218         } else {
2219                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2220         }
2221
2222         /* at this point ->pnn is the node which will own each IP
2223            or -1 if there is no node that can cover this ip
2224         */
2225
2226         return;
2227 }
2228
2229 struct get_tunable_callback_data {
2230         const char *tunable;
2231         uint32_t *out;
2232 };
2233
2234 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2235                                  int32_t res, TDB_DATA outdata,
2236                                  void *callback)
2237 {
2238         struct get_tunable_callback_data *cd =
2239                 (struct get_tunable_callback_data *)callback;
2240         int size;
2241
2242         if (res != 0) {
2243                 DEBUG(DEBUG_ERR,
2244                       ("Failure to read \"%s\" tunable from remote node %d\n",
2245                        cd->tunable, pnn));
2246                 return;
2247         }
2248
2249         if (outdata.dsize != sizeof(uint32_t)) {
2250                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2251                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2252                                  (int)outdata.dsize));
2253                 return;
2254         }
2255
2256         size = talloc_array_length(cd->out);
2257         if (pnn >= size) {
2258                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2259                                  cd->tunable, pnn, size));
2260                 return;
2261         }
2262
2263                 
2264         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2265 }
2266
2267 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2268                                         TALLOC_CTX *tmp_ctx,
2269                                         struct ctdb_node_map *nodemap,
2270                                         const char *tunable)
2271 {
2272         TDB_DATA data;
2273         struct ctdb_control_get_tunable *t;
2274         uint32_t *nodes;
2275         uint32_t *tvals;
2276         struct get_tunable_callback_data callback_data;
2277
2278         tvals = talloc_zero_array(tmp_ctx, uint32_t, nodemap->num);
2279         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2280         callback_data.out = tvals;
2281         callback_data.tunable = tunable;
2282
2283         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2284         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2285         t = (struct ctdb_control_get_tunable *)data.dptr;
2286         t->length = strlen(tunable)+1;
2287         memcpy(t->name, tunable, t->length);
2288         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2289         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2290                                       nodes, 0, TAKEOVER_TIMEOUT(),
2291                                       false, data,
2292                                       get_tunable_callback, NULL,
2293                                       &callback_data) != 0) {
2294                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get %s tunable failed\n", tunable));
2295         }
2296         talloc_free(nodes);
2297         talloc_free(data.dptr);
2298
2299         return tvals;
2300 }
2301
2302 /* Set internal flags for IP allocation:
2303  *   Clear ip flags
2304  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2305  *   Set NOIPHOST ip flag for each INACTIVE node
2306  *   if all nodes are disabled:
2307  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2308  *   else
2309  *     Set NOIPHOST ip flags for disabled nodes
2310  */
2311 static struct ctdb_ipflags *
2312 set_ipflags_internal(struct ctdb_context *ctdb,
2313                      TALLOC_CTX *tmp_ctx,
2314                      struct ctdb_node_map *nodemap,
2315                      uint32_t *tval_noiptakeover,
2316                      uint32_t *tval_noiphostonalldisabled)
2317 {
2318         int i;
2319         struct ctdb_ipflags *ipflags;
2320
2321         /* Clear IP flags - implicit due to talloc_zero */
2322         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2323         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2324
2325         for (i=0;i<nodemap->num;i++) {
2326                 /* Can not take IPs on node with NoIPTakeover set */
2327                 if (tval_noiptakeover[i] != 0) {
2328                         ipflags[i].noiptakeover = true;
2329                 }
2330
2331                 /* Can not host IPs on INACTIVE node */
2332                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2333                         ipflags[i].noiphost = true;
2334                 }
2335         }
2336
2337         if (all_nodes_are_disabled(nodemap)) {
2338                 /* If all nodes are disabled, can not host IPs on node
2339                  * with NoIPHostOnAllDisabled set
2340                  */
2341                 for (i=0;i<nodemap->num;i++) {
2342                         if (tval_noiphostonalldisabled[i] != 0) {
2343                                 ipflags[i].noiphost = true;
2344                         }
2345                 }
2346         } else {
2347                 /* If some nodes are not disabled, then can not host
2348                  * IPs on DISABLED node
2349                  */
2350                 for (i=0;i<nodemap->num;i++) {
2351                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2352                                 ipflags[i].noiphost = true;
2353                         }
2354                 }
2355         }
2356
2357         return ipflags;
2358 }
2359
2360 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2361                                         TALLOC_CTX *tmp_ctx,
2362                                         struct ctdb_node_map *nodemap)
2363 {
2364         uint32_t *tval_noiptakeover;
2365         uint32_t *tval_noiphostonalldisabled;
2366         struct ctdb_ipflags *ipflags;
2367
2368         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2369                                                    "NoIPTakeover");
2370         if (tval_noiptakeover == NULL) {
2371                 return NULL;
2372         }
2373
2374         tval_noiphostonalldisabled =
2375                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2376                                        "NoIPHostOnAllDisabled");
2377         if (tval_noiphostonalldisabled == NULL) {
2378                 return NULL;
2379         }
2380
2381         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2382                                        tval_noiptakeover,
2383                                        tval_noiphostonalldisabled);
2384
2385         talloc_free(tval_noiptakeover);
2386         talloc_free(tval_noiphostonalldisabled);
2387
2388         return ipflags;
2389 }
2390
2391 /*
2392   make any IP alias changes for public addresses that are necessary 
2393  */
2394 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2395                       client_async_callback fail_callback, void *callback_data)
2396 {
2397         int i;
2398         struct ctdb_public_ip ip;
2399         struct ctdb_public_ipv4 ipv4;
2400         uint32_t *nodes;
2401         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2402         TDB_DATA data;
2403         struct timeval timeout;
2404         struct client_async_data *async_data;
2405         struct ctdb_client_control_state *state;
2406         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2407         uint32_t disable_timeout;
2408         struct ctdb_ipflags *ipflags;
2409
2410         /*
2411          * ip failover is completely disabled, just send out the 
2412          * ipreallocated event.
2413          */
2414         if (ctdb->tunable.disable_ip_failover != 0) {
2415                 goto ipreallocated;
2416         }
2417
2418         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2419         if (ipflags == NULL) {
2420                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2421                 talloc_free(tmp_ctx);
2422                 return -1;
2423         }
2424
2425         ZERO_STRUCT(ip);
2426
2427         /* Do the IP reassignment calculations */
2428         ctdb_takeover_run_core(ctdb, ipflags, &all_ips);
2429
2430         /* The IP flags need to be cleared because they should never
2431          * be seen outside the IP allocation code.
2432          */
2433
2434         /* The recovery daemon does regular sanity checks of the IPs.
2435          * However, sometimes it is overzealous and thinks changes are
2436          * required when they're already underway.  This stops the
2437          * checks for a while before we start moving IPs.
2438          */
2439         disable_timeout = ctdb->tunable.takeover_timeout;
2440         data.dptr  = (uint8_t*)&disable_timeout;
2441         data.dsize = sizeof(disable_timeout);
2442         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2443                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2444                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2445         }
2446
2447         /* now tell all nodes to delete any alias that they should not
2448            have.  This will be a NOOP on nodes that don't currently
2449            hold the given alias */
2450         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2451         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2452
2453         async_data->fail_callback = fail_callback;
2454         async_data->callback_data = callback_data;
2455
2456         for (i=0;i<nodemap->num;i++) {
2457                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2458                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2459                         continue;
2460                 }
2461
2462                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2463                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2464                                 /* This node should be serving this
2465                                    vnn so dont tell it to release the ip
2466                                 */
2467                                 continue;
2468                         }
2469                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2470                                 ipv4.pnn = tmp_ip->pnn;
2471                                 ipv4.sin = tmp_ip->addr.ip;
2472
2473                                 timeout = TAKEOVER_TIMEOUT();
2474                                 data.dsize = sizeof(ipv4);
2475                                 data.dptr  = (uint8_t *)&ipv4;
2476                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2477                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2478                                                 data, async_data,
2479                                                 &timeout, NULL);
2480                         } else {
2481                                 ip.pnn  = tmp_ip->pnn;
2482                                 ip.addr = tmp_ip->addr;
2483
2484                                 timeout = TAKEOVER_TIMEOUT();
2485                                 data.dsize = sizeof(ip);
2486                                 data.dptr  = (uint8_t *)&ip;
2487                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2488                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2489                                                 data, async_data,
2490                                                 &timeout, NULL);
2491                         }
2492
2493                         if (state == NULL) {
2494                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2495                                 talloc_free(tmp_ctx);
2496                                 return -1;
2497                         }
2498                 
2499                         ctdb_client_async_add(async_data, state);
2500                 }
2501         }
2502         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2503                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2504                 talloc_free(tmp_ctx);
2505                 return -1;
2506         }
2507         talloc_free(async_data);
2508
2509
2510         /* tell all nodes to get their own IPs */
2511         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2512         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2513
2514         async_data->fail_callback = fail_callback;
2515         async_data->callback_data = callback_data;
2516
2517         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2518                 if (tmp_ip->pnn == -1) {
2519                         /* this IP won't be taken over */
2520                         continue;
2521                 }
2522
2523                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2524                         ipv4.pnn = tmp_ip->pnn;
2525                         ipv4.sin = tmp_ip->addr.ip;
2526
2527                         timeout = TAKEOVER_TIMEOUT();
2528                         data.dsize = sizeof(ipv4);
2529                         data.dptr  = (uint8_t *)&ipv4;
2530                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2531                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2532                                         data, async_data,
2533                                         &timeout, NULL);
2534                 } else {
2535                         ip.pnn  = tmp_ip->pnn;
2536                         ip.addr = tmp_ip->addr;
2537
2538                         timeout = TAKEOVER_TIMEOUT();
2539                         data.dsize = sizeof(ip);
2540                         data.dptr  = (uint8_t *)&ip;
2541                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2542                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2543                                         data, async_data,
2544                                         &timeout, NULL);
2545                 }
2546                 if (state == NULL) {
2547                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2548                         talloc_free(tmp_ctx);
2549                         return -1;
2550                 }
2551                 
2552                 ctdb_client_async_add(async_data, state);
2553         }
2554         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2555                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2556                 talloc_free(tmp_ctx);
2557                 return -1;
2558         }
2559
2560 ipreallocated:
2561         /* 
2562          * Tell all nodes to run eventscripts to process the
2563          * "ipreallocated" event.  This can do a lot of things,
2564          * including restarting services to reconfigure them if public
2565          * IPs have moved.  Once upon a time this event only used to
2566          * update natwg.
2567          */
2568         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2569         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2570                                       nodes, 0, TAKEOVER_TIMEOUT(),
2571                                       false, tdb_null,
2572                                       NULL, fail_callback,
2573                                       callback_data) != 0) {
2574                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2575         }
2576
2577         talloc_free(tmp_ctx);
2578         return 0;
2579 }
2580
2581
2582 /*
2583   destroy a ctdb_client_ip structure
2584  */
2585 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2586 {
2587         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2588                 ctdb_addr_to_str(&ip->addr),
2589                 ntohs(ip->addr.ip.sin_port),
2590                 ip->client_id));
2591
2592         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2593         return 0;
2594 }
2595
2596 /*
2597   called by a client to inform us of a TCP connection that it is managing
2598   that should tickled with an ACK when IP takeover is done
2599   we handle both the old ipv4 style of packets as well as the new ipv4/6
2600   pdus.
2601  */
2602 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2603                                 TDB_DATA indata)
2604 {
2605         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2606         struct ctdb_control_tcp *old_addr = NULL;
2607         struct ctdb_control_tcp_addr new_addr;
2608         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2609         struct ctdb_tcp_list *tcp;
2610         struct ctdb_tcp_connection t;
2611         int ret;
2612         TDB_DATA data;
2613         struct ctdb_client_ip *ip;
2614         struct ctdb_vnn *vnn;
2615         ctdb_sock_addr addr;
2616
2617         switch (indata.dsize) {
2618         case sizeof(struct ctdb_control_tcp):
2619                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2620                 ZERO_STRUCT(new_addr);
2621                 tcp_sock = &new_addr;
2622                 tcp_sock->src.ip  = old_addr->src;
2623                 tcp_sock->dest.ip = old_addr->dest;
2624                 break;
2625         case sizeof(struct ctdb_control_tcp_addr):
2626                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2627                 break;
2628         default:
2629                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2630                                  "to ctdb_control_tcp_client. size was %d but "
2631                                  "only allowed sizes are %lu and %lu\n",
2632                                  (int)indata.dsize,
2633                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2634                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2635                 return -1;
2636         }
2637
2638         addr = tcp_sock->src;
2639         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2640         addr = tcp_sock->dest;
2641         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2642
2643         ZERO_STRUCT(addr);
2644         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2645         vnn = find_public_ip_vnn(ctdb, &addr);
2646         if (vnn == NULL) {
2647                 switch (addr.sa.sa_family) {
2648                 case AF_INET:
2649                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2650                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2651                                         ctdb_addr_to_str(&addr)));
2652                         }
2653                         break;
2654                 case AF_INET6:
2655                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2656                                 ctdb_addr_to_str(&addr)));
2657                         break;
2658                 default:
2659                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2660                 }
2661
2662                 return 0;
2663         }
2664
2665         if (vnn->pnn != ctdb->pnn) {
2666                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2667                         ctdb_addr_to_str(&addr),
2668                         client_id, client->pid));
2669                 /* failing this call will tell smbd to die */
2670                 return -1;
2671         }
2672
2673         ip = talloc(client, struct ctdb_client_ip);
2674         CTDB_NO_MEMORY(ctdb, ip);
2675
2676         ip->ctdb      = ctdb;
2677         ip->addr      = addr;
2678         ip->client_id = client_id;
2679         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2680         DLIST_ADD(ctdb->client_ip_list, ip);
2681
2682         tcp = talloc(client, struct ctdb_tcp_list);
2683         CTDB_NO_MEMORY(ctdb, tcp);
2684
2685         tcp->connection.src_addr = tcp_sock->src;
2686         tcp->connection.dst_addr = tcp_sock->dest;
2687
2688         DLIST_ADD(client->tcp_list, tcp);
2689
2690         t.src_addr = tcp_sock->src;
2691         t.dst_addr = tcp_sock->dest;
2692
2693         data.dptr = (uint8_t *)&t;
2694         data.dsize = sizeof(t);
2695
2696         switch (addr.sa.sa_family) {
2697         case AF_INET:
2698                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2699                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2700                         ctdb_addr_to_str(&tcp_sock->src),
2701                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2702                 break;
2703         case AF_INET6:
2704                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2705                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2706                         ctdb_addr_to_str(&tcp_sock->src),
2707                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2708                 break;
2709         default:
2710                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2711         }
2712
2713
2714         /* tell all nodes about this tcp connection */
2715         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2716                                        CTDB_CONTROL_TCP_ADD,
2717                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2718         if (ret != 0) {
2719                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2720                 return -1;
2721         }
2722
2723         return 0;
2724 }
2725
2726 /*
2727   find a tcp address on a list
2728  */
2729 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2730                                            struct ctdb_tcp_connection *tcp)
2731 {
2732         int i;
2733
2734         if (array == NULL) {
2735                 return NULL;
2736         }
2737
2738         for (i=0;i<array->num;i++) {
2739                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2740                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2741                         return &array->connections[i];
2742                 }
2743         }
2744         return NULL;
2745 }
2746
2747
2748
2749 /*
2750   called by a daemon to inform us of a TCP connection that one of its
2751   clients managing that should tickled with an ACK when IP takeover is
2752   done
2753  */
2754 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2755 {
2756         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2757         struct ctdb_tcp_array *tcparray;
2758         struct ctdb_tcp_connection tcp;
2759         struct ctdb_vnn *vnn;
2760
2761         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2762         if (vnn == NULL) {
2763                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2764                         ctdb_addr_to_str(&p->dst_addr)));
2765
2766                 return -1;
2767         }
2768
2769
2770         tcparray = vnn->tcp_array;
2771
2772         /* If this is the first tickle */
2773         if (tcparray == NULL) {
2774                 tcparray = talloc_size(ctdb->nodes, 
2775                         offsetof(struct ctdb_tcp_array, connections) +
2776                         sizeof(struct ctdb_tcp_connection) * 1);
2777                 CTDB_NO_MEMORY(ctdb, tcparray);
2778                 vnn->tcp_array = tcparray;
2779
2780                 tcparray->num = 0;
2781                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2782                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2783
2784                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2785                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2786                 tcparray->num++;
2787
2788                 if (tcp_update_needed) {
2789                         vnn->tcp_update_needed = true;
2790                 }
2791                 return 0;
2792         }
2793
2794
2795         /* Do we already have this tickle ?*/
2796         tcp.src_addr = p->src_addr;
2797         tcp.dst_addr = p->dst_addr;
2798         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2799                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2800                         ctdb_addr_to_str(&tcp.dst_addr),
2801                         ntohs(tcp.dst_addr.ip.sin_port),
2802                         vnn->pnn));
2803                 return 0;
2804         }
2805
2806         /* A new tickle, we must add it to the array */
2807         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2808                                         struct ctdb_tcp_connection,
2809                                         tcparray->num+1);
2810         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2811
2812         vnn->tcp_array = tcparray;
2813         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2814         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2815         tcparray->num++;
2816                                 
2817         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2818                 ctdb_addr_to_str(&tcp.dst_addr),
2819                 ntohs(tcp.dst_addr.ip.sin_port),
2820                 vnn->pnn));
2821
2822         if (tcp_update_needed) {
2823                 vnn->tcp_update_needed = true;
2824         }
2825
2826         return 0;
2827 }
2828
2829
2830 /*
2831   called by a daemon to inform us of a TCP connection that one of its
2832   clients managing that should tickled with an ACK when IP takeover is
2833   done
2834  */
2835 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2836 {
2837         struct ctdb_tcp_connection *tcpp;
2838         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2839
2840         if (vnn == NULL) {
2841                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2842                         ctdb_addr_to_str(&conn->dst_addr)));
2843                 return;
2844         }
2845
2846         /* if the array is empty we cant remove it
2847            and we dont need to do anything
2848          */
2849         if (vnn->tcp_array == NULL) {
2850                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2851                         ctdb_addr_to_str(&conn->dst_addr),
2852                         ntohs(conn->dst_addr.ip.sin_port)));
2853                 return;
2854         }
2855
2856
2857         /* See if we know this connection
2858            if we dont know this connection  then we dont need to do anything
2859          */
2860         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2861         if (tcpp == NULL) {
2862                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2863                         ctdb_addr_to_str(&conn->dst_addr),
2864                         ntohs(conn->dst_addr.ip.sin_port)));
2865                 return;
2866         }
2867
2868
2869         /* We need to remove this entry from the array.
2870            Instead of allocating a new array and copying data to it
2871            we cheat and just copy the last entry in the existing array
2872            to the entry that is to be removed and just shring the 
2873            ->num field
2874          */
2875         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2876         vnn->tcp_array->num--;
2877
2878         /* If we deleted the last entry we also need to remove the entire array
2879          */
2880         if (vnn->tcp_array->num == 0) {
2881                 talloc_free(vnn->tcp_array);
2882                 vnn->tcp_array = NULL;
2883         }               
2884
2885         vnn->tcp_update_needed = true;
2886
2887         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2888                 ctdb_addr_to_str(&conn->src_addr),
2889                 ntohs(conn->src_addr.ip.sin_port)));
2890 }
2891
2892
2893 /*
2894   called by a daemon to inform us of a TCP connection that one of its
2895   clients used are no longer needed in the tickle database
2896  */
2897 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2898 {
2899         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2900
2901         ctdb_remove_tcp_connection(ctdb, conn);
2902
2903         return 0;
2904 }
2905
2906
/*
 * Called when a daemon restarts.  The intent is to send all tickles
 * for all public addresses we are serving to the new node
 * immediately.
 *
 * XXX not implemented yet: currently this does nothing and always
 * returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/* XXX here we should send all tickles we are serving to the new node */

	return 0;
}
2916
2917
2918 /*
2919   called when a client structure goes away - hook to remove
2920   elements from the tcp_list in all daemons
2921  */
2922 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2923 {
2924         while (client->tcp_list) {
2925                 struct ctdb_tcp_list *tcp = client->tcp_list;
2926                 DLIST_REMOVE(client->tcp_list, tcp);
2927                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2928         }
2929 }
2930
2931
2932 /*
2933   release all IPs on shutdown
2934  */
2935 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2936 {
2937         struct ctdb_vnn *vnn;
2938
2939         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2940                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2941                         ctdb_vnn_unassign_iface(ctdb, vnn);
2942                         continue;
2943                 }
2944                 if (!vnn->iface) {
2945                         continue;
2946                 }
2947                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2948                                   ctdb_vnn_iface_string(vnn),
2949                                   ctdb_addr_to_str(&vnn->public_address),
2950                                   vnn->public_netmask_bits);
2951                 release_kill_clients(ctdb, &vnn->public_address);
2952                 ctdb_vnn_unassign_iface(ctdb, vnn);
2953         }
2954 }
2955
2956
2957 /*
2958   get list of public IPs
2959  */
2960 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2961                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2962 {
2963         int i, num, len;
2964         struct ctdb_all_public_ips *ips;
2965         struct ctdb_vnn *vnn;
2966         bool only_available = false;
2967
2968         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2969                 only_available = true;
2970         }
2971
2972         /* count how many public ip structures we have */
2973         num = 0;
2974         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2975                 num++;
2976         }
2977
2978         len = offsetof(struct ctdb_all_public_ips, ips) + 
2979                 num*sizeof(struct ctdb_public_ip);
2980         ips = talloc_zero_size(outdata, len);
2981         CTDB_NO_MEMORY(ctdb, ips);
2982
2983         i = 0;
2984         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2985                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2986                         continue;
2987                 }
2988                 ips->ips[i].pnn  = vnn->pnn;
2989                 ips->ips[i].addr = vnn->public_address;
2990                 i++;
2991         }
2992         ips->num = i;
2993         len = offsetof(struct ctdb_all_public_ips, ips) +
2994                 i*sizeof(struct ctdb_public_ip);
2995
2996         outdata->dsize = len;
2997         outdata->dptr  = (uint8_t *)ips;
2998
2999         return 0;
3000 }
3001
3002
3003 /*
3004   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3005  */
3006 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3007                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3008 {
3009         int i, num, len;
3010         struct ctdb_all_public_ipsv4 *ips;
3011         struct ctdb_vnn *vnn;
3012
3013         /* count how many public ip structures we have */
3014         num = 0;
3015         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3016                 if (vnn->public_address.sa.sa_family != AF_INET) {
3017                         continue;
3018                 }
3019                 num++;
3020         }
3021
3022         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3023                 num*sizeof(struct ctdb_public_ipv4);
3024         ips = talloc_zero_size(outdata, len);
3025         CTDB_NO_MEMORY(ctdb, ips);
3026
3027         outdata->dsize = len;
3028         outdata->dptr  = (uint8_t *)ips;
3029
3030         ips->num = num;
3031         i = 0;
3032         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3033                 if (vnn->public_address.sa.sa_family != AF_INET) {
3034                         continue;
3035                 }
3036                 ips->ips[i].pnn = vnn->pnn;
3037                 ips->ips[i].sin = vnn->public_address.ip;
3038                 i++;
3039         }
3040
3041         return 0;
3042 }
3043
3044 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3045                                         struct ctdb_req_control *c,
3046                                         TDB_DATA indata,
3047                                         TDB_DATA *outdata)
3048 {
3049         int i, num, len;
3050         ctdb_sock_addr *addr;
3051         struct ctdb_control_public_ip_info *info;
3052         struct ctdb_vnn *vnn;
3053
3054         addr = (ctdb_sock_addr *)indata.dptr;
3055
3056         vnn = find_public_ip_vnn(ctdb, addr);
3057         if (vnn == NULL) {
3058                 /* if it is not a public ip   it could be our 'single ip' */
3059                 if (ctdb->single_ip_vnn) {
3060                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3061                                 vnn = ctdb->single_ip_vnn;
3062                         }
3063                 }
3064         }
3065         if (vnn == NULL) {
3066                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3067                                  "'%s'not a public address\n",
3068                                  ctdb_addr_to_str(addr)));
3069                 return -1;
3070         }
3071
3072         /* count how many public ip structures we have */
3073         num = 0;
3074         for (;vnn->ifaces[num];) {
3075                 num++;
3076         }
3077
3078         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3079                 num*sizeof(struct ctdb_control_iface_info);
3080         info = talloc_zero_size(outdata, len);
3081         CTDB_NO_MEMORY(ctdb, info);
3082
3083         info->ip.addr = vnn->public_address;
3084         info->ip.pnn = vnn->pnn;
3085         info->active_idx = 0xFFFFFFFF;
3086
3087         for (i=0; vnn->ifaces[i]; i++) {
3088                 struct ctdb_iface *cur;
3089
3090                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3091                 if (cur == NULL) {
3092                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3093                                            vnn->ifaces[i]));
3094                         return -1;
3095                 }
3096                 if (vnn->iface == cur) {
3097                         info->active_idx = i;
3098                 }
3099                 strcpy(info->ifaces[i].name, cur->name);
3100                 info->ifaces[i].link_state = cur->link_up;
3101                 info->ifaces[i].references = cur->references;
3102         }
3103         info->num = i;
3104         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3105                 i*sizeof(struct ctdb_control_iface_info);
3106
3107         outdata->dsize = len;
3108         outdata->dptr  = (uint8_t *)info;
3109
3110         return 0;
3111 }
3112
3113 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3114                                 struct ctdb_req_control *c,
3115                                 TDB_DATA *outdata)
3116 {
3117         int i, num, len;
3118         struct ctdb_control_get_ifaces *ifaces;
3119         struct ctdb_iface *cur;
3120
3121         /* count how many public ip structures we have */
3122         num = 0;
3123         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3124                 num++;
3125         }
3126
3127         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3128                 num*sizeof(struct ctdb_control_iface_info);
3129         ifaces = talloc_zero_size(outdata, len);
3130         CTDB_NO_MEMORY(ctdb, ifaces);
3131
3132         i = 0;
3133         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3134                 strcpy(ifaces->ifaces[i].name, cur->name);
3135                 ifaces->ifaces[i].link_state = cur->link_up;
3136                 ifaces->ifaces[i].references = cur->references;
3137                 i++;
3138         }
3139         ifaces->num = i;
3140         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3141                 i*sizeof(struct ctdb_control_iface_info);
3142
3143         outdata->dsize = len;
3144         outdata->dptr  = (uint8_t *)ifaces;
3145
3146         return 0;
3147 }
3148
3149 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3150                                     struct ctdb_req_control *c,
3151                                     TDB_DATA indata)
3152 {
3153         struct ctdb_control_iface_info *info;
3154         struct ctdb_iface *iface;
3155         bool link_up = false;
3156
3157         info = (struct ctdb_control_iface_info *)indata.dptr;
3158
3159         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3160                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3161                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3162                                   len, len, info->name));
3163                 return -1;
3164         }
3165
3166         switch (info->link_state) {
3167         case 0:
3168                 link_up = false;
3169                 break;
3170         case 1:
3171                 link_up = true;
3172                 break;
3173         default:
3174                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3175                                   (unsigned int)info->link_state));
3176                 return -1;
3177         }
3178
3179         if (info->references != 0) {
3180                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3181                                   (unsigned int)info->references));
3182                 return -1;
3183         }
3184
3185         iface = ctdb_find_iface(ctdb, info->name);
3186         if (iface == NULL) {
3187                 return -1;
3188         }
3189
3190         if (link_up == iface->link_up) {
3191                 return 0;
3192         }
3193
3194         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3195               ("iface[%s] has changed it's link status %s => %s\n",
3196                iface->name,
3197                iface->link_up?"up":"down",
3198                link_up?"up":"down"));
3199
3200         iface->link_up = link_up;
3201         return 0;
3202 }
3203
3204
/* 
   Per-vnn state for killing tcp connections: the capture socket the
   daemon listens on and the set of connections that are still to be
   killed.
*/
struct ctdb_kill_tcp {
        /* the public address (vnn) this state is attached to */
        struct ctdb_vnn *vnn;
        /* owning daemon context */
        struct ctdb_context *ctdb;
        /* capture socket, or -1 while it has not been opened yet */
        int capture_fd;
        /* read event registered on capture_fd */
        struct fd_event *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque value filled in by ctdb_sys_open_capture_socket() and
           passed back to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3217
/*
  A single tcp connection that is to be killed.
 */
struct ctdb_killtcp_con {
        /* the two endpoints of the connection */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of tickles sent from the periodic traverse; the
           connection is given up on once this reaches 5 */
        int count;
        /* back pointer to the owning kill state */
        struct ctdb_kill_tcp *killtcp;
};
3227
3228 /* this function is used to create a key to represent this socketpair
3229    in the killtcp tree.
3230    this key is used to insert and lookup matching socketpairs that are
3231    to be tickled and RST
3232 */
3233 #define KILLTCP_KEYLEN  10
3234 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3235 {
3236         static uint32_t key[KILLTCP_KEYLEN];
3237
3238         bzero(key, sizeof(key));
3239
3240         if (src->sa.sa_family != dst->sa.sa_family) {
3241                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3242                 return key;
3243         }
3244         
3245         switch (src->sa.sa_family) {
3246         case AF_INET:
3247                 key[0]  = dst->ip.sin_addr.s_addr;
3248                 key[1]  = src->ip.sin_addr.s_addr;
3249                 key[2]  = dst->ip.sin_port;
3250                 key[3]  = src->ip.sin_port;
3251                 break;
3252         case AF_INET6: {
3253                 uint32_t *dst6_addr32 =
3254                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3255                 uint32_t *src6_addr32 =
3256                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3257                 key[0]  = dst6_addr32[3];
3258                 key[1]  = src6_addr32[3];
3259                 key[2]  = dst6_addr32[2];
3260                 key[3]  = src6_addr32[2];
3261                 key[4]  = dst6_addr32[1];
3262                 key[5]  = src6_addr32[1];
3263                 key[6]  = dst6_addr32[0];
3264                 key[7]  = src6_addr32[0];
3265                 key[8]  = dst->ip6.sin6_port;
3266                 key[9]  = src->ip6.sin6_port;
3267                 break;
3268         }
3269         default:
3270                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3271                 return key;
3272         }
3273
3274         return key;
3275 }
3276
3277 /*
3278   called when we get a read event on the raw socket
3279  */
3280 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3281                                 uint16_t flags, void *private_data)
3282 {
3283         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3284         struct ctdb_killtcp_con *con;
3285         ctdb_sock_addr src, dst;
3286         uint32_t ack_seq, seq;
3287
3288         if (!(flags & EVENT_FD_READ)) {
3289                 return;
3290         }
3291
3292         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3293                                 killtcp->private_data,
3294                                 &src, &dst,
3295                                 &ack_seq, &seq) != 0) {
3296                 /* probably a non-tcp ACK packet */
3297                 return;
3298         }
3299
3300         /* check if we have this guy in our list of connections
3301            to kill
3302         */
3303         con = trbt_lookuparray32(killtcp->connections, 
3304                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3305         if (con == NULL) {
3306                 /* no this was some other packet we can just ignore */
3307                 return;
3308         }
3309
3310         /* This one has been tickled !
3311            now reset him and remove him from the list.
3312          */
3313         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3314                 ntohs(con->dst_addr.ip.sin_port),
3315                 ctdb_addr_to_str(&con->src_addr),
3316                 ntohs(con->src_addr.ip.sin_port)));
3317
3318         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3319         talloc_free(con);
3320 }
3321
3322
3323 /* when traversing the list of all tcp connections to send tickle acks to
3324    (so that we can capture the ack coming back and kill the connection
3325     by a RST)
3326    this callback is called for each connection we are currently trying to kill
3327 */
3328 static int tickle_connection_traverse(void *param, void *data)
3329 {
3330         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3331
3332         /* have tried too many times, just give up */
3333         if (con->count >= 5) {
3334                 /* can't delete in traverse: reparent to delete_cons */
3335                 talloc_steal(param, con);
3336                 return 0;
3337         }
3338
3339         /* othervise, try tickling it again */
3340         con->count++;
3341         ctdb_sys_send_tcp(
3342                 (ctdb_sock_addr *)&con->dst_addr,
3343                 (ctdb_sock_addr *)&con->src_addr,
3344                 0, 0, 0);
3345         return 0;
3346 }
3347
3348
3349 /* 
3350    called every second until all sentenced connections have been reset
3351  */
3352 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3353                                               struct timeval t, void *private_data)
3354 {
3355         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3356         void *delete_cons = talloc_new(NULL);
3357
3358         /* loop over all connections sending tickle ACKs */
3359         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3360
3361         /* now we've finished traverse, it's safe to do deletion. */
3362         talloc_free(delete_cons);
3363
3364         /* If there are no more connections to kill we can remove the
3365            entire killtcp structure
3366          */
3367         if ( (killtcp->connections == NULL) || 
3368              (killtcp->connections->root == NULL) ) {
3369                 talloc_free(killtcp);
3370                 return;
3371         }
3372
3373         /* try tickling them again in a seconds time
3374          */
3375         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3376                         ctdb_tickle_sentenced_connections, killtcp);
3377 }
3378
3379 /*
3380   destroy the killtcp structure
3381  */
3382 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3383 {
3384         struct ctdb_vnn *tmpvnn;
3385
3386         /* verify that this vnn is still active */
3387         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3388                 if (tmpvnn == killtcp->vnn) {
3389                         break;
3390                 }
3391         }
3392
3393         if (tmpvnn == NULL) {
3394                 return 0;
3395         }
3396
3397         if (killtcp->vnn->killtcp != killtcp) {
3398                 return 0;
3399         }
3400
3401         killtcp->vnn->killtcp = NULL;
3402
3403         return 0;
3404 }
3405
3406
/* Insert callback for the killtcp tree: unconditionally replace any
   existing connection structure with the new one.

   The old entry (data) is deliberately not freed here; according to
   the original note it is talloc_stolen by the same tree node and is
   deleted together with the new data.

   parm is the new entry and is what gets returned; data is ignored.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
3418
3419 /*
3420   add a tcp socket to the list of connections we want to RST
3421  */
3422 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3423                                        ctdb_sock_addr *s,
3424                                        ctdb_sock_addr *d)
3425 {
3426         ctdb_sock_addr src, dst;
3427         struct ctdb_kill_tcp *killtcp;
3428         struct ctdb_killtcp_con *con;
3429         struct ctdb_vnn *vnn;
3430
3431         ctdb_canonicalize_ip(s, &src);
3432         ctdb_canonicalize_ip(d, &dst);
3433
3434         vnn = find_public_ip_vnn(ctdb, &dst);
3435         if (vnn == NULL) {
3436                 vnn = find_public_ip_vnn(ctdb, &src);
3437         }
3438         if (vnn == NULL) {
3439                 /* if it is not a public ip   it could be our 'single ip' */
3440                 if (ctdb->single_ip_vnn) {
3441                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3442                                 vnn = ctdb->single_ip_vnn;
3443                         }
3444                 }
3445         }
3446         if (vnn == NULL) {
3447                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3448                 return -1;
3449         }
3450
3451         killtcp = vnn->killtcp;
3452         
3453         /* If this is the first connection to kill we must allocate
3454            a new structure
3455          */
3456         if (killtcp == NULL) {
3457                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3458                 CTDB_NO_MEMORY(ctdb, killtcp);
3459
3460                 killtcp->vnn         = vnn;
3461                 killtcp->ctdb        = ctdb;
3462                 killtcp->capture_fd  = -1;
3463                 killtcp->connections = trbt_create(killtcp, 0);
3464
3465                 vnn->killtcp         = killtcp;
3466                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3467         }
3468
3469
3470
3471         /* create a structure that describes this connection we want to
3472            RST and store it in killtcp->connections
3473         */
3474         con = talloc(killtcp, struct ctdb_killtcp_con);
3475         CTDB_NO_MEMORY(ctdb, con);
3476         con->src_addr = src;
3477         con->dst_addr = dst;
3478         con->count    = 0;
3479         con->killtcp  = killtcp;
3480
3481
3482         trbt_insertarray32_callback(killtcp->connections,
3483                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3484                         add_killtcp_callback, con);
3485
3486         /* 
3487            If we dont have a socket to listen on yet we must create it
3488          */
3489         if (killtcp->capture_fd == -1) {
3490                 const char *iface = ctdb_vnn_iface_string(vnn);
3491                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3492                 if (killtcp->capture_fd == -1) {
3493                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3494                                           "socket on iface '%s' for killtcp (%s)\n",
3495                                           iface, strerror(errno)));
3496                         goto failed;
3497                 }
3498         }
3499
3500
3501         if (killtcp->fde == NULL) {
3502                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3503                                             EVENT_FD_READ,
3504                                             capture_tcp_handler, killtcp);
3505                 tevent_fd_set_auto_close(killtcp->fde);
3506
3507                 /* We also need to set up some events to tickle all these connections
3508                    until they are all reset
3509                 */
3510                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3511                                 ctdb_tickle_sentenced_connections, killtcp);
3512         }
3513
3514         /* tickle him once now */
3515         ctdb_sys_send_tcp(
3516                 &con->dst_addr,
3517                 &con->src_addr,
3518                 0, 0, 0);
3519
3520         return 0;
3521
3522 failed:
3523         talloc_free(vnn->killtcp);
3524         vnn->killtcp = NULL;
3525         return -1;
3526 }
3527
3528 /*
3529   kill a TCP connection.
3530  */
3531 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3532 {
3533         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3534
3535         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3536 }
3537
3538 /*
3539   called by a daemon to inform us of the entire list of TCP tickles for
3540   a particular public address.
3541   this control should only be sent by the node that is currently serving
3542   that public address.
3543  */
3544 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3545 {
3546         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3547         struct ctdb_tcp_array *tcparray;
3548         struct ctdb_vnn *vnn;
3549
3550         /* We must at least have tickles.num or else we cant verify the size
3551            of the received data blob
3552          */
3553         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3554                                         tickles.connections)) {
3555                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3556                 return -1;
3557         }
3558
3559         /* verify that the size of data matches what we expect */
3560         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3561                                 tickles.connections)
3562                          + sizeof(struct ctdb_tcp_connection)
3563                                  * list->tickles.num) {
3564                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3565                 return -1;
3566         }       
3567
3568         vnn = find_public_ip_vnn(ctdb, &list->addr);
3569         if (vnn == NULL) {
3570                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3571                         ctdb_addr_to_str(&list->addr)));
3572
3573                 return 1;
3574         }
3575
3576         /* remove any old ticklelist we might have */
3577         talloc_free(vnn->tcp_array);
3578         vnn->tcp_array = NULL;
3579
3580         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3581         CTDB_NO_MEMORY(ctdb, tcparray);
3582
3583         tcparray->num = list->tickles.num;
3584
3585         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3586         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3587
3588         memcpy(tcparray->connections, &list->tickles.connections[0], 
3589                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3590
3591         /* We now have a new fresh tickle list array for this vnn */
3592         vnn->tcp_array = talloc_steal(vnn, tcparray);
3593         
3594         return 0;
3595 }
3596
3597 /*
3598   called to return the full list of tickles for the puclic address associated 
3599   with the provided vnn
3600  */
3601 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3602 {
3603         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3604         struct ctdb_control_tcp_tickle_list *list;
3605         struct ctdb_tcp_array *tcparray;
3606         int num;
3607         struct ctdb_vnn *vnn;
3608
3609         vnn = find_public_ip_vnn(ctdb, addr);
3610         if (vnn == NULL) {
3611                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3612                         ctdb_addr_to_str(addr)));
3613
3614                 return 1;
3615         }
3616
3617         tcparray = vnn->tcp_array;
3618         if (tcparray) {
3619                 num = tcparray->num;
3620         } else {
3621                 num = 0;
3622         }
3623
3624         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3625                                 tickles.connections)
3626                         + sizeof(struct ctdb_tcp_connection) * num;
3627
3628         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3629         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3630         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3631
3632         list->addr = *addr;
3633         list->tickles.num = num;
3634         if (num) {
3635                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3636                         sizeof(struct ctdb_tcp_connection) * num);
3637         }
3638
3639         return 0;
3640 }
3641
3642
3643 /*
3644   set the list of all tcp tickles for a public address
3645  */
3646 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3647                               struct timeval timeout, uint32_t destnode, 
3648                               ctdb_sock_addr *addr,
3649                               struct ctdb_tcp_array *tcparray)
3650 {
3651         int ret, num;
3652         TDB_DATA data;
3653         struct ctdb_control_tcp_tickle_list *list;
3654
3655         if (tcparray) {
3656                 num = tcparray->num;
3657         } else {
3658                 num = 0;
3659         }
3660
3661         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3662                                 tickles.connections) +
3663                         sizeof(struct ctdb_tcp_connection) * num;
3664         data.dptr = talloc_size(ctdb, data.dsize);
3665         CTDB_NO_MEMORY(ctdb, data.dptr);
3666
3667         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3668         list->addr = *addr;
3669         list->tickles.num = num;
3670         if (tcparray) {
3671                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3672         }
3673
3674         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3675                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3676                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3677         if (ret != 0) {
3678                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3679                 return -1;
3680         }
3681
3682         talloc_free(data.dptr);
3683
3684         return ret;
3685 }
3686
3687
3688 /*
3689   perform tickle updates if required
3690  */
3691 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3692                                 struct timed_event *te, 
3693                                 struct timeval t, void *private_data)
3694 {
3695         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3696         int ret;
3697         struct ctdb_vnn *vnn;
3698
3699         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3700                 /* we only send out updates for public addresses that 
3701                    we have taken over
3702                  */
3703                 if (ctdb->pnn != vnn->pnn) {
3704                         continue;
3705                 }
3706                 /* We only send out the updates if we need to */
3707                 if (!vnn->tcp_update_needed) {
3708                         continue;
3709                 }
3710                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3711                                 TAKEOVER_TIMEOUT(),
3712                                 CTDB_BROADCAST_CONNECTED,
3713                                 &vnn->public_address,
3714                                 vnn->tcp_array);
3715                 if (ret != 0) {
3716                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3717                                 ctdb_addr_to_str(&vnn->public_address)));
3718                 }
3719         }
3720
3721         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3722                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3723                              ctdb_update_tcp_tickles, ctdb);
3724 }               
3725         
3726
3727 /*
3728   start periodic update of tcp tickles
3729  */
3730 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3731 {
3732         ctdb->tickle_update_context = talloc_new(ctdb);
3733
3734         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3735                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3736                              ctdb_update_tcp_tickles, ctdb);
3737 }
3738
3739
3740
3741
/*
  State for one series of gratuitous arps.
 */
struct control_gratious_arp {
        /* owning daemon context */
        struct ctdb_context *ctdb;
        /* the address to announce */
        ctdb_sock_addr addr;
        /* name of the interface the arps are sent on */
        const char *iface;
        /* number of arps sent so far; the series ends at
           CTDB_ARP_REPEAT */
        int count;
};
3748
3749 /*
3750   send a control_gratuitous arp
3751  */
3752 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3753                                   struct timeval t, void *private_data)
3754 {
3755         int ret;
3756         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3757                                                         struct control_gratious_arp);
3758
3759         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3760         if (ret != 0) {
3761                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3762                                  arp->iface, strerror(errno)));
3763         }
3764
3765
3766         arp->count++;
3767         if (arp->count == CTDB_ARP_REPEAT) {
3768                 talloc_free(arp);
3769                 return;
3770         }
3771
3772         event_add_timed(arp->ctdb->ev, arp, 
3773                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3774                         send_gratious_arp, arp);
3775 }
3776
3777
3778 /*
3779   send a gratious arp 
3780  */
3781 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3782 {
3783         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3784         struct control_gratious_arp *arp;
3785
3786         /* verify the size of indata */
3787         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3788                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3789                                  (unsigned)indata.dsize, 
3790                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3791                 return -1;
3792         }
3793         if (indata.dsize != 
3794                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3795                 + gratious_arp->len ) ){
3796
3797                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3798                         "but should be %u bytes\n", 
3799                          (unsigned)indata.dsize, 
3800                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3801                 return -1;
3802         }
3803
3804
3805         arp = talloc(ctdb, struct control_gratious_arp);
3806         CTDB_NO_MEMORY(ctdb, arp);
3807
3808         arp->ctdb  = ctdb;
3809         arp->addr   = gratious_arp->addr;
3810         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3811         CTDB_NO_MEMORY(ctdb, arp->iface);
3812         arp->count = 0;
3813         
3814         event_add_timed(arp->ctdb->ev, arp, 
3815                         timeval_zero(), send_gratious_arp, arp);
3816
3817         return 0;
3818 }
3819
3820 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3821 {
3822         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3823         int ret;
3824
3825         /* verify the size of indata */
3826         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3827                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3828                 return -1;
3829         }
3830         if (indata.dsize != 
3831                 ( offsetof(struct ctdb_control_ip_iface, iface)
3832                 + pub->len ) ){
3833
3834                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3835                         "but should be %u bytes\n", 
3836                          (unsigned)indata.dsize, 
3837                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3838                 return -1;
3839         }
3840
3841         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
3842
3843         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3844
3845         if (ret != 0) {
3846                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3847                 return -1;
3848         }
3849
3850         return 0;
3851 }
3852
/*
  Completion callback for the releaseip event started by
  ctdb_control_del_public_address().  All it does is free the context
  that was passed as private_data; the event status is ignored.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        (void)ctdb;
        (void)status;

        talloc_free(private_data);
}
3861
3862 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3863 {
3864         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3865         struct ctdb_vnn *vnn;
3866         int ret;
3867
3868         /* verify the size of indata */
3869         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3870                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3871                 return -1;
3872         }
3873         if (indata.dsize != 
3874                 ( offsetof(struct ctdb_control_ip_iface, iface)
3875                 + pub->len ) ){
3876
3877                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3878                         "but should be %u bytes\n", 
3879                          (unsigned)indata.dsize, 
3880                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3881                 return -1;
3882         }
3883
3884         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
3885
3886         /* walk over all public addresses until we find a match */
3887         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3888                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3889                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3890
3891                         DLIST_REMOVE(ctdb->vnn, vnn);
3892                         talloc_steal(mem_ctx, vnn);
3893                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3894                         if (vnn->pnn != ctdb->pnn) {
3895                                 if (vnn->iface != NULL) {
3896                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3897                                 }
3898                                 talloc_free(mem_ctx);
3899                                 return 0;
3900                         }
3901                         vnn->pnn = -1;
3902
3903                         ret = ctdb_event_script_callback(ctdb, 
3904                                          mem_ctx, delete_ip_callback, mem_ctx,
3905                                          false,
3906                                          CTDB_EVENT_RELEASE_IP,
3907                                          "%s %s %u",
3908                                          ctdb_vnn_iface_string(vnn),
3909                                          ctdb_addr_to_str(&vnn->public_address),
3910                                          vnn->public_netmask_bits);
3911                         if (vnn->iface != NULL) {
3912                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3913                         }
3914                         if (ret != 0) {
3915                                 return -1;
3916                         }
3917                         return 0;
3918                 }
3919         }
3920
3921         return -1;
3922 }
3923
3924
/*
  State carried from ctdb_control_ipreallocated() to its completion
  callback.
 */
struct ipreallocated_callback_state {
        /* the control request that is replied to once the event has
           finished */
        struct ctdb_req_control *c;
};
3928
3929 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3930                                         int status, void *p)
3931 {
3932         struct ipreallocated_callback_state *state =
3933                 talloc_get_type(p, struct ipreallocated_callback_state);
3934
3935         if (status != 0) {
3936                 DEBUG(DEBUG_ERR,
3937                       (" \"ipreallocated\" event script failed (status %d)\n",
3938                        status));
3939                 if (status == -ETIME) {
3940                         ctdb_ban_self(ctdb);
3941                 }
3942         }
3943
3944         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3945         talloc_free(state);
3946 }
3947
3948 /* A control to run the ipreallocated event */
3949 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3950                                    struct ctdb_req_control *c,
3951                                    bool *async_reply)
3952 {
3953         int ret;
3954         struct ipreallocated_callback_state *state;
3955
3956         state = talloc(ctdb, struct ipreallocated_callback_state);
3957         CTDB_NO_MEMORY(ctdb, state);
3958
3959         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3960
3961         ret = ctdb_event_script_callback(ctdb, state,
3962                                          ctdb_ipreallocated_callback, state,
3963                                          false, CTDB_EVENT_IPREALLOCATED,
3964                                          "%s", "");
3965
3966         if (ret != 0) {
3967                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3968                 talloc_free(state);
3969                 return -1;
3970         }
3971
3972         /* tell the control that we will be reply asynchronously */
3973         state->c    = talloc_steal(state, c);
3974         *async_reply = true;
3975
3976         return 0;
3977 }
3978
3979
3980 /* This function is called from the recovery daemon to verify that a remote
3981    node has the expected ip allocation.
3982    This is verified against ctdb->ip_tree
3983 */
3984 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3985 {
3986         struct ctdb_public_ip_list *tmp_ip; 
3987         int i;
3988
3989         if (ctdb->ip_tree == NULL) {
3990                 /* dont know the expected allocation yet, assume remote node
3991                    is correct. */
3992                 return 0;
3993         }
3994
3995         if (ips == NULL) {
3996                 return 0;
3997         }
3998
3999         for (i=0; i<ips->num; i++) {
4000                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4001                 if (tmp_ip == NULL) {
4002                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4003                         return -1;
4004                 }
4005
4006                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4007                         continue;
4008                 }
4009
4010                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4011                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
4012                         return -1;
4013                 }
4014         }
4015
4016         return 0;
4017 }
4018
4019 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4020 {
4021         struct ctdb_public_ip_list *tmp_ip; 
4022
4023         if (ctdb->ip_tree == NULL) {
4024                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4025                 return -1;
4026         }
4027
4028         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4029         if (tmp_ip == NULL) {
4030                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4031                 return -1;
4032         }
4033
4034         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4035         tmp_ip->pnn = ip->pnn;
4036
4037         return 0;
4038 }
4039
4040
/*
  State for one "reload public ips" request that is being processed by
  a forked child.
 */
struct ctdb_reloadips_handle {
	/* daemon context this handle belongs to */
	struct ctdb_context *ctdb;
	/* pending control request; replied to from the destructor */
	struct ctdb_req_control *c;
	/* status sent in the reply: starts at -1, set from the child's result */
	int status;
	/* pipe: the parent reads fd[0], the child writes fd[1] */
	int fd[2];
	/* pid of the forked child; sent SIGKILL from the destructor */
	pid_t child;
	/* fd event watching fd[0] for the child's result byte */
	struct fd_event *fde;
};
4049
4050 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4051 {
4052         if (h == h->ctdb->reload_ips) {
4053                 h->ctdb->reload_ips = NULL;
4054         }
4055         if (h->c != NULL) {
4056                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4057                 h->c = NULL;
4058         }
4059         ctdb_kill(h->ctdb, h->child, SIGKILL);
4060         return 0;
4061 }
4062
4063 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4064                                 struct timed_event *te,
4065                                 struct timeval t, void *private_data)
4066 {
4067         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4068
4069         talloc_free(h);
4070 }       
4071
4072 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4073                              uint16_t flags, void *private_data)
4074 {
4075         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4076
4077         char res;
4078         int ret;
4079
4080         ret = read(h->fd[0], &res, 1);
4081         if (ret < 1 || res != 0) {
4082                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4083                 res = 1;
4084         }
4085         h->status = res;
4086
4087         talloc_free(h);
4088 }
4089
4090 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4091 {
4092         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4093         struct ctdb_all_public_ips *ips;
4094         struct ctdb_vnn *vnn;
4095         int i, ret;
4096
4097         /* read the ip allocation from the local node */
4098         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4099         if (ret != 0) {
4100                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4101                 talloc_free(mem_ctx);
4102                 return -1;
4103         }
4104
4105         /* re-read the public ips file */
4106         ctdb->vnn = NULL;
4107         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4108                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4109                 talloc_free(mem_ctx);
4110                 return -1;
4111         }               
4112
4113
4114         /* check the previous list of ips and scan for ips that have been
4115            dropped.
4116          */
4117         for (i = 0; i < ips->num; i++) {
4118                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4119                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4120                                 break;
4121                         }
4122                 }
4123
4124                 /* we need to delete this ip, no longer available on this node */
4125                 if (vnn == NULL) {
4126                         struct ctdb_control_ip_iface pub;
4127
4128                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4129                         pub.addr  = ips->ips[i].addr;
4130                         pub.mask  = 0;
4131                         pub.len   = 0;
4132
4133                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4134                         if (ret != 0) {
4135                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4136                                 return -1;
4137                         }
4138                 }
4139         }
4140
4141
4142         /* loop over all new ones and check the ones we need to add */
4143         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4144                 for (i = 0; i < ips->num; i++) {
4145                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4146                                 break;
4147                         }
4148                 }
4149                 if (i == ips->num) {
4150                         struct ctdb_control_ip_iface pub;
4151                         const char *ifaces = NULL;
4152                         int iface = 0;
4153
4154                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4155
4156                         pub.addr  = vnn->public_address;
4157                         pub.mask  = vnn->public_netmask_bits;
4158
4159
4160                         ifaces = vnn->ifaces[0];
4161                         iface = 1;
4162                         while (vnn->ifaces[iface] != NULL) {
4163                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4164                                 iface++;
4165                         }
4166                         pub.len   = strlen(ifaces)+1;
4167                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4168
4169                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4170                         if (ret != 0) {
4171                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4172                                 return -1;
4173                         }
4174                 }
4175         }
4176
4177         return 0;
4178 }
4179
4180 /* This control is sent to force the node to re-read the public addresses file
4181    and drop any addresses we should nnot longer host, and add new addresses
4182    that we are now able to host
4183 */
4184 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4185 {
4186         struct ctdb_reloadips_handle *h;
4187         pid_t parent = getpid();
4188
4189         if (ctdb->reload_ips != NULL) {
4190                 talloc_free(ctdb->reload_ips);
4191                 ctdb->reload_ips = NULL;
4192         }
4193
4194         h = talloc(ctdb, struct ctdb_reloadips_handle);
4195         CTDB_NO_MEMORY(ctdb, h);
4196         h->ctdb     = ctdb;
4197         h->c        = NULL;
4198         h->status   = -1;
4199         
4200         if (pipe(h->fd) == -1) {
4201                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4202                 talloc_free(h);
4203                 return -1;
4204         }
4205
4206         h->child = ctdb_fork(ctdb);
4207         if (h->child == (pid_t)-1) {
4208                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4209                 close(h->fd[0]);
4210                 close(h->fd[1]);
4211                 talloc_free(h);
4212                 return -1;
4213         }
4214
4215         /* child process */
4216         if (h->child == 0) {
4217                 signed char res = 0;
4218
4219                 close(h->fd[0]);
4220                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4221
4222                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4223                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4224                         res = -1;
4225                 } else {
4226                         res = ctdb_reloadips_child(ctdb);
4227                         if (res != 0) {
4228                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4229                         }
4230                 }
4231
4232                 write(h->fd[1], &res, 1);
4233                 /* make sure we die when our parent dies */
4234                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4235                         sleep(5);
4236                 }
4237                 _exit(0);
4238         }
4239
4240         h->c             = talloc_steal(h, c);
4241
4242         close(h->fd[1]);
4243         set_close_on_exec(h->fd[0]);
4244
4245         talloc_set_destructor(h, ctdb_reloadips_destructor);
4246
4247
4248         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4249                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4250                         (void *)h);
4251         tevent_fd_set_auto_close(h->fde);
4252
4253         event_add_timed(ctdb->ev, h,
4254                         timeval_current_ofs(120, 0),
4255                         ctdb_reloadips_timeout_event, h);
4256
4257         /* we reply later */
4258         *async_reply = true;
4259         return 0;
4260 }