recoverd: reloadips should rebalance target nodes for new IPs
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout derived from the takeover_timeout tunable.  The expansion
 * refers to a variable named 'ctdb', which must be in scope at the
 * point of use.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP schedule: the ARP timer re-arms itself with an offset
 * of CTDB_ARP_INTERVAL until CTDB_ARP_REPEAT rounds have been sent.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
 * Flags consulted by the IP allocation algorithms.
 */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
};
41
/*
 * One local network interface that can host public addresses.
 * Entries are kept on the doubly linked list ctdb->ifaces.
 */
struct ctdb_iface {
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name, used as the lookup key */
        const char *name;
        /* interfaces whose link is down are skipped when picking one */
        bool link_up;
        /* number of VNNs currently assigned to this interface */
        uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;
285         uint32_t count;
286         ctdb_sock_addr addr;
287         struct ctdb_tcp_array *tcparray;
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;
307         uint32_t client_id;
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
400 struct takeover_callback_state {
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/*
  state carried from ctdb_do_takeip() to its event callback
 */
struct ctdb_do_takeip_state {
        /* the control request to reply to once the event finishes */
        struct ctdb_req_control *c;
        /* the public address being taken over */
        struct ctdb_vnn *vnn;
};
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          false,
514                                          CTDB_EVENT_TAKE_IP,
515                                          "%s %s %u",
516                                          ctdb_vnn_iface_string(vnn),
517                                          ctdb_addr_to_str(&vnn->public_address),
518                                          vnn->public_netmask_bits);
519
520         if (ret != 0) {
521                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
522                         ctdb_addr_to_str(&vnn->public_address),
523                         ctdb_vnn_iface_string(vnn)));
524                 talloc_free(state);
525                 return -1;
526         }
527
528         return 0;
529 }
530
/*
  state carried from ctdb_do_updateip() to its event callback
 */
struct ctdb_do_updateip_state {
        /* the control request to reply to once the event finishes */
        struct ctdb_req_control *c;
        /* interface the address was on before the move */
        struct ctdb_iface *old;
        /* the public address being moved */
        struct ctdb_vnn *vnn;
};
536
537 /*
538   called when updateip event finishes
539  */
540 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541                                       void *private_data)
542 {
543         struct ctdb_do_updateip_state *state =
544                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545         int32_t ret;
546
547         if (status != 0) {
548                 if (status == -ETIME) {
549                         ctdb_ban_self(ctdb);
550                 }
551                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552                         ctdb_addr_to_str(&state->vnn->public_address),
553                         state->old->name,
554                         ctdb_vnn_iface_string(state->vnn)));
555
556                 /*
557                  * All we can do is reset the old interface
558                  * and let the next run fix it
559                  */
560                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561                 state->vnn->iface = state->old;
562                 state->vnn->iface->references++;
563
564                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565                 talloc_free(state);
566                 return;
567         }
568
569         if (ctdb->do_checkpublicip) {
570
571         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572         if (ret != 0) {
573                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574                 talloc_free(state);
575                 return;
576         }
577
578         }
579
580         /* the control succeeded */
581         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582         talloc_free(state);
583         return;
584 }
585
586 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
587 {
588         state->vnn->update_in_flight = false;
589         return 0;
590 }
591
592 /*
593   update (move) an ip address
594  */
595 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596                                 struct ctdb_req_control *c,
597                                 struct ctdb_vnn *vnn)
598 {
599         int ret;
600         struct ctdb_do_updateip_state *state;
601         struct ctdb_iface *old = vnn->iface;
602         const char *new_name;
603
604         if (vnn->update_in_flight) {
605                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606                                     "update for this IP already in flight\n",
607                                     ctdb_addr_to_str(&vnn->public_address),
608                                     vnn->public_netmask_bits));
609                 return -1;
610         }
611
612         ctdb_vnn_unassign_iface(ctdb, vnn);
613         ret = ctdb_vnn_assign_iface(ctdb, vnn);
614         if (ret != 0) {
615                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616                                  "assin a usable interface (old iface '%s')\n",
617                                  ctdb_addr_to_str(&vnn->public_address),
618                                  vnn->public_netmask_bits,
619                                  old->name));
620                 return -1;
621         }
622
623         new_name = ctdb_vnn_iface_string(vnn);
624         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625                 /* A benign update from one interface onto itself.
626                  * no need to run the eventscripts in this case, just return
627                  * success.
628                  */
629                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630                 return 0;
631         }
632
633         state = talloc(vnn, struct ctdb_do_updateip_state);
634         CTDB_NO_MEMORY(ctdb, state);
635
636         state->c = talloc_steal(ctdb, c);
637         state->old = old;
638         state->vnn = vnn;
639
640         vnn->update_in_flight = true;
641         talloc_set_destructor(state, ctdb_updateip_destructor);
642
643         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644                             "interface %s to %s\n",
645                             ctdb_addr_to_str(&vnn->public_address),
646                             vnn->public_netmask_bits,
647                             old->name,
648                             new_name));
649
650         ret = ctdb_event_script_callback(ctdb,
651                                          state,
652                                          ctdb_do_updateip_callback,
653                                          state,
654                                          false,
655                                          CTDB_EVENT_UPDATE_IP,
656                                          "%s %s %s %u",
657                                          state->old->name,
658                                          new_name,
659                                          ctdb_addr_to_str(&vnn->public_address),
660                                          vnn->public_netmask_bits);
661         if (ret != 0) {
662                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
663                                  ctdb_addr_to_str(&vnn->public_address),
664                                  old->name, new_name));
665                 talloc_free(state);
666                 return -1;
667         }
668
669         return 0;
670 }
671
672 /*
673   Find the vnn of the node that has a public ip address
674   returns -1 if the address is not known as a public address
675  */
676 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
677 {
678         struct ctdb_vnn *vnn;
679
680         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681                 if (ctdb_same_ip(&vnn->public_address, addr)) {
682                         return vnn;
683                 }
684         }
685
686         return NULL;
687 }
688
689 /*
690   take over an ip address
691  */
692 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693                                  struct ctdb_req_control *c,
694                                  TDB_DATA indata,
695                                  bool *async_reply)
696 {
697         int ret;
698         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699         struct ctdb_vnn *vnn;
700         bool have_ip = false;
701         bool do_updateip = false;
702         bool do_takeip = false;
703         struct ctdb_iface *best_iface = NULL;
704
705         if (pip->pnn != ctdb->pnn) {
706                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707                                  "with pnn %d, but we're node %d\n",
708                                  ctdb_addr_to_str(&pip->addr),
709                                  pip->pnn, ctdb->pnn));
710                 return -1;
711         }
712
713         /* update out vnn list */
714         vnn = find_public_ip_vnn(ctdb, &pip->addr);
715         if (vnn == NULL) {
716                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717                         ctdb_addr_to_str(&pip->addr)));
718                 return 0;
719         }
720
721         if (ctdb->do_checkpublicip) {
722                 have_ip = ctdb_sys_have_ip(&pip->addr);
723         }
724         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725         if (best_iface == NULL) {
726                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727                                  "a usable interface (old %s, have_ip %d)\n",
728                                  ctdb_addr_to_str(&vnn->public_address),
729                                  vnn->public_netmask_bits,
730                                  ctdb_vnn_iface_string(vnn),
731                                  have_ip));
732                 return -1;
733         }
734
735         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737                 have_ip = false;
738         }
739
740
741         if (vnn->iface == NULL && have_ip) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744                                  ctdb_addr_to_str(&vnn->public_address)));
745                 return 0;
746         }
747
748         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750                                   "and we have it on iface[%s], but it was assigned to node %d"
751                                   "and we are node %d, banning ourself\n",
752                                  ctdb_addr_to_str(&vnn->public_address),
753                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754                 ctdb_ban_self(ctdb);
755                 return -1;
756         }
757
758         if (vnn->pnn == -1 && have_ip) {
759                 vnn->pnn = ctdb->pnn;
760                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761                                   "and we already have it on iface[%s], update local daemon\n",
762                                  ctdb_addr_to_str(&vnn->public_address),
763                                   ctdb_vnn_iface_string(vnn)));
764                 return 0;
765         }
766
767         if (vnn->iface) {
768                 if (vnn->iface != best_iface) {
769                         if (!vnn->iface->link_up) {
770                                 do_updateip = true;
771                         } else if (vnn->iface->references > (best_iface->references + 1)) {
772                                 /* only move when the rebalance gains something */
773                                         do_updateip = true;
774                         }
775                 }
776         }
777
778         if (!have_ip) {
779                 if (do_updateip) {
780                         ctdb_vnn_unassign_iface(ctdb, vnn);
781                         do_updateip = false;
782                 }
783                 do_takeip = true;
784         }
785
786         if (do_takeip) {
787                 ret = ctdb_do_takeip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else if (do_updateip) {
792                 ret = ctdb_do_updateip(ctdb, c, vnn);
793                 if (ret != 0) {
794                         return -1;
795                 }
796         } else {
797                 /*
798                  * The interface is up and the kernel known the ip
799                  * => do nothing
800                  */
801                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802                         ctdb_addr_to_str(&pip->addr),
803                         vnn->public_netmask_bits,
804                         ctdb_vnn_iface_string(vnn)));
805                 return 0;
806         }
807
808         /* tell ctdb_control.c that we will be replying asynchronously */
809         *async_reply = true;
810
811         return 0;
812 }
813
814 /*
815   takeover an ip address old v4 style
816  */
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
818                                 struct ctdb_req_control *c,
819                                 TDB_DATA indata, 
820                                 bool *async_reply)
821 {
822         TDB_DATA data;
823         
824         data.dsize = sizeof(struct ctdb_public_ip);
825         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826         CTDB_NO_MEMORY(ctdb, data.dptr);
827         
828         memcpy(data.dptr, indata.dptr, indata.dsize);
829         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
830 }
831
832 /*
833   kill any clients that are registered with a IP that is being released
834  */
835 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
836 {
837         struct ctdb_client_ip *ip;
838
839         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
840                 ctdb_addr_to_str(addr)));
841
842         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
843                 ctdb_sock_addr tmp_addr;
844
845                 tmp_addr = ip->addr;
846                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
847                         ip->client_id,
848                         ctdb_addr_to_str(&ip->addr)));
849
850                 if (ctdb_same_ip(&tmp_addr, addr)) {
851                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
852                                                                      ip->client_id, 
853                                                                      struct ctdb_client);
854                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
855                                 ip->client_id,
856                                 ctdb_addr_to_str(&ip->addr),
857                                 client->pid));
858
859                         if (client->pid != 0) {
860                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
861                                         (unsigned)client->pid,
862                                         ctdb_addr_to_str(addr),
863                                         ip->client_id));
864                                 kill(client->pid, SIGKILL);
865                         }
866                 }
867         }
868 }
869
870 /*
871   called when releaseip event finishes
872  */
873 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
874                                 void *private_data)
875 {
876         struct takeover_callback_state *state = 
877                 talloc_get_type(private_data, struct takeover_callback_state);
878         TDB_DATA data;
879
880         if (status == -ETIME) {
881                 ctdb_ban_self(ctdb);
882         }
883
884         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
885                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
886                                   ctdb_addr_to_str(state->addr)));
887                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
888                 talloc_free(state);
889                 return;
890         }
891
892         /* send a message to all clients of this node telling them
893            that the cluster has been reconfigured and they should
894            release any sockets on this IP */
895         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
896         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
897         data.dsize = strlen((char *)data.dptr)+1;
898
899         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
900
901         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
902
903         /* kill clients that have registered with this IP */
904         release_kill_clients(ctdb, state->addr);
905
906         ctdb_vnn_unassign_iface(ctdb, state->vnn);
907
908         /* the control succeeded */
909         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
910         talloc_free(state);
911 }
912
913 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
914 {
915         state->vnn->update_in_flight = false;
916         return 0;
917 }
918
919 /*
920   release an ip address
921  */
922 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
923                                 struct ctdb_req_control *c,
924                                 TDB_DATA indata, 
925                                 bool *async_reply)
926 {
927         int ret;
928         struct takeover_callback_state *state;
929         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
930         struct ctdb_vnn *vnn;
931         char *iface;
932
933         /* update our vnn list */
934         vnn = find_public_ip_vnn(ctdb, &pip->addr);
935         if (vnn == NULL) {
936                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
937                         ctdb_addr_to_str(&pip->addr)));
938                 return 0;
939         }
940         vnn->pnn = pip->pnn;
941
942         /* stop any previous arps */
943         talloc_free(vnn->takeover_ctx);
944         vnn->takeover_ctx = NULL;
945
946         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
947          * lazy multicast to drop an IP from any node that isn't the
948          * intended new node.  The following causes makes ctdbd ignore
949          * a release for any address it doesn't host.
950          */
951         if (ctdb->do_checkpublicip) {
952                 if (!ctdb_sys_have_ip(&pip->addr)) {
953                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
954                                 ctdb_addr_to_str(&pip->addr),
955                                 vnn->public_netmask_bits,
956                                 ctdb_vnn_iface_string(vnn)));
957                         ctdb_vnn_unassign_iface(ctdb, vnn);
958                         return 0;
959                 }
960         } else {
961                 if (vnn->iface == NULL) {
962                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
963                                            ctdb_addr_to_str(&pip->addr),
964                                            vnn->public_netmask_bits));
965                         return 0;
966                 }
967         }
968
969         /* There is a potential race between take_ip and us because we
970          * update the VNN via a callback that run when the
971          * eventscripts have been run.  Avoid the race by allowing one
972          * update to be in flight at a time.
973          */
974         if (vnn->update_in_flight) {
975                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
976                                     "update for this IP already in flight\n",
977                                     ctdb_addr_to_str(&vnn->public_address),
978                                     vnn->public_netmask_bits));
979                 return -1;
980         }
981
982         if (ctdb->do_checkpublicip) {
983                 iface = ctdb_sys_find_ifname(&pip->addr);
984                 if (iface == NULL) {
985                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
986                         return 0;
987                 }
988                 if (vnn->iface == NULL) {
989                         DEBUG(DEBUG_WARNING,
990                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
991                                ctdb_addr_to_str(&pip->addr),
992                                iface));
993                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
994                         DEBUG(DEBUG_WARNING,
995                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
996                                ctdb_addr_to_str(&pip->addr),
997                                iface,
998                                ctdb_vnn_iface_string(vnn)));
999                         /* Should we fix vnn->iface?  If we do, what
1000                          * happens to reference counts?
1001                          */
1002                 }
1003         } else {
1004                 iface = strdup(ctdb_vnn_iface_string(vnn));
1005         }
1006
1007         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1008                 ctdb_addr_to_str(&pip->addr),
1009                 vnn->public_netmask_bits,
1010                 iface,
1011                 pip->pnn));
1012
1013         state = talloc(ctdb, struct takeover_callback_state);
1014         CTDB_NO_MEMORY(ctdb, state);
1015
1016         state->c = talloc_steal(state, c);
1017         state->addr = talloc(state, ctdb_sock_addr);       
1018         CTDB_NO_MEMORY(ctdb, state->addr);
1019         *state->addr = pip->addr;
1020         state->vnn   = vnn;
1021
1022         vnn->update_in_flight = true;
1023         talloc_set_destructor(state, ctdb_releaseip_destructor);
1024
1025         ret = ctdb_event_script_callback(ctdb, 
1026                                          state, release_ip_callback, state,
1027                                          false,
1028                                          CTDB_EVENT_RELEASE_IP,
1029                                          "%s %s %u",
1030                                          iface,
1031                                          ctdb_addr_to_str(&pip->addr),
1032                                          vnn->public_netmask_bits);
1033         free(iface);
1034         if (ret != 0) {
1035                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1036                         ctdb_addr_to_str(&pip->addr),
1037                         ctdb_vnn_iface_string(vnn)));
1038                 talloc_free(state);
1039                 return -1;
1040         }
1041
1042         /* tell the control that we will be reply asynchronously */
1043         *async_reply = true;
1044         return 0;
1045 }
1046
1047 /*
1048   release an ip address old v4 style
1049  */
1050 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1051                                 struct ctdb_req_control *c,
1052                                 TDB_DATA indata, 
1053                                 bool *async_reply)
1054 {
1055         TDB_DATA data;
1056         
1057         data.dsize = sizeof(struct ctdb_public_ip);
1058         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1059         CTDB_NO_MEMORY(ctdb, data.dptr);
1060         
1061         memcpy(data.dptr, indata.dptr, indata.dsize);
1062         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1063 }
1064
1065
1066 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1067                                    ctdb_sock_addr *addr,
1068                                    unsigned mask, const char *ifaces,
1069                                    bool check_address)
1070 {
1071         struct ctdb_vnn      *vnn;
1072         uint32_t num = 0;
1073         char *tmp;
1074         const char *iface;
1075         int i;
1076         int ret;
1077
1078         tmp = strdup(ifaces);
1079         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1080                 if (!ctdb_sys_check_iface_exists(iface)) {
1081                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1082                         free(tmp);
1083                         return -1;
1084                 }
1085         }
1086         free(tmp);
1087
1088         /* Verify that we dont have an entry for this ip yet */
1089         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1090                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1091                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1092                                 ctdb_addr_to_str(addr)));
1093                         return -1;
1094                 }               
1095         }
1096
1097         /* create a new vnn structure for this ip address */
1098         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1099         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1100         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1101         tmp = talloc_strdup(vnn, ifaces);
1102         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1103         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1104                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1105                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1106                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1107                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1108                 num++;
1109         }
1110         talloc_free(tmp);
1111         vnn->ifaces[num] = NULL;
1112         vnn->public_address      = *addr;
1113         vnn->public_netmask_bits = mask;
1114         vnn->pnn                 = -1;
1115         if (check_address) {
1116                 if (ctdb_sys_have_ip(addr)) {
1117                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1118                         vnn->pnn = ctdb->pnn;
1119                 }
1120         }
1121
1122         for (i=0; vnn->ifaces[i]; i++) {
1123                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1124                 if (ret != 0) {
1125                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1126                                            "for public_address[%s]\n",
1127                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1128                         talloc_free(vnn);
1129                         return -1;
1130                 }
1131         }
1132
1133         DLIST_ADD(ctdb->vnn, vnn);
1134
1135         return 0;
1136 }
1137
/*
  setup the event script directory

  Stores a copy of script_dir, allocated on ctdb, in
  ctdb->event_script_dir.  Returns 0 on success; a failed copy is
  handled by CTDB_NO_MEMORY.
*/
int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
{
        ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
        CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
        return 0;
}
1147
1148 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1149                                   struct timeval t, void *private_data)
1150 {
1151         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1152                                                         struct ctdb_context);
1153         struct ctdb_vnn *vnn;
1154
1155         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1156                 int i;
1157
1158                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1159                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1160                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1161                                         vnn->ifaces[i],
1162                                         ctdb_addr_to_str(&vnn->public_address)));
1163                         }
1164                 }
1165         }
1166
1167         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1168                 timeval_current_ofs(30, 0), 
1169                 ctdb_check_interfaces_event, ctdb);
1170 }
1171
1172
1173 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1174 {
1175         if (ctdb->check_public_ifaces_ctx != NULL) {
1176                 talloc_free(ctdb->check_public_ifaces_ctx);
1177                 ctdb->check_public_ifaces_ctx = NULL;
1178         }
1179
1180         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1181         if (ctdb->check_public_ifaces_ctx == NULL) {
1182                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1183         }
1184
1185         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1186                 timeval_current_ofs(30, 0), 
1187                 ctdb_check_interfaces_event, ctdb);
1188
1189         return 0;
1190 }
1191
1192
1193 /*
1194   setup the public address lists from a file
1195 */
1196 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1197 {
1198         char **lines;
1199         int nlines;
1200         int i;
1201
1202         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1203         if (lines == NULL) {
1204                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1205                 return -1;
1206         }
1207         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1208                 nlines--;
1209         }
1210
1211         for (i=0;i<nlines;i++) {
1212                 unsigned mask;
1213                 ctdb_sock_addr addr;
1214                 const char *addrstr;
1215                 const char *ifaces;
1216                 char *tok, *line;
1217
1218                 line = lines[i];
1219                 while ((*line == ' ') || (*line == '\t')) {
1220                         line++;
1221                 }
1222                 if (*line == '#') {
1223                         continue;
1224                 }
1225                 if (strcmp(line, "") == 0) {
1226                         continue;
1227                 }
1228                 tok = strtok(line, " \t");
1229                 addrstr = tok;
1230                 tok = strtok(NULL, " \t");
1231                 if (tok == NULL) {
1232                         if (NULL == ctdb->default_public_interface) {
1233                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1234                                          i+1));
1235                                 talloc_free(lines);
1236                                 return -1;
1237                         }
1238                         ifaces = ctdb->default_public_interface;
1239                 } else {
1240                         ifaces = tok;
1241                 }
1242
1243                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1244                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1245                         talloc_free(lines);
1246                         return -1;
1247                 }
1248                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1249                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1250                         talloc_free(lines);
1251                         return -1;
1252                 }
1253         }
1254
1255
1256         talloc_free(lines);
1257         return 0;
1258 }
1259
1260 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1261                               const char *iface,
1262                               const char *ip)
1263 {
1264         struct ctdb_vnn *svnn;
1265         struct ctdb_iface *cur = NULL;
1266         bool ok;
1267         int ret;
1268
1269         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1270         CTDB_NO_MEMORY(ctdb, svnn);
1271
1272         svnn->ifaces = talloc_array(svnn, const char *, 2);
1273         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1274         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1275         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1276         svnn->ifaces[1] = NULL;
1277
1278         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1279         if (!ok) {
1280                 talloc_free(svnn);
1281                 return -1;
1282         }
1283
1284         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1285         if (ret != 0) {
1286                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1287                                    "for single_ip[%s]\n",
1288                                    svnn->ifaces[0],
1289                                    ctdb_addr_to_str(&svnn->public_address)));
1290                 talloc_free(svnn);
1291                 return -1;
1292         }
1293
1294         /* assume the single public ip interface is initially "good" */
1295         cur = ctdb_find_iface(ctdb, iface);
1296         if (cur == NULL) {
1297                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1298                 return -1;
1299         }
1300         cur->link_up = true;
1301
1302         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1303         if (ret != 0) {
1304                 talloc_free(svnn);
1305                 return -1;
1306         }
1307
1308         ctdb->single_ip_vnn = svnn;
1309         return 0;
1310 }
1311
/* One public IP address in the singly linked list that the IP
 * allocation code in this file works on.
 */
struct ctdb_public_ip_list {
        /* next entry, NULL at the end of the list */
        struct ctdb_public_ip_list *next;
        /* node the address is assigned to; -1 is used for "unassigned" */
        uint32_t pnn;
        /* the public address */
        ctdb_sock_addr addr;
};
1317
1318 /* Given a physical node, return the number of
1319    public addresses that is currently assigned to this node.
1320 */
1321 static int node_ip_coverage(struct ctdb_context *ctdb, 
1322         int32_t pnn,
1323         struct ctdb_public_ip_list *ips)
1324 {
1325         int num=0;
1326
1327         for (;ips;ips=ips->next) {
1328                 if (ips->pnn == pnn) {
1329                         num++;
1330                 }
1331         }
1332         return num;
1333 }
1334
1335
1336 /* Can the given node host the given IP: is the public IP known to the
1337  * node and is NOIPHOST unset?
1338 */
1339 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1340                              struct ctdb_ipflags ipflags,
1341                              struct ctdb_public_ip_list *ip)
1342 {
1343         struct ctdb_all_public_ips *public_ips;
1344         int i;
1345
1346         if (ipflags.noiphost) {
1347                 return false;
1348         }
1349
1350         public_ips = ctdb->nodes[pnn]->available_public_ips;
1351
1352         if (public_ips == NULL) {
1353                 return false;
1354         }
1355
1356         for (i=0; i<public_ips->num; i++) {
1357                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1358                         /* yes, this node can serve this public ip */
1359                         return true;
1360                 }
1361         }
1362
1363         return false;
1364 }
1365
1366 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1367                                  struct ctdb_ipflags ipflags,
1368                                  struct ctdb_public_ip_list *ip)
1369 {
1370         if (ipflags.noiptakeover) {
1371                 return false;
1372         }
1373
1374         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1375 }
1376
1377 /* search the node lists list for a node to takeover this ip.
1378    pick the node that currently are serving the least number of ips
1379    so that the ips get spread out evenly.
1380 */
1381 static int find_takeover_node(struct ctdb_context *ctdb, 
1382                 struct ctdb_ipflags *ipflags,
1383                 struct ctdb_public_ip_list *ip,
1384                 struct ctdb_public_ip_list *all_ips)
1385 {
1386         int pnn, min=0, num;
1387         int i, numnodes;
1388
1389         numnodes = talloc_array_length(ipflags);
1390         pnn    = -1;
1391         for (i=0; i<numnodes; i++) {
1392                 /* verify that this node can serve this ip */
1393                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1394                         /* no it couldnt   so skip to the next node */
1395                         continue;
1396                 }
1397
1398                 num = node_ip_coverage(ctdb, i, all_ips);
1399                 /* was this the first node we checked ? */
1400                 if (pnn == -1) {
1401                         pnn = i;
1402                         min  = num;
1403                 } else {
1404                         if (num < min) {
1405                                 pnn = i;
1406                                 min  = num;
1407                         }
1408                 }
1409         }       
1410         if (pnn == -1) {
1411                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1412                         ctdb_addr_to_str(&ip->addr)));
1413
1414                 return -1;
1415         }
1416
1417         ip->pnn = pnn;
1418         return 0;
1419 }
1420
1421 #define IP_KEYLEN       4
1422 static uint32_t *ip_key(ctdb_sock_addr *ip)
1423 {
1424         static uint32_t key[IP_KEYLEN];
1425
1426         bzero(key, sizeof(key));
1427
1428         switch (ip->sa.sa_family) {
1429         case AF_INET:
1430                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1431                 break;
1432         case AF_INET6: {
1433                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1434                 key[0]  = htonl(s6_a32[0]);
1435                 key[1]  = htonl(s6_a32[1]);
1436                 key[2]  = htonl(s6_a32[2]);
1437                 key[3]  = htonl(s6_a32[3]);
1438                 break;
1439         }
1440         default:
1441                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1442                 return key;
1443         }
1444
1445         return key;
1446 }
1447
1448 static void *add_ip_callback(void *parm, void *data)
1449 {
1450         struct ctdb_public_ip_list *this_ip = parm; 
1451         struct ctdb_public_ip_list *prev_ip = data; 
1452
1453         if (prev_ip == NULL) {
1454                 return parm;
1455         }
1456         if (this_ip->pnn == -1) {
1457                 this_ip->pnn = prev_ip->pnn;
1458         }
1459
1460         return parm;
1461 }
1462
1463 static int getips_count_callback(void *param, void *data)
1464 {
1465         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1466         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1467
1468         new_ip->next = *ip_list;
1469         *ip_list     = new_ip;
1470         return 0;
1471 }
1472
1473 static struct ctdb_public_ip_list *
1474 create_merged_ip_list(struct ctdb_context *ctdb)
1475 {
1476         int i, j;
1477         struct ctdb_public_ip_list *ip_list;
1478         struct ctdb_all_public_ips *public_ips;
1479
1480         if (ctdb->ip_tree != NULL) {
1481                 talloc_free(ctdb->ip_tree);
1482                 ctdb->ip_tree = NULL;
1483         }
1484         ctdb->ip_tree = trbt_create(ctdb, 0);
1485
1486         for (i=0;i<ctdb->num_nodes;i++) {
1487                 public_ips = ctdb->nodes[i]->known_public_ips;
1488
1489                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1490                         continue;
1491                 }
1492
1493                 /* there were no public ips for this node */
1494                 if (public_ips == NULL) {
1495                         continue;
1496                 }               
1497
1498                 for (j=0;j<public_ips->num;j++) {
1499                         struct ctdb_public_ip_list *tmp_ip; 
1500
1501                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1502                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503                         /* Do not use information about IP addresses hosted
1504                          * on other nodes, it may not be accurate */
1505                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1507                         } else {
1508                                 tmp_ip->pnn = -1;
1509                         }
1510                         tmp_ip->addr = public_ips->ips[j].addr;
1511                         tmp_ip->next = NULL;
1512
1513                         trbt_insertarray32_callback(ctdb->ip_tree,
1514                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1515                                 add_ip_callback,
1516                                 tmp_ip);
1517                 }
1518         }
1519
1520         ip_list = NULL;
1521         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1522
1523         return ip_list;
1524 }
1525
1526 /* 
1527  * This is the length of the longtest common prefix between the IPs.
1528  * It is calculated by XOR-ing the 2 IPs together and counting the
1529  * number of leading zeroes.  The implementation means that all
1530  * addresses end up being 128 bits long.
1531  *
1532  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534  * lots of nodes and IP addresses?
1535  */
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1537 {
1538         uint32_t ip1_k[IP_KEYLEN];
1539         uint32_t *t;
1540         int i;
1541         uint32_t x;
1542
1543         uint32_t distance = 0;
1544
1545         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1546         t = ip_key(ip2);
1547         for (i=0; i<IP_KEYLEN; i++) {
1548                 x = ip1_k[i] ^ t[i];
1549                 if (x == 0) {
1550                         distance += 32;
1551                 } else {
1552                         /* Count number of leading zeroes. 
1553                          * FIXME? This could be optimised...
1554                          */
1555                         while ((x & (1 << 31)) == 0) {
1556                                 x <<= 1;
1557                                 distance += 1;
1558                         }
1559                 }
1560         }
1561
1562         return distance;
1563 }
1564
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566    given node.  The ips argument is generally the all_ips variable
1567    used in the main part of the algorithm.
1568  */
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570                                   struct ctdb_public_ip_list *ips,
1571                                   int pnn)
1572 {
1573         struct ctdb_public_ip_list *t;
1574         uint32_t d;
1575
1576         uint32_t sum = 0;
1577
1578         for (t=ips; t != NULL; t=t->next) {
1579                 if (t->pnn != pnn) {
1580                         continue;
1581                 }
1582
1583                 /* Optimisation: We never calculate the distance
1584                  * between an address and itself.  This allows us to
1585                  * calculate the effect of removing an address from a
1586                  * node by simply calculating the distance between
1587                  * that address and all of the exitsing addresses.
1588                  * Moreover, we assume that we're only ever dealing
1589                  * with addresses from all_ips so we can identify an
1590                  * address via a pointer rather than doing a more
1591                  * expensive address comparison. */
1592                 if (&(t->addr) == ip) {
1593                         continue;
1594                 }
1595
1596                 d = ip_distance(ip, &(t->addr));
1597                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1598         }
1599
1600         return sum;
1601 }
1602
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
1604    to the given node.
1605  */
1606 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1607 {
1608         struct ctdb_public_ip_list *t;
1609
1610         uint32_t imbalance = 0;
1611
1612         for (t=all_ips; t!=NULL; t=t->next) {
1613                 if (t->pnn != pnn) {
1614                         continue;
1615                 }
1616                 /* Pass the rest of the IPs rather than the whole
1617                    all_ips input list.
1618                 */
1619                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1620         }
1621
1622         return imbalance;
1623 }
1624
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626  * finding the best node for each.
1627  */
1628 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1629                                       struct ctdb_ipflags *ipflags,
1630                                       struct ctdb_public_ip_list *all_ips)
1631 {
1632         struct ctdb_public_ip_list *tmp_ip;
1633
1634         /* loop over all ip's and find a physical node to cover for 
1635            each unassigned ip.
1636         */
1637         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638                 if (tmp_ip->pnn == -1) {
1639                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1640                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1641                                         ctdb_addr_to_str(&tmp_ip->addr)));
1642                         }
1643                 }
1644         }
1645 }
1646
1647 /* Basic non-deterministic rebalancing algorithm.
1648  */
1649 static void basic_failback(struct ctdb_context *ctdb,
1650                            struct ctdb_ipflags *ipflags,
1651                            struct ctdb_public_ip_list *all_ips,
1652                            int num_ips)
1653 {
1654         int i, numnodes;
1655         int maxnode, maxnum, minnode, minnum, num, retries;
1656         struct ctdb_public_ip_list *tmp_ip;
1657
1658         numnodes = talloc_array_length(ipflags);
1659         retries = 0;
1660
1661 try_again:
1662         maxnum=0;
1663         minnum=0;
1664
1665         /* for each ip address, loop over all nodes that can serve
1666            this ip and make sure that the difference between the node
1667            serving the most and the node serving the least ip's are
1668            not greater than 1.
1669         */
1670         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1671                 if (tmp_ip->pnn == -1) {
1672                         continue;
1673                 }
1674
1675                 /* Get the highest and lowest number of ips's served by any 
1676                    valid node which can serve this ip.
1677                 */
1678                 maxnode = -1;
1679                 minnode = -1;
1680                 for (i=0; i<numnodes; i++) {
1681                         /* only check nodes that can actually serve this ip */
1682                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1683                                 /* no it couldnt   so skip to the next node */
1684                                 continue;
1685                         }
1686
1687                         num = node_ip_coverage(ctdb, i, all_ips);
1688                         if (maxnode == -1) {
1689                                 maxnode = i;
1690                                 maxnum  = num;
1691                         } else {
1692                                 if (num > maxnum) {
1693                                         maxnode = i;
1694                                         maxnum  = num;
1695                                 }
1696                         }
1697                         if (minnode == -1) {
1698                                 minnode = i;
1699                                 minnum  = num;
1700                         } else {
1701                                 if (num < minnum) {
1702                                         minnode = i;
1703                                         minnum  = num;
1704                                 }
1705                         }
1706                 }
1707                 if (maxnode == -1) {
1708                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1709                                 ctdb_addr_to_str(&tmp_ip->addr)));
1710
1711                         continue;
1712                 }
1713
1714                 /* if the spread between the smallest and largest coverage by
1715                    a node is >=2 we steal one of the ips from the node with
1716                    most coverage to even things out a bit.
1717                    try to do this a limited number of times since we dont
1718                    want to spend too much time balancing the ip coverage.
1719                 */
1720                 if ( (maxnum > minnum+1)
1721                      && (retries < (num_ips + 5)) ){
1722                         struct ctdb_public_ip_list *tmp;
1723
1724                         /* Reassign one of maxnode's VNNs */
1725                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1726                                 if (tmp->pnn == maxnode) {
1727                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1728                                         retries++;
1729                                         goto try_again;;
1730                                 }
1731                         }
1732                 }
1733         }
1734 }
1735
1736 static void lcp2_init(struct ctdb_context *tmp_ctx,
1737                       struct ctdb_ipflags *ipflags,
1738                       struct ctdb_public_ip_list *all_ips,
1739                       uint32_t *force_rebalance_nodes,
1740                       uint32_t **lcp2_imbalances,
1741                       bool **rebalance_candidates)
1742 {
1743         int i, numnodes;
1744         struct ctdb_public_ip_list *tmp_ip;
1745
1746         numnodes = talloc_array_length(ipflags);
1747
1748         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1750         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1751         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752
1753         for (i=0; i<numnodes; i++) {
1754                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1755                 /* First step: assume all nodes are candidates */
1756                 (*rebalance_candidates)[i] = true;
1757         }
1758
1759         /* 2nd step: if a node has IPs assigned then it must have been
1760          * healthy before, so we remove it from consideration.  This
1761          * is overkill but is all we have because we don't maintain
1762          * state between takeover runs.  An alternative would be to
1763          * keep state and invalidate it every time the recovery master
1764          * changes.
1765          */
1766         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1767                 if (tmp_ip->pnn != -1) {
1768                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1769                 }
1770         }
1771
1772         /* 3rd step: if a node is forced to re-balance then
1773            we allow failback onto the node */
1774         if (force_rebalance_nodes == NULL) {
1775                 return;
1776         }
1777         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1778                 uint32_t pnn = force_rebalance_nodes[i];
1779                 if (pnn >= numnodes) {
1780                         DEBUG(DEBUG_ERR,
1781                               (__location__ "unknown node %u\n", pnn));
1782                         continue;
1783                 }
1784
1785                 DEBUG(DEBUG_NOTICE,
1786                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1787                 (*rebalance_candidates)[pnn] = true;
1788         }
1789 }
1790
1791 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1792  * the IP/node combination that will cost the least.
1793  */
1794 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1795                                      struct ctdb_ipflags *ipflags,
1796                                      struct ctdb_public_ip_list *all_ips,
1797                                      uint32_t *lcp2_imbalances)
1798 {
1799         struct ctdb_public_ip_list *tmp_ip;
1800         int dstnode, numnodes;
1801
1802         int minnode;
1803         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1804         struct ctdb_public_ip_list *minip;
1805
1806         bool should_loop = true;
1807         bool have_unassigned = true;
1808
1809         numnodes = talloc_array_length(ipflags);
1810
1811         while (have_unassigned && should_loop) {
1812                 should_loop = false;
1813
1814                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1815                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1816
1817                 minnode = -1;
1818                 mindsum = 0;
1819                 minip = NULL;
1820
1821                 /* loop over each unassigned ip. */
1822                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1823                         if (tmp_ip->pnn != -1) {
1824                                 continue;
1825                         }
1826
1827                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1828                                 /* only check nodes that can actually takeover this ip */
1829                                 if (!can_node_takeover_ip(ctdb, dstnode,
1830                                                           ipflags[dstnode],
1831                                                           tmp_ip)) {
1832                                         /* no it couldnt   so skip to the next node */
1833                                         continue;
1834                                 }
1835
1836                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1837                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1838                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1839                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1840                                                    dstnode,
1841                                                    dstimbl - lcp2_imbalances[dstnode]));
1842
1843
1844                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1845                                         minnode = dstnode;
1846                                         minimbl = dstimbl;
1847                                         mindsum = dstdsum;
1848                                         minip = tmp_ip;
1849                                         should_loop = true;
1850                                 }
1851                         }
1852                 }
1853
1854                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1855
1856                 /* If we found one then assign it to the given node. */
1857                 if (minnode != -1) {
1858                         minip->pnn = minnode;
1859                         lcp2_imbalances[minnode] = minimbl;
1860                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1861                                           ctdb_addr_to_str(&(minip->addr)),
1862                                           minnode,
1863                                           mindsum));
1864                 }
1865
1866                 /* There might be a better way but at least this is clear. */
1867                 have_unassigned = false;
1868                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1869                         if (tmp_ip->pnn == -1) {
1870                                 have_unassigned = true;
1871                         }
1872                 }
1873         }
1874
1875         /* We know if we have an unassigned addresses so we might as
1876          * well optimise.
1877          */
1878         if (have_unassigned) {
1879                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1880                         if (tmp_ip->pnn == -1) {
1881                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1882                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1883                         }
1884                 }
1885         }
1886 }
1887
1888 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1889  * to move IPs from, determines the best IP/destination node
1890  * combination to move from the source node.
1891  */
1892 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1893                                     struct ctdb_ipflags *ipflags,
1894                                     struct ctdb_public_ip_list *all_ips,
1895                                     int srcnode,
1896                                     uint32_t candimbl,
1897                                     uint32_t *lcp2_imbalances,
1898                                     bool *rebalance_candidates)
1899 {
1900         int dstnode, mindstnode, numnodes;
1901         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1902         uint32_t minsrcimbl, mindstimbl;
1903         struct ctdb_public_ip_list *minip;
1904         struct ctdb_public_ip_list *tmp_ip;
1905
1906         /* Find an IP and destination node that best reduces imbalance. */
1907         srcimbl = 0;
1908         minip = NULL;
1909         minsrcimbl = 0;
1910         mindstnode = -1;
1911         mindstimbl = 0;
1912
1913         numnodes = talloc_array_length(ipflags);
1914
1915         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1916         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1917
1918         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1919                 /* Only consider addresses on srcnode. */
1920                 if (tmp_ip->pnn != srcnode) {
1921                         continue;
1922                 }
1923
1924                 /* What is this IP address costing the source node? */
1925                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1926                 srcimbl = candimbl - srcdsum;
1927
1928                 /* Consider this IP address would cost each potential
1929                  * destination node.  Destination nodes are limited to
1930                  * those that are newly healthy, since we don't want
1931                  * to do gratuitous failover of IPs just to make minor
1932                  * balance improvements.
1933                  */
1934                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1935                         if (!rebalance_candidates[dstnode]) {
1936                                 continue;
1937                         }
1938
1939                         /* only check nodes that can actually takeover this ip */
1940                         if (!can_node_takeover_ip(ctdb, dstnode,
1941                                                   ipflags[dstnode], tmp_ip)) {
1942                                 /* no it couldnt   so skip to the next node */
1943                                 continue;
1944                         }
1945
1946                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1947                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1948                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1949                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1950                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1951                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1952
1953                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1954                             ((mindstnode == -1) ||                              \
1955                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1956
1957                                 minip = tmp_ip;
1958                                 minsrcimbl = srcimbl;
1959                                 mindstnode = dstnode;
1960                                 mindstimbl = dstimbl;
1961                         }
1962                 }
1963         }
1964         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1965
1966         if (mindstnode != -1) {
1967                 /* We found a move that makes things better... */
1968                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1969                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1970                                   ctdb_addr_to_str(&(minip->addr)),
1971                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1972
1973
1974                 lcp2_imbalances[srcnode] = srcimbl;
1975                 lcp2_imbalances[mindstnode] = mindstimbl;
1976                 minip->pnn = mindstnode;
1977
1978                 return true;
1979         }
1980
1981         return false;
1982         
1983 }
1984
/* Pairs a node number with its LCP2 imbalance so the two can be
 * sorted together.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;     /* imbalance metric of the node */
        int pnn;                /* node number */
};

/* qsort() comparison function ordering lcp2_imbalance_pnn entries by
 * imbalance, largest first.  Entries with equal imbalance compare
 * equal; pnn is not consulted.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        /* Larger imbalance sorts towards the front. */
        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2003
2004 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2005  * node with the highest LCP2 imbalance, and then determines the best
2006  * IP/destination node combination to move from the source node.
2007  */
2008 static void lcp2_failback(struct ctdb_context *ctdb,
2009                           struct ctdb_ipflags *ipflags,
2010                           struct ctdb_public_ip_list *all_ips,
2011                           uint32_t *lcp2_imbalances,
2012                           bool *rebalance_candidates)
2013 {
2014         int i, num_rebalance_candidates, numnodes;
2015         struct lcp2_imbalance_pnn * lips;
2016         bool again;
2017
2018         numnodes = talloc_array_length(ipflags);
2019
2020 try_again:
2021
2022         /* It is only worth continuing if we have suitable target
2023          * nodes to transfer IPs to.  This check is much cheaper than
2024          * continuing on...
2025          */
2026         num_rebalance_candidates = 0;
2027         for (i=0; i<numnodes; i++) {
2028                 if (rebalance_candidates[i]) {
2029                         num_rebalance_candidates++;
2030                 }
2031         }
2032         if (num_rebalance_candidates == 0) {
2033                 return;
2034         }
2035
2036         /* Put the imbalances and nodes into an array, sort them and
2037          * iterate through candidates.  Usually the 1st one will be
2038          * used, so this doesn't cost much...
2039          */
2040         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2041         for (i=0; i<numnodes; i++) {
2042                 lips[i].imbalance = lcp2_imbalances[i];
2043                 lips[i].pnn = i;
2044         }
2045         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2046               lcp2_cmp_imbalance_pnn);
2047
2048         again = false;
2049         for (i=0; i<numnodes; i++) {
2050                 /* This means that all nodes had 0 or 1 addresses, so
2051                  * can't be imbalanced.
2052                  */
2053                 if (lips[i].imbalance == 0) {
2054                         break;
2055                 }
2056
2057                 if (lcp2_failback_candidate(ctdb,
2058                                             ipflags,
2059                                             all_ips,
2060                                             lips[i].pnn,
2061                                             lips[i].imbalance,
2062                                             lcp2_imbalances,
2063                                             rebalance_candidates)) {
2064                         again = true;
2065                         break;
2066                 }
2067         }
2068
2069         talloc_free(lips);
2070         if (again) {
2071                 goto try_again;
2072         }
2073 }
2074
2075 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2076                                     struct ctdb_ipflags *ipflags,
2077                                     struct ctdb_public_ip_list *all_ips)
2078 {
2079         struct ctdb_public_ip_list *tmp_ip;
2080
2081         /* verify that the assigned nodes can serve that public ip
2082            and set it to -1 if not
2083         */
2084         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2085                 if (tmp_ip->pnn == -1) {
2086                         continue;
2087                 }
2088                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2089                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2090                         /* this node can not serve this ip. */
2091                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2092                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2093                                            tmp_ip->pnn));
2094                         tmp_ip->pnn = -1;
2095                 }
2096         }
2097 }
2098
2099 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2100                                        struct ctdb_ipflags *ipflags,
2101                                        struct ctdb_public_ip_list *all_ips)
2102 {
2103         struct ctdb_public_ip_list *tmp_ip;
2104         int i, numnodes;
2105
2106         numnodes = talloc_array_length(ipflags);
2107
2108         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2109        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2110         *  always be allocated the same way for a specific set of
2111         *  available/unavailable nodes.
2112         */
2113
2114         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2115                 tmp_ip->pnn = i % numnodes;
2116         }
2117
2118         /* IP failback doesn't make sense with deterministic
2119          * IPs, since the modulo step above implicitly fails
2120          * back IPs to their "home" node.
2121          */
2122         if (1 == ctdb->tunable.no_ip_failback) {
2123                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2124         }
2125
2126         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2127
2128         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2129
2130         /* No failback here! */
2131 }
2132
2133 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2134                                           struct ctdb_ipflags *ipflags,
2135                                           struct ctdb_public_ip_list *all_ips)
2136 {
2137         /* This should be pushed down into basic_failback. */
2138         struct ctdb_public_ip_list *tmp_ip;
2139         int num_ips = 0;
2140         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2141                 num_ips++;
2142         }
2143
2144         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2145
2146         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2147
2148         /* If we don't want IPs to fail back then don't rebalance IPs. */
2149         if (1 == ctdb->tunable.no_ip_failback) {
2150                 return;
2151         }
2152
2153         /* Now, try to make sure the ip adresses are evenly distributed
2154            across the nodes.
2155         */
2156         basic_failback(ctdb, ipflags, all_ips, num_ips);
2157 }
2158
2159 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2160                           struct ctdb_ipflags *ipflags,
2161                           struct ctdb_public_ip_list *all_ips,
2162                           uint32_t *force_rebalance_nodes)
2163 {
2164         uint32_t *lcp2_imbalances;
2165         bool *rebalance_candidates;
2166
2167         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2168
2169         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2170
2171         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2172                   &lcp2_imbalances, &rebalance_candidates);
2173
2174         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2175
2176         /* If we don't want IPs to fail back then don't rebalance IPs. */
2177         if (1 == ctdb->tunable.no_ip_failback) {
2178                 goto finished;
2179         }
2180
2181         /* Now, try to make sure the ip adresses are evenly distributed
2182            across the nodes.
2183         */
2184         lcp2_failback(ctdb, ipflags, all_ips,
2185                       lcp2_imbalances, rebalance_candidates);
2186
2187 finished:
2188         talloc_free(tmp_ctx);
2189 }
2190
2191 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2192 {
2193         int i, num_healthy;
2194
2195         /* Count how many completely healthy nodes we have */
2196         num_healthy = 0;
2197         for (i=0;i<nodemap->num;i++) {
2198                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2199                         num_healthy++;
2200                 }
2201         }
2202
2203         return num_healthy == 0;
2204 }
2205
2206 /* The calculation part of the IP allocation algorithm. */
2207 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2208                                    struct ctdb_ipflags *ipflags,
2209                                    struct ctdb_public_ip_list **all_ips_p,
2210                                    uint32_t *force_rebalance_nodes)
2211 {
2212         /* since nodes only know about those public addresses that
2213            can be served by that particular node, no single node has
2214            a full list of all public addresses that exist in the cluster.
2215            Walk over all node structures and create a merged list of
2216            all public addresses that exist in the cluster.
2217
2218            keep the tree of ips around as ctdb->ip_tree
2219         */
2220         *all_ips_p = create_merged_ip_list(ctdb);
2221
2222         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2223                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2224         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2225                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2226         } else {
2227                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2228         }
2229
2230         /* at this point ->pnn is the node which will own each IP
2231            or -1 if there is no node that can cover this ip
2232         */
2233
2234         return;
2235 }
2236
/* State shared between get_tunable_from_nodes() and its callbacks. */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages. */
        const char *tunable;
        /* Result array; the callback stores a node's value at index pnn. */
        uint32_t *out;
        /* Set by the callbacks when the result must not be used. */
        bool fatal;
};
2242
2243 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2244                                  int32_t res, TDB_DATA outdata,
2245                                  void *callback)
2246 {
2247         struct get_tunable_callback_data *cd =
2248                 (struct get_tunable_callback_data *)callback;
2249         int size;
2250
2251         if (res != 0) {
2252                 /* Already handled in fail callback */
2253                 return;
2254         }
2255
2256         if (outdata.dsize != sizeof(uint32_t)) {
2257                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2258                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2259                                  (int)outdata.dsize));
2260                 cd->fatal = true;
2261                 return;
2262         }
2263
2264         size = talloc_array_length(cd->out);
2265         if (pnn >= size) {
2266                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2267                                  cd->tunable, pnn, size));
2268                 return;
2269         }
2270
2271                 
2272         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2273 }
2274
2275 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2276                                        int32_t res, TDB_DATA outdata,
2277                                        void *callback)
2278 {
2279         struct get_tunable_callback_data *cd =
2280                 (struct get_tunable_callback_data *)callback;
2281
2282         switch (res) {
2283         case -ETIME:
2284                 DEBUG(DEBUG_ERR,
2285                       ("Timed out getting tunable \"%s\" from node %d\n",
2286                        cd->tunable, pnn));
2287                 cd->fatal = true;
2288                 break;
2289         case -EINVAL:
2290         case -1:
2291                 DEBUG(DEBUG_WARNING,
2292                       ("Tunable \"%s\" not implemented on node %d\n",
2293                        cd->tunable, pnn));
2294                 break;
2295         default:
2296                 DEBUG(DEBUG_ERR,
2297                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2298                        cd->tunable, pnn));
2299                 cd->fatal = true;
2300         }
2301 }
2302
2303 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2304                                         TALLOC_CTX *tmp_ctx,
2305                                         struct ctdb_node_map *nodemap,
2306                                         const char *tunable,
2307                                         uint32_t default_value)
2308 {
2309         TDB_DATA data;
2310         struct ctdb_control_get_tunable *t;
2311         uint32_t *nodes;
2312         uint32_t *tvals;
2313         struct get_tunable_callback_data callback_data;
2314         int i;
2315
2316         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2317         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2318         for (i=0; i<nodemap->num; i++) {
2319                 tvals[i] = default_value;
2320         }
2321                 
2322         callback_data.out = tvals;
2323         callback_data.tunable = tunable;
2324         callback_data.fatal = false;
2325
2326         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2327         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2328         t = (struct ctdb_control_get_tunable *)data.dptr;
2329         t->length = strlen(tunable)+1;
2330         memcpy(t->name, tunable, t->length);
2331         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2332         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2333                                       nodes, 0, TAKEOVER_TIMEOUT(),
2334                                       false, data,
2335                                       get_tunable_callback,
2336                                       get_tunable_fail_callback,
2337                                       &callback_data) != 0) {
2338                 if (callback_data.fatal) {
2339                         talloc_free(tvals);
2340                         tvals = NULL;
2341                 }
2342         }
2343         talloc_free(nodes);
2344         talloc_free(data.dptr);
2345
2346         return tvals;
2347 }
2348
/* State shared between the runstate callbacks. */
struct get_runstate_callback_data {
        /* Result array; the callback stores a node's runstate at index pnn. */
        enum ctdb_runstate *out;
        /* Set by the callbacks when the result must not be used. */
        bool fatal;
};
2353
2354 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2355                                   int32_t res, TDB_DATA outdata,
2356                                   void *callback_data)
2357 {
2358         struct get_runstate_callback_data *cd =
2359                 (struct get_runstate_callback_data *)callback_data;
2360         int size;
2361
2362         if (res != 0) {
2363                 /* Already handled in fail callback */
2364                 return;
2365         }
2366
2367         if (outdata.dsize != sizeof(uint32_t)) {
2368                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2369                                  pnn, (int)sizeof(uint32_t),
2370                                  (int)outdata.dsize));
2371                 cd->fatal = true;
2372                 return;
2373         }
2374
2375         size = talloc_array_length(cd->out);
2376         if (pnn >= size) {
2377                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2378                                  pnn, size));
2379                 return;
2380         }
2381
2382         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2383 }
2384
2385 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2386                                        int32_t res, TDB_DATA outdata,
2387                                        void *callback)
2388 {
2389         struct get_runstate_callback_data *cd =
2390                 (struct get_runstate_callback_data *)callback;
2391
2392         switch (res) {
2393         case -ETIME:
2394                 DEBUG(DEBUG_ERR,
2395                       ("Timed out getting runstate from node %d\n", pnn));
2396                 cd->fatal = true;
2397                 break;
2398         default:
2399                 DEBUG(DEBUG_WARNING,
2400                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2401                        pnn));
2402         }
2403 }
2404
2405 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2406                                                     TALLOC_CTX *tmp_ctx,
2407                                                     struct ctdb_node_map *nodemap,
2408                                                     enum ctdb_runstate default_value)
2409 {
2410         uint32_t *nodes;
2411         enum ctdb_runstate *rs;
2412         struct get_runstate_callback_data callback_data;
2413         int i;
2414
2415         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2416         CTDB_NO_MEMORY_NULL(ctdb, rs);
2417         for (i=0; i<nodemap->num; i++) {
2418                 rs[i] = default_value;
2419         }
2420
2421         callback_data.out = rs;
2422         callback_data.fatal = false;
2423
2424         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2425         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2426                                       nodes, 0, TAKEOVER_TIMEOUT(),
2427                                       true, tdb_null,
2428                                       get_runstate_callback,
2429                                       get_runstate_fail_callback,
2430                                       &callback_data) != 0) {
2431                 if (callback_data.fatal) {
2432                         free(rs);
2433                         rs = NULL;
2434                 }
2435         }
2436         talloc_free(nodes);
2437
2438         return rs;
2439 }
2440
2441 /* Set internal flags for IP allocation:
2442  *   Clear ip flags
2443  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2444  *   Set NOIPHOST ip flag for each INACTIVE node
2445  *   if all nodes are disabled:
2446  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2447  *   else
2448  *     Set NOIPHOST ip flags for disabled nodes
2449  */
2450 static struct ctdb_ipflags *
2451 set_ipflags_internal(struct ctdb_context *ctdb,
2452                      TALLOC_CTX *tmp_ctx,
2453                      struct ctdb_node_map *nodemap,
2454                      uint32_t *tval_noiptakeover,
2455                      uint32_t *tval_noiphostonalldisabled,
2456                      enum ctdb_runstate *runstate)
2457 {
2458         int i;
2459         struct ctdb_ipflags *ipflags;
2460
2461         /* Clear IP flags - implicit due to talloc_zero */
2462         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2463         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2464
2465         for (i=0;i<nodemap->num;i++) {
2466                 /* Can not take IPs on node with NoIPTakeover set */
2467                 if (tval_noiptakeover[i] != 0) {
2468                         ipflags[i].noiptakeover = true;
2469                 }
2470
2471                 /* Can not host IPs on node not in RUNNING state */
2472                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2473                         ipflags[i].noiphost = true;
2474                         continue;
2475                 }
2476                 /* Can not host IPs on INACTIVE node */
2477                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2478                         ipflags[i].noiphost = true;
2479                 }
2480         }
2481
2482         if (all_nodes_are_disabled(nodemap)) {
2483                 /* If all nodes are disabled, can not host IPs on node
2484                  * with NoIPHostOnAllDisabled set
2485                  */
2486                 for (i=0;i<nodemap->num;i++) {
2487                         if (tval_noiphostonalldisabled[i] != 0) {
2488                                 ipflags[i].noiphost = true;
2489                         }
2490                 }
2491         } else {
2492                 /* If some nodes are not disabled, then can not host
2493                  * IPs on DISABLED node
2494                  */
2495                 for (i=0;i<nodemap->num;i++) {
2496                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2497                                 ipflags[i].noiphost = true;
2498                         }
2499                 }
2500         }
2501
2502         return ipflags;
2503 }
2504
2505 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2506                                         TALLOC_CTX *tmp_ctx,
2507                                         struct ctdb_node_map *nodemap)
2508 {
2509         uint32_t *tval_noiptakeover;
2510         uint32_t *tval_noiphostonalldisabled;
2511         struct ctdb_ipflags *ipflags;
2512         enum ctdb_runstate *runstate;
2513
2514
2515         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2516                                                    "NoIPTakeover", 0);
2517         if (tval_noiptakeover == NULL) {
2518                 return NULL;
2519         }
2520
2521         tval_noiphostonalldisabled =
2522                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2523                                        "NoIPHostOnAllDisabled", 0);
2524         if (tval_noiphostonalldisabled == NULL) {
2525                 /* Caller frees tmp_ctx */
2526                 return NULL;
2527         }
2528
2529         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2530          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2531          * reasonable behaviour on a mixed cluster during upgrade.
2532          */
2533         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2534                                            CTDB_RUNSTATE_RUNNING);
2535         if (runstate == NULL) {
2536                 /* Caller frees tmp_ctx */
2537                 return NULL;
2538         }
2539
2540         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2541                                        tval_noiptakeover,
2542                                        tval_noiphostonalldisabled,
2543                                        runstate);
2544
2545         talloc_free(tval_noiptakeover);
2546         talloc_free(tval_noiphostonalldisabled);
2547         talloc_free(runstate);
2548
2549         return ipflags;
2550 }
2551
2552 struct iprealloc_callback_data {
2553         bool *retry_nodes;
2554         int retry_count;
2555         client_async_callback fail_callback;
2556         void *fail_callback_data;
2557         struct ctdb_node_map *nodemap;
2558 };
2559
2560 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2561                                         int32_t res, TDB_DATA outdata,
2562                                         void *callback)
2563 {
2564         int numnodes;
2565         struct iprealloc_callback_data *cd =
2566                 (struct iprealloc_callback_data *)callback;
2567
2568         switch (res) {
2569         case -ETIME:
2570                 /* If the control timed out then that's a real error,
2571                  * so call the real fail callback
2572                  */
2573                 cd->fail_callback(ctdb, pnn, res, outdata,
2574                                   cd->fail_callback_data);
2575                 break;
2576         default:
2577                 /* If not a timeout then either the ipreallocated
2578                  * eventscript (or some setup) failed.  This might
2579                  * have failed because the IPREALLOCATED control isn't
2580                  * implemented - right now there is no way of knowing
2581                  * because the error codes are all folded down to -1.
2582                  * Consider retrying using EVENTSCRIPT control...
2583                  */
2584
2585                 numnodes = talloc_array_length(cd->retry_nodes);
2586                 if (pnn > numnodes) {
2587                         DEBUG(DEBUG_ERR,
2588                               ("ipreallocated failure from node %d, but only %d nodes in nodemap\n",
2589                                pnn, numnodes));
2590                         return;
2591                 }
2592
2593                 /* Can't run the "ipreallocated" event on a INACTIVE node */
2594                 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2595                         DEBUG(DEBUG_ERR,
2596                               ("ipreallocated failure from node %d, but node is inactive - not flagging a retry\n",
2597                                pnn));
2598                         return;
2599                 }
2600
2601                 DEBUG(DEBUG_WARNING,
2602                       ("ipreallocated failure from node %d, flagging retry\n",
2603                        pnn));
2604                 cd->retry_nodes[pnn] = true;
2605                 cd->retry_count++;
2606         }
2607 }
2608
2609 struct takeover_callback_data {
2610         bool *node_failed;
2611         client_async_callback fail_callback;
2612         void *fail_callback_data;
2613         struct ctdb_node_map *nodemap;
2614 };
2615
2616 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2617                                        uint32_t node_pnn, int32_t res,
2618                                        TDB_DATA outdata, void *callback_data)
2619 {
2620         struct takeover_callback_data *cd =
2621                 talloc_get_type_abort(callback_data,
2622                                       struct takeover_callback_data);
2623         int i;
2624
2625         for (i = 0; i < cd->nodemap->num; i++) {
2626                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2627                         break;
2628                 }
2629         }
2630
2631         if (i == cd->nodemap->num) {
2632                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2633                 return;
2634         }
2635
2636         if (!cd->node_failed[i]) {
2637                 cd->node_failed[i] = true;
2638                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2639                                   cd->fail_callback_data);
2640         }
2641 }
2642
2643 /*
2644   make any IP alias changes for public addresses that are necessary 
2645  */
2646 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2647                       uint32_t *force_rebalance_nodes,
2648                       client_async_callback fail_callback, void *callback_data)
2649 {
2650         int i, j, ret;
2651         struct ctdb_public_ip ip;
2652         struct ctdb_public_ipv4 ipv4;
2653         uint32_t *nodes;
2654         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2655         TDB_DATA data;
2656         struct timeval timeout;
2657         struct client_async_data *async_data;
2658         struct ctdb_client_control_state *state;
2659         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2660         struct ctdb_ipflags *ipflags;
2661         struct takeover_callback_data *takeover_data;
2662         struct iprealloc_callback_data iprealloc_data;
2663         bool *retry_data;
2664
2665         /*
2666          * ip failover is completely disabled, just send out the 
2667          * ipreallocated event.
2668          */
2669         if (ctdb->tunable.disable_ip_failover != 0) {
2670                 goto ipreallocated;
2671         }
2672
2673         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2674         if (ipflags == NULL) {
2675                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2676                 talloc_free(tmp_ctx);
2677                 return -1;
2678         }
2679
2680         ZERO_STRUCT(ip);
2681
2682         /* Do the IP reassignment calculations */
2683         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2684
2685         /* Now tell all nodes to release any public IPs should not
2686          * host.  This will be a NOOP on nodes that don't currently
2687          * hold the given IP.
2688          */
2689         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2690         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2691
2692         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2693                                                        bool, nodemap->num);
2694         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2695         takeover_data->fail_callback = fail_callback;
2696         takeover_data->fail_callback_data = callback_data;
2697         takeover_data->nodemap = nodemap;
2698
2699         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2700         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2701
2702         async_data->fail_callback = takeover_run_fail_callback;
2703         async_data->callback_data = takeover_data;
2704
2705         for (i=0;i<nodemap->num;i++) {
2706                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2707                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2708                         continue;
2709                 }
2710
2711                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2712                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2713                                 /* This node should be serving this
2714                                    vnn so dont tell it to release the ip
2715                                 */
2716                                 continue;
2717                         }
2718                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2719                                 ipv4.pnn = tmp_ip->pnn;
2720                                 ipv4.sin = tmp_ip->addr.ip;
2721
2722                                 timeout = TAKEOVER_TIMEOUT();
2723                                 data.dsize = sizeof(ipv4);
2724                                 data.dptr  = (uint8_t *)&ipv4;
2725                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2726                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2727                                                 data, async_data,
2728                                                 &timeout, NULL);
2729                         } else {
2730                                 ip.pnn  = tmp_ip->pnn;
2731                                 ip.addr = tmp_ip->addr;
2732
2733                                 timeout = TAKEOVER_TIMEOUT();
2734                                 data.dsize = sizeof(ip);
2735                                 data.dptr  = (uint8_t *)&ip;
2736                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2737                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2738                                                 data, async_data,
2739                                                 &timeout, NULL);
2740                         }
2741
2742                         if (state == NULL) {
2743                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2744                                 talloc_free(tmp_ctx);
2745                                 return -1;
2746                         }
2747                 
2748                         ctdb_client_async_add(async_data, state);
2749                 }
2750         }
2751         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2752                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2753                 talloc_free(tmp_ctx);
2754                 return -1;
2755         }
2756         talloc_free(async_data);
2757
2758
2759         /* tell all nodes to get their own IPs */
2760         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2761         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2762
2763         async_data->fail_callback = fail_callback;
2764         async_data->callback_data = callback_data;
2765
2766         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2767                 if (tmp_ip->pnn == -1) {
2768                         /* this IP won't be taken over */
2769                         continue;
2770                 }
2771
2772                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2773                         ipv4.pnn = tmp_ip->pnn;
2774                         ipv4.sin = tmp_ip->addr.ip;
2775
2776                         timeout = TAKEOVER_TIMEOUT();
2777                         data.dsize = sizeof(ipv4);
2778                         data.dptr  = (uint8_t *)&ipv4;
2779                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2780                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2781                                         data, async_data,
2782                                         &timeout, NULL);
2783                 } else {
2784                         ip.pnn  = tmp_ip->pnn;
2785                         ip.addr = tmp_ip->addr;
2786
2787                         timeout = TAKEOVER_TIMEOUT();
2788                         data.dsize = sizeof(ip);
2789                         data.dptr  = (uint8_t *)&ip;
2790                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2791                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2792                                         data, async_data,
2793                                         &timeout, NULL);
2794                 }
2795                 if (state == NULL) {
2796                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2797                         talloc_free(tmp_ctx);
2798                         return -1;
2799                 }
2800                 
2801                 ctdb_client_async_add(async_data, state);
2802         }
2803         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2804                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2805                 talloc_free(tmp_ctx);
2806                 return -1;
2807         }
2808
2809 ipreallocated:
2810         /* 
2811          * Tell all nodes to run eventscripts to process the
2812          * "ipreallocated" event.  This can do a lot of things,
2813          * including restarting services to reconfigure them if public
2814          * IPs have moved.  Once upon a time this event only used to
2815          * update natwg.
2816          */
2817         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2818         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2819         iprealloc_data.retry_nodes = retry_data;
2820         iprealloc_data.retry_count = 0;
2821         iprealloc_data.fail_callback = fail_callback;
2822         iprealloc_data.fail_callback_data = callback_data;
2823         iprealloc_data.nodemap = nodemap;
2824
2825         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2826         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2827                                         nodes, 0, TAKEOVER_TIMEOUT(),
2828                                         false, tdb_null,
2829                                         NULL, iprealloc_fail_callback,
2830                                         &iprealloc_data);
2831         if (ret != 0) {
2832                 /* If the control failed then we should retry to any
2833                  * nodes flagged by iprealloc_fail_callback using the
2834                  * EVENTSCRIPT control.  This is a best-effort at
2835                  * backward compatiblity when running a mixed cluster
2836                  * where some nodes have not yet been upgraded to
2837                  * support the IPREALLOCATED control.
2838                  */
2839                 DEBUG(DEBUG_WARNING,
2840                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2841
2842                 nodes = talloc_array(tmp_ctx, uint32_t,
2843                                      iprealloc_data.retry_count);
2844                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2845
2846                 j = 0;
2847                 for (i=0; i<nodemap->num; i++) {
2848                         if (iprealloc_data.retry_nodes[i]) {
2849                                 nodes[j] = i;
2850                                 j++;
2851                         }
2852                 }
2853
2854                 data.dptr  = discard_const("ipreallocated");
2855                 data.dsize = strlen((char *)data.dptr) + 1; 
2856                 ret = ctdb_client_async_control(ctdb,
2857                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2858                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2859                                                 false, data,
2860                                                 NULL, fail_callback,
2861                                                 callback_data);
2862                 if (ret != 0) {
2863                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2864                 }
2865         }
2866
2867         talloc_free(tmp_ctx);
2868         return ret;
2869 }
2870
2871
2872 /*
2873   destroy a ctdb_client_ip structure
2874  */
2875 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2876 {
2877         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2878                 ctdb_addr_to_str(&ip->addr),
2879                 ntohs(ip->addr.ip.sin_port),
2880                 ip->client_id));
2881
2882         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2883         return 0;
2884 }
2885
2886 /*
2887   called by a client to inform us of a TCP connection that it is managing
2888   that should tickled with an ACK when IP takeover is done
2889   we handle both the old ipv4 style of packets as well as the new ipv4/6
2890   pdus.
2891  */
2892 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2893                                 TDB_DATA indata)
2894 {
2895         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2896         struct ctdb_control_tcp *old_addr = NULL;
2897         struct ctdb_control_tcp_addr new_addr;
2898         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2899         struct ctdb_tcp_list *tcp;
2900         struct ctdb_tcp_connection t;
2901         int ret;
2902         TDB_DATA data;
2903         struct ctdb_client_ip *ip;
2904         struct ctdb_vnn *vnn;
2905         ctdb_sock_addr addr;
2906
2907         switch (indata.dsize) {
2908         case sizeof(struct ctdb_control_tcp):
2909                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2910                 ZERO_STRUCT(new_addr);
2911                 tcp_sock = &new_addr;
2912                 tcp_sock->src.ip  = old_addr->src;
2913                 tcp_sock->dest.ip = old_addr->dest;
2914                 break;
2915         case sizeof(struct ctdb_control_tcp_addr):
2916                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2917                 break;
2918         default:
2919                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2920                                  "to ctdb_control_tcp_client. size was %d but "
2921                                  "only allowed sizes are %lu and %lu\n",
2922                                  (int)indata.dsize,
2923                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2924                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2925                 return -1;
2926         }
2927
2928         addr = tcp_sock->src;
2929         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2930         addr = tcp_sock->dest;
2931         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2932
2933         ZERO_STRUCT(addr);
2934         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2935         vnn = find_public_ip_vnn(ctdb, &addr);
2936         if (vnn == NULL) {
2937                 switch (addr.sa.sa_family) {
2938                 case AF_INET:
2939                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2940                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2941                                         ctdb_addr_to_str(&addr)));
2942                         }
2943                         break;
2944                 case AF_INET6:
2945                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2946                                 ctdb_addr_to_str(&addr)));
2947                         break;
2948                 default:
2949                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2950                 }
2951
2952                 return 0;
2953         }
2954
2955         if (vnn->pnn != ctdb->pnn) {
2956                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2957                         ctdb_addr_to_str(&addr),
2958                         client_id, client->pid));
2959                 /* failing this call will tell smbd to die */
2960                 return -1;
2961         }
2962
2963         ip = talloc(client, struct ctdb_client_ip);
2964         CTDB_NO_MEMORY(ctdb, ip);
2965
2966         ip->ctdb      = ctdb;
2967         ip->addr      = addr;
2968         ip->client_id = client_id;
2969         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2970         DLIST_ADD(ctdb->client_ip_list, ip);
2971
2972         tcp = talloc(client, struct ctdb_tcp_list);
2973         CTDB_NO_MEMORY(ctdb, tcp);
2974
2975         tcp->connection.src_addr = tcp_sock->src;
2976         tcp->connection.dst_addr = tcp_sock->dest;
2977
2978         DLIST_ADD(client->tcp_list, tcp);
2979
2980         t.src_addr = tcp_sock->src;
2981         t.dst_addr = tcp_sock->dest;
2982
2983         data.dptr = (uint8_t *)&t;
2984         data.dsize = sizeof(t);
2985
2986         switch (addr.sa.sa_family) {
2987         case AF_INET:
2988                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2989                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2990                         ctdb_addr_to_str(&tcp_sock->src),
2991                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2992                 break;
2993         case AF_INET6:
2994                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2995                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2996                         ctdb_addr_to_str(&tcp_sock->src),
2997                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2998                 break;
2999         default:
3000                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3001         }
3002
3003
3004         /* tell all nodes about this tcp connection */
3005         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3006                                        CTDB_CONTROL_TCP_ADD,
3007                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3008         if (ret != 0) {
3009                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3010                 return -1;
3011         }
3012
3013         return 0;
3014 }
3015
3016 /*
3017   find a tcp address on a list
3018  */
3019 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3020                                            struct ctdb_tcp_connection *tcp)
3021 {
3022         int i;
3023
3024         if (array == NULL) {
3025                 return NULL;
3026         }
3027
3028         for (i=0;i<array->num;i++) {
3029                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3030                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3031                         return &array->connections[i];
3032                 }
3033         }
3034         return NULL;
3035 }
3036
3037
3038
3039 /*
3040   called by a daemon to inform us of a TCP connection that one of its
3041   clients managing that should tickled with an ACK when IP takeover is
3042   done
3043  */
3044 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3045 {
3046         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3047         struct ctdb_tcp_array *tcparray;
3048         struct ctdb_tcp_connection tcp;
3049         struct ctdb_vnn *vnn;
3050
3051         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3052         if (vnn == NULL) {
3053                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3054                         ctdb_addr_to_str(&p->dst_addr)));
3055
3056                 return -1;
3057         }
3058
3059
3060         tcparray = vnn->tcp_array;
3061
3062         /* If this is the first tickle */
3063         if (tcparray == NULL) {
3064                 tcparray = talloc_size(ctdb->nodes, 
3065                         offsetof(struct ctdb_tcp_array, connections) +
3066                         sizeof(struct ctdb_tcp_connection) * 1);
3067                 CTDB_NO_MEMORY(ctdb, tcparray);
3068                 vnn->tcp_array = tcparray;
3069
3070                 tcparray->num = 0;
3071                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3072                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3073
3074                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3075                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3076                 tcparray->num++;
3077
3078                 if (tcp_update_needed) {
3079                         vnn->tcp_update_needed = true;
3080                 }
3081                 return 0;
3082         }
3083
3084
3085         /* Do we already have this tickle ?*/
3086         tcp.src_addr = p->src_addr;
3087         tcp.dst_addr = p->dst_addr;
3088         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
3089                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3090                         ctdb_addr_to_str(&tcp.dst_addr),
3091                         ntohs(tcp.dst_addr.ip.sin_port),
3092                         vnn->pnn));
3093                 return 0;
3094         }
3095
3096         /* A new tickle, we must add it to the array */
3097         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3098                                         struct ctdb_tcp_connection,
3099                                         tcparray->num+1);
3100         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3101
3102         vnn->tcp_array = tcparray;
3103         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3104         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3105         tcparray->num++;
3106                                 
3107         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3108                 ctdb_addr_to_str(&tcp.dst_addr),
3109                 ntohs(tcp.dst_addr.ip.sin_port),
3110                 vnn->pnn));
3111
3112         if (tcp_update_needed) {
3113                 vnn->tcp_update_needed = true;
3114         }
3115
3116         return 0;
3117 }
3118
3119
3120 /*
3121   called by a daemon to inform us of a TCP connection that one of its
3122   clients managing that should tickled with an ACK when IP takeover is
3123   done
3124  */
3125 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3126 {
3127         struct ctdb_tcp_connection *tcpp;
3128         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3129
3130         if (vnn == NULL) {
3131                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3132                         ctdb_addr_to_str(&conn->dst_addr)));
3133                 return;
3134         }
3135
3136         /* if the array is empty we cant remove it
3137            and we dont need to do anything
3138          */
3139         if (vnn->tcp_array == NULL) {
3140                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3141                         ctdb_addr_to_str(&conn->dst_addr),
3142                         ntohs(conn->dst_addr.ip.sin_port)));
3143                 return;
3144         }
3145
3146
3147         /* See if we know this connection
3148            if we dont know this connection  then we dont need to do anything
3149          */
3150         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3151         if (tcpp == NULL) {
3152                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3153                         ctdb_addr_to_str(&conn->dst_addr),
3154                         ntohs(conn->dst_addr.ip.sin_port)));
3155                 return;
3156         }
3157
3158
3159         /* We need to remove this entry from the array.
3160            Instead of allocating a new array and copying data to it
3161            we cheat and just copy the last entry in the existing array
3162            to the entry that is to be removed and just shring the 
3163            ->num field
3164          */
3165         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3166         vnn->tcp_array->num--;
3167
3168         /* If we deleted the last entry we also need to remove the entire array
3169          */
3170         if (vnn->tcp_array->num == 0) {
3171                 talloc_free(vnn->tcp_array);
3172                 vnn->tcp_array = NULL;
3173         }               
3174
3175         vnn->tcp_update_needed = true;
3176
3177         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3178                 ctdb_addr_to_str(&conn->src_addr),
3179                 ntohs(conn->src_addr.ip.sin_port)));
3180 }
3181
3182
3183 /*
3184   called by a daemon to inform us of a TCP connection that one of its
3185   clients used are no longer needed in the tickle database
3186  */
3187 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3188 {
3189         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3190
3191         ctdb_remove_tcp_connection(ctdb, conn);
3192
3193         return 0;
3194 }
3195
3196
/*
  called when a daemon restarts - the intention is to send all tickles
  for all public addresses we are serving to the new node immediately.

  Currently a stub: nothing is sent and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/* XXX here we should send all tickles we are serving to the new node */
	return 0;
}
3206
3207
3208 /*
3209   called when a client structure goes away - hook to remove
3210   elements from the tcp_list in all daemons
3211  */
3212 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3213 {
3214         while (client->tcp_list) {
3215                 struct ctdb_tcp_list *tcp = client->tcp_list;
3216                 DLIST_REMOVE(client->tcp_list, tcp);
3217                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3218         }
3219 }
3220
3221
3222 /*
3223   release all IPs on shutdown
3224  */
3225 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3226 {
3227         struct ctdb_vnn *vnn;
3228         int count = 0;
3229
3230         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3231                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3232                         ctdb_vnn_unassign_iface(ctdb, vnn);
3233                         continue;
3234                 }
3235                 if (!vnn->iface) {
3236                         continue;
3237                 }
3238
3239                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3240                                     ctdb_addr_to_str(&vnn->public_address),
3241                                     vnn->public_netmask_bits,
3242                                     ctdb_vnn_iface_string(vnn)));
3243
3244                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3245                                   ctdb_vnn_iface_string(vnn),
3246                                   ctdb_addr_to_str(&vnn->public_address),
3247                                   vnn->public_netmask_bits);
3248                 release_kill_clients(ctdb, &vnn->public_address);
3249                 ctdb_vnn_unassign_iface(ctdb, vnn);
3250                 count++;
3251         }
3252
3253         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3254 }
3255
3256
3257 /*
3258   get list of public IPs
3259  */
3260 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3261                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3262 {
3263         int i, num, len;
3264         struct ctdb_all_public_ips *ips;
3265         struct ctdb_vnn *vnn;
3266         bool only_available = false;
3267
3268         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3269                 only_available = true;
3270         }
3271
3272         /* count how many public ip structures we have */
3273         num = 0;
3274         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3275                 num++;
3276         }
3277
3278         len = offsetof(struct ctdb_all_public_ips, ips) + 
3279                 num*sizeof(struct ctdb_public_ip);
3280         ips = talloc_zero_size(outdata, len);
3281         CTDB_NO_MEMORY(ctdb, ips);
3282
3283         i = 0;
3284         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3285                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3286                         continue;
3287                 }
3288                 ips->ips[i].pnn  = vnn->pnn;
3289                 ips->ips[i].addr = vnn->public_address;
3290                 i++;
3291         }
3292         ips->num = i;
3293         len = offsetof(struct ctdb_all_public_ips, ips) +
3294                 i*sizeof(struct ctdb_public_ip);
3295
3296         outdata->dsize = len;
3297         outdata->dptr  = (uint8_t *)ips;
3298
3299         return 0;
3300 }
3301
3302
3303 /*
3304   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3305  */
3306 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3307                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3308 {
3309         int i, num, len;
3310         struct ctdb_all_public_ipsv4 *ips;
3311         struct ctdb_vnn *vnn;
3312
3313         /* count how many public ip structures we have */
3314         num = 0;
3315         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3316                 if (vnn->public_address.sa.sa_family != AF_INET) {
3317                         continue;
3318                 }
3319                 num++;
3320         }
3321
3322         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3323                 num*sizeof(struct ctdb_public_ipv4);
3324         ips = talloc_zero_size(outdata, len);
3325         CTDB_NO_MEMORY(ctdb, ips);
3326
3327         outdata->dsize = len;
3328         outdata->dptr  = (uint8_t *)ips;
3329
3330         ips->num = num;
3331         i = 0;
3332         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3333                 if (vnn->public_address.sa.sa_family != AF_INET) {
3334                         continue;
3335                 }
3336                 ips->ips[i].pnn = vnn->pnn;
3337                 ips->ips[i].sin = vnn->public_address.ip;
3338                 i++;
3339         }
3340
3341         return 0;
3342 }
3343
3344 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3345                                         struct ctdb_req_control *c,
3346                                         TDB_DATA indata,
3347                                         TDB_DATA *outdata)
3348 {
3349         int i, num, len;
3350         ctdb_sock_addr *addr;
3351         struct ctdb_control_public_ip_info *info;
3352         struct ctdb_vnn *vnn;
3353
3354         addr = (ctdb_sock_addr *)indata.dptr;
3355
3356         vnn = find_public_ip_vnn(ctdb, addr);
3357         if (vnn == NULL) {
3358                 /* if it is not a public ip   it could be our 'single ip' */
3359                 if (ctdb->single_ip_vnn) {
3360                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3361                                 vnn = ctdb->single_ip_vnn;
3362                         }
3363                 }
3364         }
3365         if (vnn == NULL) {
3366                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3367                                  "'%s'not a public address\n",
3368                                  ctdb_addr_to_str(addr)));
3369                 return -1;
3370         }
3371
3372         /* count how many public ip structures we have */
3373         num = 0;
3374         for (;vnn->ifaces[num];) {
3375                 num++;
3376         }
3377
3378         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3379                 num*sizeof(struct ctdb_control_iface_info);
3380         info = talloc_zero_size(outdata, len);
3381         CTDB_NO_MEMORY(ctdb, info);
3382
3383         info->ip.addr = vnn->public_address;
3384         info->ip.pnn = vnn->pnn;
3385         info->active_idx = 0xFFFFFFFF;
3386
3387         for (i=0; vnn->ifaces[i]; i++) {
3388                 struct ctdb_iface *cur;
3389
3390                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3391                 if (cur == NULL) {
3392                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3393                                            vnn->ifaces[i]));
3394                         return -1;
3395                 }
3396                 if (vnn->iface == cur) {
3397                         info->active_idx = i;
3398                 }
3399                 strcpy(info->ifaces[i].name, cur->name);
3400                 info->ifaces[i].link_state = cur->link_up;
3401                 info->ifaces[i].references = cur->references;
3402         }
3403         info->num = i;
3404         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3405                 i*sizeof(struct ctdb_control_iface_info);
3406
3407         outdata->dsize = len;
3408         outdata->dptr  = (uint8_t *)info;
3409
3410         return 0;
3411 }
3412
3413 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3414                                 struct ctdb_req_control *c,
3415                                 TDB_DATA *outdata)
3416 {
3417         int i, num, len;
3418         struct ctdb_control_get_ifaces *ifaces;
3419         struct ctdb_iface *cur;
3420
3421         /* count how many public ip structures we have */
3422         num = 0;
3423         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3424                 num++;
3425         }
3426
3427         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3428                 num*sizeof(struct ctdb_control_iface_info);
3429         ifaces = talloc_zero_size(outdata, len);
3430         CTDB_NO_MEMORY(ctdb, ifaces);
3431
3432         i = 0;
3433         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3434                 strcpy(ifaces->ifaces[i].name, cur->name);
3435                 ifaces->ifaces[i].link_state = cur->link_up;
3436                 ifaces->ifaces[i].references = cur->references;
3437                 i++;
3438         }
3439         ifaces->num = i;
3440         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3441                 i*sizeof(struct ctdb_control_iface_info);
3442
3443         outdata->dsize = len;
3444         outdata->dptr  = (uint8_t *)ifaces;
3445
3446         return 0;
3447 }
3448
3449 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3450                                     struct ctdb_req_control *c,
3451                                     TDB_DATA indata)
3452 {
3453         struct ctdb_control_iface_info *info;
3454         struct ctdb_iface *iface;
3455         bool link_up = false;
3456
3457         info = (struct ctdb_control_iface_info *)indata.dptr;
3458
3459         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3460                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3461                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3462                                   len, len, info->name));
3463                 return -1;
3464         }
3465
3466         switch (info->link_state) {
3467         case 0:
3468                 link_up = false;
3469                 break;
3470         case 1:
3471                 link_up = true;
3472                 break;
3473         default:
3474                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3475                                   (unsigned int)info->link_state));
3476                 return -1;
3477         }
3478
3479         if (info->references != 0) {
3480                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3481                                   (unsigned int)info->references));
3482                 return -1;
3483         }
3484
3485         iface = ctdb_find_iface(ctdb, info->name);
3486         if (iface == NULL) {
3487                 return -1;
3488         }
3489
3490         if (link_up == iface->link_up) {
3491                 return 0;
3492         }
3493
3494         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3495               ("iface[%s] has changed it's link status %s => %s\n",
3496                iface->name,
3497                iface->link_up?"up":"down",
3498                link_up?"up":"down"));
3499
3500         iface->link_up = link_up;
3501         return 0;
3502 }
3503
3504
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill.

   One instance is created per public address (vnn) the first time a
   connection on that address is sentenced; it is a talloc child of the
   vnn and is referenced from vnn->killtcp.
*/
struct ctdb_kill_tcp {
	/* public address this structure belongs to */
	struct ctdb_vnn *vnn;
	/* daemon context, used to reach the event context and vnn list */
	struct ctdb_context *ctdb;
	/* capture socket; -1 until the socket has been opened */
	int capture_fd;
	/* read event on capture_fd; NULL until it has been set up */
	struct fd_event *fde;
	/* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
	trbt_tree_t *connections;
	/* opaque data filled in by ctdb_sys_open_capture_socket() and
	   handed back to ctdb_sys_read_tcp_packet() */
	void *private_data;
};
3517
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
	/* canonicalized source address of the connection */
	ctdb_sock_addr src_addr;
	/* canonicalized destination address of the connection */
	ctdb_sock_addr dst_addr;
	/* number of repeated tickles sent so far; the connection is
	   given up once this reaches 5 */
	int count;
	/* the per-address structure this connection is queued on */
	struct ctdb_kill_tcp *killtcp;
};
3527
3528 /* this function is used to create a key to represent this socketpair
3529    in the killtcp tree.
3530    this key is used to insert and lookup matching socketpairs that are
3531    to be tickled and RST
3532 */
3533 #define KILLTCP_KEYLEN  10
3534 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3535 {
3536         static uint32_t key[KILLTCP_KEYLEN];
3537
3538         bzero(key, sizeof(key));
3539
3540         if (src->sa.sa_family != dst->sa.sa_family) {
3541                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3542                 return key;
3543         }
3544         
3545         switch (src->sa.sa_family) {
3546         case AF_INET:
3547                 key[0]  = dst->ip.sin_addr.s_addr;
3548                 key[1]  = src->ip.sin_addr.s_addr;
3549                 key[2]  = dst->ip.sin_port;
3550                 key[3]  = src->ip.sin_port;
3551                 break;
3552         case AF_INET6: {
3553                 uint32_t *dst6_addr32 =
3554                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3555                 uint32_t *src6_addr32 =
3556                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3557                 key[0]  = dst6_addr32[3];
3558                 key[1]  = src6_addr32[3];
3559                 key[2]  = dst6_addr32[2];
3560                 key[3]  = src6_addr32[2];
3561                 key[4]  = dst6_addr32[1];
3562                 key[5]  = src6_addr32[1];
3563                 key[6]  = dst6_addr32[0];
3564                 key[7]  = src6_addr32[0];
3565                 key[8]  = dst->ip6.sin6_port;
3566                 key[9]  = src->ip6.sin6_port;
3567                 break;
3568         }
3569         default:
3570                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3571                 return key;
3572         }
3573
3574         return key;
3575 }
3576
3577 /*
3578   called when we get a read event on the raw socket
3579  */
3580 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3581                                 uint16_t flags, void *private_data)
3582 {
3583         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3584         struct ctdb_killtcp_con *con;
3585         ctdb_sock_addr src, dst;
3586         uint32_t ack_seq, seq;
3587
3588         if (!(flags & EVENT_FD_READ)) {
3589                 return;
3590         }
3591
3592         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3593                                 killtcp->private_data,
3594                                 &src, &dst,
3595                                 &ack_seq, &seq) != 0) {
3596                 /* probably a non-tcp ACK packet */
3597                 return;
3598         }
3599
3600         /* check if we have this guy in our list of connections
3601            to kill
3602         */
3603         con = trbt_lookuparray32(killtcp->connections, 
3604                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3605         if (con == NULL) {
3606                 /* no this was some other packet we can just ignore */
3607                 return;
3608         }
3609
3610         /* This one has been tickled !
3611            now reset him and remove him from the list.
3612          */
3613         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3614                 ntohs(con->dst_addr.ip.sin_port),
3615                 ctdb_addr_to_str(&con->src_addr),
3616                 ntohs(con->src_addr.ip.sin_port)));
3617
3618         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3619         talloc_free(con);
3620 }
3621
3622
3623 /* when traversing the list of all tcp connections to send tickle acks to
3624    (so that we can capture the ack coming back and kill the connection
3625     by a RST)
3626    this callback is called for each connection we are currently trying to kill
3627 */
3628 static int tickle_connection_traverse(void *param, void *data)
3629 {
3630         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3631
3632         /* have tried too many times, just give up */
3633         if (con->count >= 5) {
3634                 /* can't delete in traverse: reparent to delete_cons */
3635                 talloc_steal(param, con);
3636                 return 0;
3637         }
3638
3639         /* othervise, try tickling it again */
3640         con->count++;
3641         ctdb_sys_send_tcp(
3642                 (ctdb_sock_addr *)&con->dst_addr,
3643                 (ctdb_sock_addr *)&con->src_addr,
3644                 0, 0, 0);
3645         return 0;
3646 }
3647
3648
3649 /* 
3650    called every second until all sentenced connections have been reset
3651  */
3652 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3653                                               struct timeval t, void *private_data)
3654 {
3655         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3656         void *delete_cons = talloc_new(NULL);
3657
3658         /* loop over all connections sending tickle ACKs */
3659         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3660
3661         /* now we've finished traverse, it's safe to do deletion. */
3662         talloc_free(delete_cons);
3663
3664         /* If there are no more connections to kill we can remove the
3665            entire killtcp structure
3666          */
3667         if ( (killtcp->connections == NULL) || 
3668              (killtcp->connections->root == NULL) ) {
3669                 talloc_free(killtcp);
3670                 return;
3671         }
3672
3673         /* try tickling them again in a seconds time
3674          */
3675         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3676                         ctdb_tickle_sentenced_connections, killtcp);
3677 }
3678
3679 /*
3680   destroy the killtcp structure
3681  */
3682 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3683 {
3684         struct ctdb_vnn *tmpvnn;
3685
3686         /* verify that this vnn is still active */
3687         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3688                 if (tmpvnn == killtcp->vnn) {
3689                         break;
3690                 }
3691         }
3692
3693         if (tmpvnn == NULL) {
3694                 return 0;
3695         }
3696
3697         if (killtcp->vnn->killtcp != killtcp) {
3698                 return 0;
3699         }
3700
3701         killtcp->vnn->killtcp = NULL;
3702
3703         return 0;
3704 }
3705
3706
/* insert callback for the killtcp tree: whatever is already stored
   (data) is ignored and the new connection structure (parm) is
   returned so that it is stored in its place.

   the old structure is deliberately not freed here; per the original
   note it is talloc_stolen by the same tree node and goes away when
   the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3718
3719 /*
3720   add a tcp socket to the list of connections we want to RST
3721  */
3722 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3723                                        ctdb_sock_addr *s,
3724                                        ctdb_sock_addr *d)
3725 {
3726         ctdb_sock_addr src, dst;
3727         struct ctdb_kill_tcp *killtcp;
3728         struct ctdb_killtcp_con *con;
3729         struct ctdb_vnn *vnn;
3730
3731         ctdb_canonicalize_ip(s, &src);
3732         ctdb_canonicalize_ip(d, &dst);
3733
3734         vnn = find_public_ip_vnn(ctdb, &dst);
3735         if (vnn == NULL) {
3736                 vnn = find_public_ip_vnn(ctdb, &src);
3737         }
3738         if (vnn == NULL) {
3739                 /* if it is not a public ip   it could be our 'single ip' */
3740                 if (ctdb->single_ip_vnn) {
3741                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3742                                 vnn = ctdb->single_ip_vnn;
3743                         }
3744                 }
3745         }
3746         if (vnn == NULL) {
3747                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3748                 return -1;
3749         }
3750
3751         killtcp = vnn->killtcp;
3752         
3753         /* If this is the first connection to kill we must allocate
3754            a new structure
3755          */
3756         if (killtcp == NULL) {
3757                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3758                 CTDB_NO_MEMORY(ctdb, killtcp);
3759
3760                 killtcp->vnn         = vnn;
3761                 killtcp->ctdb        = ctdb;
3762                 killtcp->capture_fd  = -1;
3763                 killtcp->connections = trbt_create(killtcp, 0);
3764
3765                 vnn->killtcp         = killtcp;
3766                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3767         }
3768
3769
3770
3771         /* create a structure that describes this connection we want to
3772            RST and store it in killtcp->connections
3773         */
3774         con = talloc(killtcp, struct ctdb_killtcp_con);
3775         CTDB_NO_MEMORY(ctdb, con);
3776         con->src_addr = src;
3777         con->dst_addr = dst;
3778         con->count    = 0;
3779         con->killtcp  = killtcp;
3780
3781
3782         trbt_insertarray32_callback(killtcp->connections,
3783                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3784                         add_killtcp_callback, con);
3785
3786         /* 
3787            If we dont have a socket to listen on yet we must create it
3788          */
3789         if (killtcp->capture_fd == -1) {
3790                 const char *iface = ctdb_vnn_iface_string(vnn);
3791                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3792                 if (killtcp->capture_fd == -1) {
3793                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3794                                           "socket on iface '%s' for killtcp (%s)\n",
3795                                           iface, strerror(errno)));
3796                         goto failed;
3797                 }
3798         }
3799
3800
3801         if (killtcp->fde == NULL) {
3802                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3803                                             EVENT_FD_READ,
3804                                             capture_tcp_handler, killtcp);
3805                 tevent_fd_set_auto_close(killtcp->fde);
3806
3807                 /* We also need to set up some events to tickle all these connections
3808                    until they are all reset
3809                 */
3810                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3811                                 ctdb_tickle_sentenced_connections, killtcp);
3812         }
3813
3814         /* tickle him once now */
3815         ctdb_sys_send_tcp(
3816                 &con->dst_addr,
3817                 &con->src_addr,
3818                 0, 0, 0);
3819
3820         return 0;
3821
3822 failed:
3823         talloc_free(vnn->killtcp);
3824         vnn->killtcp = NULL;
3825         return -1;
3826 }
3827
3828 /*
3829   kill a TCP connection.
3830  */
3831 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3832 {
3833         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3834
3835         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3836 }
3837
3838 /*
3839   called by a daemon to inform us of the entire list of TCP tickles for
3840   a particular public address.
3841   this control should only be sent by the node that is currently serving
3842   that public address.
3843  */
3844 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3845 {
3846         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3847         struct ctdb_tcp_array *tcparray;
3848         struct ctdb_vnn *vnn;
3849
3850         /* We must at least have tickles.num or else we cant verify the size
3851            of the received data blob
3852          */
3853         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3854                                         tickles.connections)) {
3855                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3856                 return -1;
3857         }
3858
3859         /* verify that the size of data matches what we expect */
3860         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3861                                 tickles.connections)
3862                          + sizeof(struct ctdb_tcp_connection)
3863                                  * list->tickles.num) {
3864                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3865                 return -1;
3866         }       
3867
3868         vnn = find_public_ip_vnn(ctdb, &list->addr);
3869         if (vnn == NULL) {
3870                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3871                         ctdb_addr_to_str(&list->addr)));
3872
3873                 return 1;
3874         }
3875
3876         /* remove any old ticklelist we might have */
3877         talloc_free(vnn->tcp_array);
3878         vnn->tcp_array = NULL;
3879
3880         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3881         CTDB_NO_MEMORY(ctdb, tcparray);
3882
3883         tcparray->num = list->tickles.num;
3884
3885         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3886         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3887
3888         memcpy(tcparray->connections, &list->tickles.connections[0], 
3889                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3890
3891         /* We now have a new fresh tickle list array for this vnn */
3892         vnn->tcp_array = talloc_steal(vnn, tcparray);
3893         
3894         return 0;
3895 }
3896
3897 /*
3898   called to return the full list of tickles for the puclic address associated 
3899   with the provided vnn
3900  */
3901 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3902 {
3903         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3904         struct ctdb_control_tcp_tickle_list *list;
3905         struct ctdb_tcp_array *tcparray;
3906         int num;
3907         struct ctdb_vnn *vnn;
3908
3909         vnn = find_public_ip_vnn(ctdb, addr);
3910         if (vnn == NULL) {
3911                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3912                         ctdb_addr_to_str(addr)));
3913
3914                 return 1;
3915         }
3916
3917         tcparray = vnn->tcp_array;
3918         if (tcparray) {
3919                 num = tcparray->num;
3920         } else {
3921                 num = 0;
3922         }
3923
3924         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3925                                 tickles.connections)
3926                         + sizeof(struct ctdb_tcp_connection) * num;
3927
3928         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3929         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3930         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3931
3932         list->addr = *addr;
3933         list->tickles.num = num;
3934         if (num) {
3935                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3936                         sizeof(struct ctdb_tcp_connection) * num);
3937         }
3938
3939         return 0;
3940 }
3941
3942
3943 /*
3944   set the list of all tcp tickles for a public address
3945  */
3946 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3947                               struct timeval timeout, uint32_t destnode, 
3948                               ctdb_sock_addr *addr,
3949                               struct ctdb_tcp_array *tcparray)
3950 {
3951         int ret, num;
3952         TDB_DATA data;
3953         struct ctdb_control_tcp_tickle_list *list;
3954
3955         if (tcparray) {
3956                 num = tcparray->num;
3957         } else {
3958                 num = 0;
3959         }
3960
3961         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3962                                 tickles.connections) +
3963                         sizeof(struct ctdb_tcp_connection) * num;
3964         data.dptr = talloc_size(ctdb, data.dsize);
3965         CTDB_NO_MEMORY(ctdb, data.dptr);
3966
3967         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3968         list->addr = *addr;
3969         list->tickles.num = num;
3970         if (tcparray) {
3971                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3972         }
3973
3974         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3975                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3976                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3977         if (ret != 0) {
3978                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3979                 return -1;
3980         }
3981
3982         talloc_free(data.dptr);
3983
3984         return ret;
3985 }
3986
3987
3988 /*
3989   perform tickle updates if required
3990  */
3991 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3992                                 struct timed_event *te, 
3993                                 struct timeval t, void *private_data)
3994 {
3995         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3996         int ret;
3997         struct ctdb_vnn *vnn;
3998
3999         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4000                 /* we only send out updates for public addresses that 
4001                    we have taken over
4002                  */
4003                 if (ctdb->pnn != vnn->pnn) {
4004                         continue;
4005                 }
4006                 /* We only send out the updates if we need to */
4007                 if (!vnn->tcp_update_needed) {
4008                         continue;
4009                 }
4010                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
4011                                 TAKEOVER_TIMEOUT(),
4012                                 CTDB_BROADCAST_CONNECTED,
4013                                 &vnn->public_address,
4014                                 vnn->tcp_array);
4015                 if (ret != 0) {
4016                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4017                                 ctdb_addr_to_str(&vnn->public_address)));
4018                 }
4019         }
4020
4021         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4022                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4023                              ctdb_update_tcp_tickles, ctdb);
4024 }               
4025         
4026
4027 /*
4028   start periodic update of tcp tickles
4029  */
4030 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4031 {
4032         ctdb->tickle_update_context = talloc_new(ctdb);
4033
4034         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4035                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4036                              ctdb_update_tcp_tickles, ctdb);
4037 }
4038
4039
4040
4041
4042 struct control_gratious_arp {
4043         struct ctdb_context *ctdb;
4044         ctdb_sock_addr addr;
4045         const char *iface;
4046         int count;
4047 };
4048
4049 /*
4050   send a control_gratuitous arp
4051  */
4052 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4053                                   struct timeval t, void *private_data)
4054 {
4055         int ret;
4056         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4057                                                         struct control_gratious_arp);
4058
4059         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4060         if (ret != 0) {
4061                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4062                                  arp->iface, strerror(errno)));
4063         }
4064
4065
4066         arp->count++;
4067         if (arp->count == CTDB_ARP_REPEAT) {
4068                 talloc_free(arp);
4069                 return;
4070         }
4071
4072         event_add_timed(arp->ctdb->ev, arp, 
4073                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4074                         send_gratious_arp, arp);
4075 }
4076
4077
4078 /*
4079   send a gratious arp 
4080  */
4081 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4082 {
4083         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4084         struct control_gratious_arp *arp;
4085
4086         /* verify the size of indata */
4087         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4088                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4089                                  (unsigned)indata.dsize, 
4090                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4091                 return -1;
4092         }
4093         if (indata.dsize != 
4094                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4095                 + gratious_arp->len ) ){
4096
4097                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4098                         "but should be %u bytes\n", 
4099                          (unsigned)indata.dsize, 
4100                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4101                 return -1;
4102         }
4103
4104
4105         arp = talloc(ctdb, struct control_gratious_arp);
4106         CTDB_NO_MEMORY(ctdb, arp);
4107
4108         arp->ctdb  = ctdb;
4109         arp->addr   = gratious_arp->addr;
4110         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4111         CTDB_NO_MEMORY(ctdb, arp->iface);
4112         arp->count = 0;
4113         
4114         event_add_timed(arp->ctdb->ev, arp, 
4115                         timeval_zero(), send_gratious_arp, arp);
4116
4117         return 0;
4118 }
4119
4120 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4121 {
4122         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4123         int ret;
4124
4125         /* verify the size of indata */
4126         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4127                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4128                 return -1;
4129         }
4130         if (indata.dsize != 
4131                 ( offsetof(struct ctdb_control_ip_iface, iface)
4132                 + pub->len ) ){
4133
4134                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4135                         "but should be %u bytes\n", 
4136                          (unsigned)indata.dsize, 
4137                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4138                 return -1;
4139         }
4140
4141         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4142
4143         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4144
4145         if (ret != 0) {
4146                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4147                 return -1;
4148         }
4149
4150         return 0;
4151 }
4152
4153 /*
4154   called when releaseip event finishes for del_public_address
4155  */
4156 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
4157                                 void *private_data)
4158 {
4159         talloc_free(private_data);
4160 }
4161
4162 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4163 {
4164         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4165         struct ctdb_vnn *vnn;
4166         int ret;
4167
4168         /* verify the size of indata */
4169         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4170                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4171                 return -1;
4172         }
4173         if (indata.dsize != 
4174                 ( offsetof(struct ctdb_control_ip_iface, iface)
4175                 + pub->len ) ){
4176
4177                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4178                         "but should be %u bytes\n", 
4179                          (unsigned)indata.dsize, 
4180                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4181                 return -1;
4182         }
4183
4184         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4185
4186         /* walk over all public addresses until we find a match */
4187         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4188                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4189                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4190
4191                         DLIST_REMOVE(ctdb->vnn, vnn);
4192                         talloc_steal(mem_ctx, vnn);
4193                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
4194                         if (vnn->pnn != ctdb->pnn) {
4195                                 if (vnn->iface != NULL) {
4196                                         ctdb_vnn_unassign_iface(ctdb, vnn);
4197                                 }
4198                                 talloc_free(mem_ctx);
4199                                 return 0;
4200                         }
4201                         vnn->pnn = -1;
4202
4203                         ret = ctdb_event_script_callback(ctdb, 
4204                                          mem_ctx, delete_ip_callback, mem_ctx,
4205                                          false,
4206                                          CTDB_EVENT_RELEASE_IP,
4207                                          "%s %s %u",
4208                                          ctdb_vnn_iface_string(vnn),
4209                                          ctdb_addr_to_str(&vnn->public_address),
4210                                          vnn->public_netmask_bits);
4211                         if (vnn->iface != NULL) {
4212                                 ctdb_vnn_unassign_iface(ctdb, vnn);
4213                         }
4214                         if (ret != 0) {
4215                                 return -1;
4216                         }
4217                         return 0;
4218                 }
4219         }
4220
4221         return -1;
4222 }
4223
4224
/* State carried from ctdb_control_ipreallocated to its completion callback */
struct ipreallocated_callback_state {
	/* the control request to reply to once the event has finished */
	struct ctdb_req_control *c;
};
4228
4229 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4230                                         int status, void *p)
4231 {
4232         struct ipreallocated_callback_state *state =
4233                 talloc_get_type(p, struct ipreallocated_callback_state);
4234
4235         if (status != 0) {
4236                 DEBUG(DEBUG_ERR,
4237                       (" \"ipreallocated\" event script failed (status %d)\n",
4238                        status));
4239                 if (status == -ETIME) {
4240                         ctdb_ban_self(ctdb);
4241                 }
4242         }
4243
4244         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4245         talloc_free(state);
4246 }
4247
4248 /* A control to run the ipreallocated event */
4249 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4250                                    struct ctdb_req_control *c,
4251                                    bool *async_reply)
4252 {
4253         int ret;
4254         struct ipreallocated_callback_state *state;
4255
4256         state = talloc(ctdb, struct ipreallocated_callback_state);
4257         CTDB_NO_MEMORY(ctdb, state);
4258
4259         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4260
4261         ret = ctdb_event_script_callback(ctdb, state,
4262                                          ctdb_ipreallocated_callback, state,
4263                                          false, CTDB_EVENT_IPREALLOCATED,
4264                                          "%s", "");
4265
4266         if (ret != 0) {
4267                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4268                 talloc_free(state);
4269                 return -1;
4270         }
4271
4272         /* tell the control that we will be reply asynchronously */
4273         state->c    = talloc_steal(state, c);
4274         *async_reply = true;
4275
4276         return 0;
4277 }
4278
4279
4280 /* This function is called from the recovery daemon to verify that a remote
4281    node has the expected ip allocation.
4282    This is verified against ctdb->ip_tree
4283 */
4284 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4285                                 struct ctdb_all_public_ips *ips,
4286                                 uint32_t pnn)
4287 {
4288         struct ctdb_public_ip_list *tmp_ip; 
4289         int i;
4290
4291         if (ctdb->ip_tree == NULL) {
4292                 /* dont know the expected allocation yet, assume remote node
4293                    is correct. */
4294                 return 0;
4295         }
4296
4297         if (ips == NULL) {
4298                 return 0;
4299         }
4300
4301         for (i=0; i<ips->num; i++) {
4302                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4303                 if (tmp_ip == NULL) {
4304                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4305                         return -1;
4306                 }
4307
4308                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4309                         continue;
4310                 }
4311
4312                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4313                         DEBUG(DEBUG_ERR,
4314                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4315                                pnn,
4316                                ctdb_addr_to_str(&ips->ips[i].addr),
4317                                ips->ips[i].pnn, tmp_ip->pnn));
4318                         return -1;
4319                 }
4320         }
4321
4322         return 0;
4323 }
4324
4325 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4326 {
4327         struct ctdb_public_ip_list *tmp_ip; 
4328
4329         if (ctdb->ip_tree == NULL) {
4330                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4331                 return -1;
4332         }
4333
4334         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4335         if (tmp_ip == NULL) {
4336                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4337                 return -1;
4338         }
4339
4340         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4341         tmp_ip->pnn = ip->pnn;
4342
4343         return 0;
4344 }
4345
4346
/* State of one in-flight "reload public IPs" request */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* pending control request; answered from the destructor */
	struct ctdb_req_control *c;
	/* status sent in the reply; -1 until the child reports a result */
	int status;
	/* pipe from child to parent: [0] is read by the parent,
	   [1] is written by the child */
	int fd[2];
	/* pid of the forked child doing the work */
	pid_t child;
	/* fd event watching fd[0] for the child's result byte */
	struct fd_event *fde;
};
4355
4356 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4357 {
4358         if (h == h->ctdb->reload_ips) {
4359                 h->ctdb->reload_ips = NULL;
4360         }
4361         if (h->c != NULL) {
4362                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4363                 h->c = NULL;
4364         }
4365         ctdb_kill(h->ctdb, h->child, SIGKILL);
4366         return 0;
4367 }
4368
4369 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4370                                 struct timed_event *te,
4371                                 struct timeval t, void *private_data)
4372 {
4373         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4374
4375         talloc_free(h);
4376 }       
4377
4378 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4379                              uint16_t flags, void *private_data)
4380 {
4381         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4382
4383         char res;
4384         int ret;
4385
4386         ret = read(h->fd[0], &res, 1);
4387         if (ret < 1 || res != 0) {
4388                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4389                 res = 1;
4390         }
4391         h->status = res;
4392
4393         talloc_free(h);
4394 }
4395
4396 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4397 {
4398         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4399         struct ctdb_all_public_ips *ips;
4400         struct ctdb_vnn *vnn;
4401         struct client_async_data *async_data;
4402         struct timeval timeout;
4403         TDB_DATA data;
4404         struct ctdb_client_control_state *state;
4405         bool first_add;
4406         int i, ret;
4407
4408         CTDB_NO_MEMORY(ctdb, mem_ctx);
4409
4410         /* Read IPs from local node */
4411         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4412                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4413         if (ret != 0) {
4414                 DEBUG(DEBUG_ERR,
4415                       ("Unable to fetch public IPs from local node\n"));
4416                 talloc_free(mem_ctx);
4417                 return -1;
4418         }
4419
4420         /* Read IPs file - this is safe since this is a child process */
4421         ctdb->vnn = NULL;
4422         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4423                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4424                 talloc_free(mem_ctx);
4425                 return -1;
4426         }
4427
4428         async_data = talloc_zero(mem_ctx, struct client_async_data);
4429         CTDB_NO_MEMORY(ctdb, async_data);
4430
4431         /* Compare IPs between node and file for IPs to be deleted */
4432         for (i = 0; i < ips->num; i++) {
4433                 /* */
4434                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4435                         if (ctdb_same_ip(&vnn->public_address,
4436                                          &ips->ips[i].addr)) {
4437                                 /* IP is still in file */
4438                                 break;
4439                         }
4440                 }
4441
4442                 if (vnn == NULL) {
4443                         /* Delete IP ips->ips[i] */
4444                         struct ctdb_control_ip_iface *pub;
4445
4446                         DEBUG(DEBUG_NOTICE,
4447                               ("IP %s no longer configured, deleting it\n",
4448                                ctdb_addr_to_str(&ips->ips[i].addr)));
4449
4450                         pub = talloc_zero(mem_ctx,
4451                                           struct ctdb_control_ip_iface);
4452                         CTDB_NO_MEMORY(ctdb, pub);
4453
4454                         pub->addr  = ips->ips[i].addr;
4455                         pub->mask  = 0;
4456                         pub->len   = 0;
4457
4458                         timeout = TAKEOVER_TIMEOUT();
4459
4460                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4461                                               iface) + pub->len;
4462                         data.dptr = (uint8_t *)pub;
4463
4464                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4465                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4466                                                   0, data, async_data,
4467                                                   &timeout, NULL);
4468                         if (state == NULL) {
4469                                 DEBUG(DEBUG_ERR,
4470                                       (__location__
4471                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4472                                 goto failed;
4473                         }
4474
4475                 }
4476         }
4477
4478         /* Compare IPs between node and file for IPs to be added */
4479         first_add = true;
4480         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4481                 for (i = 0; i < ips->num; i++) {
4482                         if (ctdb_same_ip(&vnn->public_address,
4483                                          &ips->ips[i].addr)) {
4484                                 /* IP already on node */
4485                                 break;
4486                         }
4487                 }
4488                 if (i == ips->num) {
4489                         /* Add IP ips->ips[i] */
4490                         struct ctdb_control_ip_iface *pub;
4491                         const char *ifaces = NULL;
4492                         uint32_t len;
4493                         int iface = 0;
4494
4495                         DEBUG(DEBUG_NOTICE,
4496                               ("New IP %s configured, adding it\n",
4497                                ctdb_addr_to_str(&vnn->public_address)));
4498                         if (first_add) {
4499                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4500
4501                                 data.dsize = sizeof(pnn);
4502                                 data.dptr  = (uint8_t *)&pnn;
4503
4504                                 ret = ctdb_client_send_message(
4505                                         ctdb,
4506                                         CTDB_BROADCAST_CONNECTED,
4507                                         CTDB_SRVID_REBALANCE_NODE,
4508                                         data);
4509                                 if (ret != 0) {
4510                                         DEBUG(DEBUG_WARNING,
4511                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4512                                 }
4513
4514                                 first_add = false;
4515                         }
4516
4517                         ifaces = vnn->ifaces[0];
4518                         iface = 1;
4519                         while (vnn->ifaces[iface] != NULL) {
4520                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4521                                                          vnn->ifaces[iface]);
4522                                 iface++;
4523                         }
4524
4525                         len   = strlen(ifaces) + 1;
4526                         pub = talloc_zero_size(mem_ctx,
4527                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4528                         CTDB_NO_MEMORY(ctdb, pub);
4529
4530                         pub->addr  = vnn->public_address;
4531                         pub->mask  = vnn->public_netmask_bits;
4532                         pub->len   = len;
4533                         memcpy(&pub->iface[0], ifaces, pub->len);
4534
4535                         timeout = TAKEOVER_TIMEOUT();
4536
4537                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4538                                               iface) + pub->len;
4539                         data.dptr = (uint8_t *)pub;
4540
4541                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4542                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4543                                                   0, data, async_data,
4544                                                   &timeout, NULL);
4545                         if (state == NULL) {
4546                                 DEBUG(DEBUG_ERR,
4547                                       (__location__
4548                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4549                                 goto failed;
4550                         }
4551                 }
4552         }
4553
4554         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4555                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4556                 goto failed;
4557         }
4558
4559         talloc_free(mem_ctx);
4560         return 0;
4561
4562 failed:
4563         talloc_free(mem_ctx);
4564         return -1;
4565 }
4566
4567 /* This control is sent to force the node to re-read the public addresses file
4568    and drop any addresses we should nnot longer host, and add new addresses
4569    that we are now able to host
4570 */
4571 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4572 {
4573         struct ctdb_reloadips_handle *h;
4574         pid_t parent = getpid();
4575
4576         if (ctdb->reload_ips != NULL) {
4577                 talloc_free(ctdb->reload_ips);
4578                 ctdb->reload_ips = NULL;
4579         }
4580
4581         h = talloc(ctdb, struct ctdb_reloadips_handle);
4582         CTDB_NO_MEMORY(ctdb, h);
4583         h->ctdb     = ctdb;
4584         h->c        = NULL;
4585         h->status   = -1;
4586         
4587         if (pipe(h->fd) == -1) {
4588                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4589                 talloc_free(h);
4590                 return -1;
4591         }
4592
4593         h->child = ctdb_fork(ctdb);
4594         if (h->child == (pid_t)-1) {
4595                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4596                 close(h->fd[0]);
4597                 close(h->fd[1]);
4598                 talloc_free(h);
4599                 return -1;
4600         }
4601
4602         /* child process */
4603         if (h->child == 0) {
4604                 signed char res = 0;
4605
4606                 close(h->fd[0]);
4607                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4608
4609                 ctdb_set_process_name("ctdb_reloadips");
4610                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4611                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4612                         res = -1;
4613                 } else {
4614                         res = ctdb_reloadips_child(ctdb);
4615                         if (res != 0) {
4616                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4617                         }
4618                 }
4619
4620                 write(h->fd[1], &res, 1);
4621                 /* make sure we die when our parent dies */
4622                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4623                         sleep(5);
4624                 }
4625                 _exit(0);
4626         }
4627
4628         h->c             = talloc_steal(h, c);
4629
4630         close(h->fd[1]);
4631         set_close_on_exec(h->fd[0]);
4632
4633         talloc_set_destructor(h, ctdb_reloadips_destructor);
4634
4635
4636         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4637                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4638                         (void *)h);
4639         tevent_fd_set_auto_close(h->fde);
4640
4641         event_add_timed(ctdb->ev, h,
4642                         timeval_current_ofs(120, 0),
4643                         ctdb_reloadips_timeout_event, h);
4644
4645         /* we reply later */
4646         *async_reply = true;
4647         return 0;
4648 }