ctdb-daemon: Remove obsolete IPv4 only controls
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value for takeover operations, built from the
 * takeover_timeout tunable.  The expansion reads a variable named
 * "ctdb", so the macro can only be used where one is in scope.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP scheduling used by ctdb_control_send_arp():
 * CTDB_ARP_REPEAT is the number of announcement rounds sent per
 * takeover and CTDB_ARP_INTERVAL is the first argument given to
 * timeval_current_ofs() when the next round is scheduled.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
36 /* Flags used in IP allocation algorithms. */
37 struct ctdb_ipflags {
38         bool noiptakeover;
39         bool noiphost;
40         enum ctdb_runstate runstate;
41 };
42
/*
 * One network interface known to this node.  Entries are kept in the
 * doubly linked list ctdb->ifaces.
 */
struct ctdb_iface {
        /* list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name; a talloc child of this structure */
        const char *name;
        /* only interfaces with link_up set are chosen for public IPs */
        bool link_up;
        /* number of VNNs currently assigned to this interface */
        uint32_t references;
};
49
50 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
51 {
52         if (vnn->iface) {
53                 return vnn->iface->name;
54         }
55
56         return "__none__";
57 }
58
59 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
60 {
61         struct ctdb_iface *i;
62
63         /* Verify that we dont have an entry for this ip yet */
64         for (i=ctdb->ifaces;i;i=i->next) {
65                 if (strcmp(i->name, iface) == 0) {
66                         return 0;
67                 }
68         }
69
70         /* create a new structure for this interface */
71         i = talloc_zero(ctdb, struct ctdb_iface);
72         CTDB_NO_MEMORY_FATAL(ctdb, i);
73         i->name = talloc_strdup(i, iface);
74         CTDB_NO_MEMORY(ctdb, i->name);
75         /*
76          * If link_up defaults to true then IPs can be allocated to a
77          * node during the first recovery.  However, then an interface
78          * could have its link marked down during the startup event,
79          * causing the IP to move almost immediately.  If link_up
80          * defaults to false then, during normal operation, IPs added
81          * to a new interface can't be assigned until a monitor cycle
82          * has occurred and marked the new interfaces up.  This makes
83          * IP allocation unpredictable.  The following is a neat
84          * compromise: early in startup link_up defaults to false, so
85          * IPs can't be assigned, and after startup IPs can be
86          * assigned immediately.
87          */
88         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
89
90         DLIST_ADD(ctdb->ifaces, i);
91
92         return 0;
93 }
94
95 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
96                                         const char *name)
97 {
98         int n;
99
100         for (n = 0; vnn->ifaces[n] != NULL; n++) {
101                 if (strcmp(name, vnn->ifaces[n]) == 0) {
102                         return true;
103                 }
104         }
105
106         return false;
107 }
108
109 /* If any interfaces now have no possible IPs then delete them.  This
110  * implementation is naive (i.e. simple) rather than clever
111  * (i.e. complex).  Given that this is run on delip and that operation
112  * is rare, this doesn't need to be efficient - it needs to be
113  * foolproof.  One alternative is reference counting, where the logic
114  * is distributed and can, therefore, be broken in multiple places.
115  * Another alternative is to build a red-black tree of interfaces that
116  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
117  * once) and then walking ctdb->ifaces once and deleting those not in
118  * the tree.  Let's go to one of those if the naive implementation
119  * causes problems...  :-)
120  */
121 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
122                                         struct ctdb_vnn *vnn)
123 {
124         struct ctdb_iface *i, *next;
125
126         /* For each interface, check if there's an IP using it. */
127         for (i = ctdb->ifaces; i != NULL; i = next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130                 next = i->next;
131
132                 /* Only consider interfaces named in the given VNN. */
133                 if (!vnn_has_interface_with_name(vnn, i->name)) {
134                         continue;
135                 }
136
137                 /* Is the "single IP" on this interface? */
138                 if ((ctdb->single_ip_vnn != NULL) &&
139                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
140                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
141                         /* Found, next interface please... */
142                         continue;
143                 }
144                 /* Search for a vnn with this interface. */
145                 found = false;
146                 for (tv=ctdb->vnn; tv; tv=tv->next) {
147                         if (vnn_has_interface_with_name(tv, i->name)) {
148                                 found = true;
149                                 break;
150                         }
151                 }
152
153                 if (!found) {
154                         /* None of the VNNs are using this interface. */
155                         DLIST_REMOVE(ctdb->ifaces, i);
156                         talloc_free(i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->delete_pending) {
264                 return false;
265         }
266
267         if (vnn->iface && vnn->iface->link_up) {
268                 return true;
269         }
270
271         for (i=0; vnn->ifaces[i]; i++) {
272                 struct ctdb_iface *cur;
273
274                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
275                 if (cur == NULL) {
276                         continue;
277                 }
278
279                 if (cur->link_up) {
280                         return true;
281                 }
282         }
283
284         return false;
285 }
286
287 struct ctdb_takeover_arp {
288         struct ctdb_context *ctdb;
289         uint32_t count;
290         ctdb_sock_addr addr;
291         struct ctdb_tcp_array *tcparray;
292         struct ctdb_vnn *vnn;
293 };
294
295
296 /*
297   lists of tcp endpoints
298  */
299 struct ctdb_tcp_list {
300         struct ctdb_tcp_list *prev, *next;
301         struct ctdb_tcp_connection connection;
302 };
303
304 /*
305   list of clients to kill on IP release
306  */
307 struct ctdb_client_ip {
308         struct ctdb_client_ip *prev, *next;
309         struct ctdb_context *ctdb;
310         ctdb_sock_addr addr;
311         uint32_t client_id;
312 };
313
314
315 /*
316   send a gratuitous arp
317  */
318 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
319                                   struct timeval t, void *private_data)
320 {
321         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
322                                                         struct ctdb_takeover_arp);
323         int i, ret;
324         struct ctdb_tcp_array *tcparray;
325         const char *iface = ctdb_vnn_iface_string(arp->vnn);
326
327         ret = ctdb_sys_send_arp(&arp->addr, iface);
328         if (ret != 0) {
329                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
330                                   iface, strerror(errno)));
331         }
332
333         tcparray = arp->tcparray;
334         if (tcparray) {
335                 for (i=0;i<tcparray->num;i++) {
336                         struct ctdb_tcp_connection *tcon;
337
338                         tcon = &tcparray->connections[i];
339                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
340                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
341                                 ctdb_addr_to_str(&tcon->src_addr),
342                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
343                         ret = ctdb_sys_send_tcp(
344                                 &tcon->src_addr, 
345                                 &tcon->dst_addr,
346                                 0, 0, 0);
347                         if (ret != 0) {
348                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
349                                         ctdb_addr_to_str(&tcon->src_addr)));
350                         }
351                 }
352         }
353
354         arp->count++;
355
356         if (arp->count == CTDB_ARP_REPEAT) {
357                 talloc_free(arp);
358                 return;
359         }
360
361         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
362                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
363                         ctdb_control_send_arp, arp);
364 }
365
366 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
367                                        struct ctdb_vnn *vnn)
368 {
369         struct ctdb_takeover_arp *arp;
370         struct ctdb_tcp_array *tcparray;
371
372         if (!vnn->takeover_ctx) {
373                 vnn->takeover_ctx = talloc_new(vnn);
374                 if (!vnn->takeover_ctx) {
375                         return -1;
376                 }
377         }
378
379         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
380         if (!arp) {
381                 return -1;
382         }
383
384         arp->ctdb = ctdb;
385         arp->addr = vnn->public_address;
386         arp->vnn  = vnn;
387
388         tcparray = vnn->tcp_array;
389         if (tcparray) {
390                 /* add all of the known tcp connections for this IP to the
391                    list of tcp connections to send tickle acks for */
392                 arp->tcparray = talloc_steal(arp, tcparray);
393
394                 vnn->tcp_array = NULL;
395                 vnn->tcp_update_needed = true;
396         }
397
398         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
399                         timeval_zero(), ctdb_control_send_arp, arp);
400
401         return 0;
402 }
403
404 struct takeover_callback_state {
405         struct ctdb_req_control *c;
406         ctdb_sock_addr *addr;
407         struct ctdb_vnn *vnn;
408 };
409
/*
 * Context of an in-flight "takeip" event, see ctdb_do_takeip().
 */
struct ctdb_do_takeip_state {
        /* control request answered from ctdb_do_takeip_callback() */
        struct ctdb_req_control *c;
        /* VNN being taken over; update_in_flight is cleared on free */
        struct ctdb_vnn *vnn;
};
414
415 /*
416   called when takeip event finishes
417  */
418 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
419                                     void *private_data)
420 {
421         struct ctdb_do_takeip_state *state =
422                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
423         int32_t ret;
424         TDB_DATA data;
425
426         if (status != 0) {
427                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
428         
429                 if (status == -ETIME) {
430                         ctdb_ban_self(ctdb);
431                 }
432                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
433                                  ctdb_addr_to_str(&state->vnn->public_address),
434                                  ctdb_vnn_iface_string(state->vnn)));
435                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
436
437                 node->flags |= NODE_FLAGS_UNHEALTHY;
438                 talloc_free(state);
439                 return;
440         }
441
442         if (ctdb->do_checkpublicip) {
443
444         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
445         if (ret != 0) {
446                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
447                 talloc_free(state);
448                 return;
449         }
450
451         }
452
453         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
454         data.dsize = strlen((char *)data.dptr) + 1;
455         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
456
457         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
458
459
460         /* the control succeeded */
461         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
462         talloc_free(state);
463         return;
464 }
465
466 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
467 {
468         state->vnn->update_in_flight = false;
469         return 0;
470 }
471
472 /*
473   take over an ip address
474  */
475 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
476                               struct ctdb_req_control *c,
477                               struct ctdb_vnn *vnn)
478 {
479         int ret;
480         struct ctdb_do_takeip_state *state;
481
482         if (vnn->update_in_flight) {
483                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
484                                     "update for this IP already in flight\n",
485                                     ctdb_addr_to_str(&vnn->public_address),
486                                     vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         ret = ctdb_vnn_assign_iface(ctdb, vnn);
491         if (ret != 0) {
492                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
493                                  "assign a usable interface\n",
494                                  ctdb_addr_to_str(&vnn->public_address),
495                                  vnn->public_netmask_bits));
496                 return -1;
497         }
498
499         state = talloc(vnn, struct ctdb_do_takeip_state);
500         CTDB_NO_MEMORY(ctdb, state);
501
502         state->c = talloc_steal(ctdb, c);
503         state->vnn   = vnn;
504
505         vnn->update_in_flight = true;
506         talloc_set_destructor(state, ctdb_takeip_destructor);
507
508         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
509                             ctdb_addr_to_str(&vnn->public_address),
510                             vnn->public_netmask_bits,
511                             ctdb_vnn_iface_string(vnn)));
512
513         ret = ctdb_event_script_callback(ctdb,
514                                          state,
515                                          ctdb_do_takeip_callback,
516                                          state,
517                                          CTDB_EVENT_TAKE_IP,
518                                          "%s %s %u",
519                                          ctdb_vnn_iface_string(vnn),
520                                          ctdb_addr_to_str(&vnn->public_address),
521                                          vnn->public_netmask_bits);
522
523         if (ret != 0) {
524                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
525                         ctdb_addr_to_str(&vnn->public_address),
526                         ctdb_vnn_iface_string(vnn)));
527                 talloc_free(state);
528                 return -1;
529         }
530
531         return 0;
532 }
533
/*
 * Context of an in-flight "updateip" event, see ctdb_do_updateip().
 */
struct ctdb_do_updateip_state {
        /* control request answered from ctdb_do_updateip_callback() */
        struct ctdb_req_control *c;
        /* interface the address is moved away from; restored on failure */
        struct ctdb_iface *old;
        /* VNN being moved; update_in_flight is cleared on free */
        struct ctdb_vnn *vnn;
};
539
540 /*
541   called when updateip event finishes
542  */
543 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
544                                       void *private_data)
545 {
546         struct ctdb_do_updateip_state *state =
547                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
548         int32_t ret;
549
550         if (status != 0) {
551                 if (status == -ETIME) {
552                         ctdb_ban_self(ctdb);
553                 }
554                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
555                         ctdb_addr_to_str(&state->vnn->public_address),
556                         state->old->name,
557                         ctdb_vnn_iface_string(state->vnn)));
558
559                 /*
560                  * All we can do is reset the old interface
561                  * and let the next run fix it
562                  */
563                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
564                 state->vnn->iface = state->old;
565                 state->vnn->iface->references++;
566
567                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
568                 talloc_free(state);
569                 return;
570         }
571
572         if (ctdb->do_checkpublicip) {
573
574         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
575         if (ret != 0) {
576                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
577                 talloc_free(state);
578                 return;
579         }
580
581         }
582
583         /* the control succeeded */
584         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
585         talloc_free(state);
586         return;
587 }
588
589 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
590 {
591         state->vnn->update_in_flight = false;
592         return 0;
593 }
594
595 /*
596   update (move) an ip address
597  */
598 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
599                                 struct ctdb_req_control *c,
600                                 struct ctdb_vnn *vnn)
601 {
602         int ret;
603         struct ctdb_do_updateip_state *state;
604         struct ctdb_iface *old = vnn->iface;
605         const char *new_name;
606
607         if (vnn->update_in_flight) {
608                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
609                                     "update for this IP already in flight\n",
610                                     ctdb_addr_to_str(&vnn->public_address),
611                                     vnn->public_netmask_bits));
612                 return -1;
613         }
614
615         ctdb_vnn_unassign_iface(ctdb, vnn);
616         ret = ctdb_vnn_assign_iface(ctdb, vnn);
617         if (ret != 0) {
618                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
619                                  "assin a usable interface (old iface '%s')\n",
620                                  ctdb_addr_to_str(&vnn->public_address),
621                                  vnn->public_netmask_bits,
622                                  old->name));
623                 return -1;
624         }
625
626         new_name = ctdb_vnn_iface_string(vnn);
627         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
628                 /* A benign update from one interface onto itself.
629                  * no need to run the eventscripts in this case, just return
630                  * success.
631                  */
632                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
633                 return 0;
634         }
635
636         state = talloc(vnn, struct ctdb_do_updateip_state);
637         CTDB_NO_MEMORY(ctdb, state);
638
639         state->c = talloc_steal(ctdb, c);
640         state->old = old;
641         state->vnn = vnn;
642
643         vnn->update_in_flight = true;
644         talloc_set_destructor(state, ctdb_updateip_destructor);
645
646         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
647                             "interface %s to %s\n",
648                             ctdb_addr_to_str(&vnn->public_address),
649                             vnn->public_netmask_bits,
650                             old->name,
651                             new_name));
652
653         ret = ctdb_event_script_callback(ctdb,
654                                          state,
655                                          ctdb_do_updateip_callback,
656                                          state,
657                                          CTDB_EVENT_UPDATE_IP,
658                                          "%s %s %s %u",
659                                          state->old->name,
660                                          new_name,
661                                          ctdb_addr_to_str(&vnn->public_address),
662                                          vnn->public_netmask_bits);
663         if (ret != 0) {
664                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
665                                  ctdb_addr_to_str(&vnn->public_address),
666                                  old->name, new_name));
667                 talloc_free(state);
668                 return -1;
669         }
670
671         return 0;
672 }
673
674 /*
675   Find the vnn of the node that has a public ip address
676   returns -1 if the address is not known as a public address
677  */
678 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
679 {
680         struct ctdb_vnn *vnn;
681
682         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
683                 if (ctdb_same_ip(&vnn->public_address, addr)) {
684                         return vnn;
685                 }
686         }
687
688         return NULL;
689 }
690
691 /*
692   take over an ip address
693  */
694 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
695                                  struct ctdb_req_control *c,
696                                  TDB_DATA indata,
697                                  bool *async_reply)
698 {
699         int ret;
700         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
701         struct ctdb_vnn *vnn;
702         bool have_ip = false;
703         bool do_updateip = false;
704         bool do_takeip = false;
705         struct ctdb_iface *best_iface = NULL;
706
707         if (pip->pnn != ctdb->pnn) {
708                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
709                                  "with pnn %d, but we're node %d\n",
710                                  ctdb_addr_to_str(&pip->addr),
711                                  pip->pnn, ctdb->pnn));
712                 return -1;
713         }
714
715         /* update out vnn list */
716         vnn = find_public_ip_vnn(ctdb, &pip->addr);
717         if (vnn == NULL) {
718                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
719                         ctdb_addr_to_str(&pip->addr)));
720                 return 0;
721         }
722
723         if (ctdb->do_checkpublicip) {
724                 have_ip = ctdb_sys_have_ip(&pip->addr);
725         }
726         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
727         if (best_iface == NULL) {
728                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
729                                  "a usable interface (old %s, have_ip %d)\n",
730                                  ctdb_addr_to_str(&vnn->public_address),
731                                  vnn->public_netmask_bits,
732                                  ctdb_vnn_iface_string(vnn),
733                                  have_ip));
734                 return -1;
735         }
736
737         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
738                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
739                 have_ip = false;
740         }
741
742
743         if (vnn->iface == NULL && have_ip) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
746                                  ctdb_addr_to_str(&vnn->public_address)));
747                 return 0;
748         }
749
750         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
751                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752                                   "and we have it on iface[%s], but it was assigned to node %d"
753                                   "and we are node %d, banning ourself\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
756                 ctdb_ban_self(ctdb);
757                 return -1;
758         }
759
760         if (vnn->pnn == -1 && have_ip) {
761                 vnn->pnn = ctdb->pnn;
762                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
763                                   "and we already have it on iface[%s], update local daemon\n",
764                                  ctdb_addr_to_str(&vnn->public_address),
765                                   ctdb_vnn_iface_string(vnn)));
766                 return 0;
767         }
768
769         if (vnn->iface) {
770                 if (vnn->iface != best_iface) {
771                         if (!vnn->iface->link_up) {
772                                 do_updateip = true;
773                         } else if (vnn->iface->references > (best_iface->references + 1)) {
774                                 /* only move when the rebalance gains something */
775                                         do_updateip = true;
776                         }
777                 }
778         }
779
780         if (!have_ip) {
781                 if (do_updateip) {
782                         ctdb_vnn_unassign_iface(ctdb, vnn);
783                         do_updateip = false;
784                 }
785                 do_takeip = true;
786         }
787
788         if (do_takeip) {
789                 ret = ctdb_do_takeip(ctdb, c, vnn);
790                 if (ret != 0) {
791                         return -1;
792                 }
793         } else if (do_updateip) {
794                 ret = ctdb_do_updateip(ctdb, c, vnn);
795                 if (ret != 0) {
796                         return -1;
797                 }
798         } else {
799                 /*
800                  * The interface is up and the kernel known the ip
801                  * => do nothing
802                  */
803                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
804                         ctdb_addr_to_str(&pip->addr),
805                         vnn->public_netmask_bits,
806                         ctdb_vnn_iface_string(vnn)));
807                 return 0;
808         }
809
810         /* tell ctdb_control.c that we will be replying asynchronously */
811         *async_reply = true;
812
813         return 0;
814 }
815
816 /*
817   kill any clients that are registered with a IP that is being released
818  */
819 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
820 {
821         struct ctdb_client_ip *ip;
822
823         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
824                 ctdb_addr_to_str(addr)));
825
826         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
827                 ctdb_sock_addr tmp_addr;
828
829                 tmp_addr = ip->addr;
830                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
831                         ip->client_id,
832                         ctdb_addr_to_str(&ip->addr)));
833
834                 if (ctdb_same_ip(&tmp_addr, addr)) {
835                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
836                                                                      ip->client_id, 
837                                                                      struct ctdb_client);
838                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
839                                 ip->client_id,
840                                 ctdb_addr_to_str(&ip->addr),
841                                 client->pid));
842
843                         if (client->pid != 0) {
844                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
845                                         (unsigned)client->pid,
846                                         ctdb_addr_to_str(addr),
847                                         ip->client_id));
848                                 kill(client->pid, SIGKILL);
849                         }
850                 }
851         }
852 }
853
854 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
855 {
856         DLIST_REMOVE(ctdb->vnn, vnn);
857         ctdb_vnn_unassign_iface(ctdb, vnn);
858         ctdb_remove_orphaned_ifaces(ctdb, vnn);
859         talloc_free(vnn);
860 }
861
862 /*
863   called when releaseip event finishes
864  */
865 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
866                                 void *private_data)
867 {
868         struct takeover_callback_state *state = 
869                 talloc_get_type(private_data, struct takeover_callback_state);
870         TDB_DATA data;
871
872         if (status == -ETIME) {
873                 ctdb_ban_self(ctdb);
874         }
875
876         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
877                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
878                                   ctdb_addr_to_str(state->addr)));
879                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
880                 talloc_free(state);
881                 return;
882         }
883
884         /* send a message to all clients of this node telling them
885            that the cluster has been reconfigured and they should
886            release any sockets on this IP */
887         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
888         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
889         data.dsize = strlen((char *)data.dptr)+1;
890
891         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
892
893         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
894
895         /* kill clients that have registered with this IP */
896         release_kill_clients(ctdb, state->addr);
897
898         ctdb_vnn_unassign_iface(ctdb, state->vnn);
899
900         /* Process the IP if it has been marked for deletion */
901         if (state->vnn->delete_pending) {
902                 do_delete_ip(ctdb, state->vnn);
903                 state->vnn = NULL;
904         }
905
906         /* the control succeeded */
907         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
908         talloc_free(state);
909 }
910
911 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
912 {
913         if (state->vnn != NULL) {
914                 state->vnn->update_in_flight = false;
915         }
916         return 0;
917 }
918
919 /*
920   release an ip address
921  */
922 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
923                                 struct ctdb_req_control *c,
924                                 TDB_DATA indata, 
925                                 bool *async_reply)
926 {
927         int ret;
928         struct takeover_callback_state *state;
929         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
930         struct ctdb_vnn *vnn;
931         char *iface;
932
933         /* update our vnn list */
934         vnn = find_public_ip_vnn(ctdb, &pip->addr);
935         if (vnn == NULL) {
936                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
937                         ctdb_addr_to_str(&pip->addr)));
938                 return 0;
939         }
940         vnn->pnn = pip->pnn;
941
942         /* stop any previous arps */
943         talloc_free(vnn->takeover_ctx);
944         vnn->takeover_ctx = NULL;
945
946         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
947          * lazy multicast to drop an IP from any node that isn't the
948          * intended new node.  The following causes makes ctdbd ignore
949          * a release for any address it doesn't host.
950          */
951         if (ctdb->do_checkpublicip) {
952                 if (!ctdb_sys_have_ip(&pip->addr)) {
953                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
954                                 ctdb_addr_to_str(&pip->addr),
955                                 vnn->public_netmask_bits,
956                                 ctdb_vnn_iface_string(vnn)));
957                         ctdb_vnn_unassign_iface(ctdb, vnn);
958                         return 0;
959                 }
960         } else {
961                 if (vnn->iface == NULL) {
962                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
963                                            ctdb_addr_to_str(&pip->addr),
964                                            vnn->public_netmask_bits));
965                         return 0;
966                 }
967         }
968
969         /* There is a potential race between take_ip and us because we
970          * update the VNN via a callback that run when the
971          * eventscripts have been run.  Avoid the race by allowing one
972          * update to be in flight at a time.
973          */
974         if (vnn->update_in_flight) {
975                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
976                                     "update for this IP already in flight\n",
977                                     ctdb_addr_to_str(&vnn->public_address),
978                                     vnn->public_netmask_bits));
979                 return -1;
980         }
981
982         iface = strdup(ctdb_vnn_iface_string(vnn));
983
984         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
985                 ctdb_addr_to_str(&pip->addr),
986                 vnn->public_netmask_bits,
987                 iface,
988                 pip->pnn));
989
990         state = talloc(ctdb, struct takeover_callback_state);
991         if (state == NULL) {
992                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
993                                __FILE__, __LINE__);
994                 free(iface);
995                 return -1;
996         }
997
998         state->c = talloc_steal(state, c);
999         state->addr = talloc(state, ctdb_sock_addr);       
1000         if (state->addr == NULL) {
1001                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1002                                __FILE__, __LINE__);
1003                 free(iface);
1004                 talloc_free(state);
1005                 return -1;
1006         }
1007         *state->addr = pip->addr;
1008         state->vnn   = vnn;
1009
1010         vnn->update_in_flight = true;
1011         talloc_set_destructor(state, ctdb_releaseip_destructor);
1012
1013         ret = ctdb_event_script_callback(ctdb, 
1014                                          state, release_ip_callback, state,
1015                                          CTDB_EVENT_RELEASE_IP,
1016                                          "%s %s %u",
1017                                          iface,
1018                                          ctdb_addr_to_str(&pip->addr),
1019                                          vnn->public_netmask_bits);
1020         free(iface);
1021         if (ret != 0) {
1022                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1023                         ctdb_addr_to_str(&pip->addr),
1024                         ctdb_vnn_iface_string(vnn)));
1025                 talloc_free(state);
1026                 return -1;
1027         }
1028
1029         /* tell the control that we will be reply asynchronously */
1030         *async_reply = true;
1031         return 0;
1032 }
1033
1034 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1035                                    ctdb_sock_addr *addr,
1036                                    unsigned mask, const char *ifaces,
1037                                    bool check_address)
1038 {
1039         struct ctdb_vnn      *vnn;
1040         uint32_t num = 0;
1041         char *tmp;
1042         const char *iface;
1043         int i;
1044         int ret;
1045
1046         tmp = strdup(ifaces);
1047         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1048                 if (!ctdb_sys_check_iface_exists(iface)) {
1049                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1050                         free(tmp);
1051                         return -1;
1052                 }
1053         }
1054         free(tmp);
1055
1056         /* Verify that we dont have an entry for this ip yet */
1057         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1058                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1059                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1060                                 ctdb_addr_to_str(addr)));
1061                         return -1;
1062                 }               
1063         }
1064
1065         /* create a new vnn structure for this ip address */
1066         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1067         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1068         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1069         tmp = talloc_strdup(vnn, ifaces);
1070         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1071         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1072                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1073                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1074                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1075                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1076                 num++;
1077         }
1078         talloc_free(tmp);
1079         vnn->ifaces[num] = NULL;
1080         vnn->public_address      = *addr;
1081         vnn->public_netmask_bits = mask;
1082         vnn->pnn                 = -1;
1083         if (check_address) {
1084                 if (ctdb_sys_have_ip(addr)) {
1085                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1086                         vnn->pnn = ctdb->pnn;
1087                 }
1088         }
1089
1090         for (i=0; vnn->ifaces[i]; i++) {
1091                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1092                 if (ret != 0) {
1093                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1094                                            "for public_address[%s]\n",
1095                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1096                         talloc_free(vnn);
1097                         return -1;
1098                 }
1099         }
1100
1101         DLIST_ADD(ctdb->vnn, vnn);
1102
1103         return 0;
1104 }
1105
1106 /*
1107   setup the public address lists from a file
1108 */
1109 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1110 {
1111         char **lines;
1112         int nlines;
1113         int i;
1114
1115         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1116         if (lines == NULL) {
1117                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1118                 return -1;
1119         }
1120         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1121                 nlines--;
1122         }
1123
1124         for (i=0;i<nlines;i++) {
1125                 unsigned mask;
1126                 ctdb_sock_addr addr;
1127                 const char *addrstr;
1128                 const char *ifaces;
1129                 char *tok, *line;
1130
1131                 line = lines[i];
1132                 while ((*line == ' ') || (*line == '\t')) {
1133                         line++;
1134                 }
1135                 if (*line == '#') {
1136                         continue;
1137                 }
1138                 if (strcmp(line, "") == 0) {
1139                         continue;
1140                 }
1141                 tok = strtok(line, " \t");
1142                 addrstr = tok;
1143                 tok = strtok(NULL, " \t");
1144                 if (tok == NULL) {
1145                         if (NULL == ctdb->default_public_interface) {
1146                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1147                                          i+1));
1148                                 talloc_free(lines);
1149                                 return -1;
1150                         }
1151                         ifaces = ctdb->default_public_interface;
1152                 } else {
1153                         ifaces = tok;
1154                 }
1155
1156                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1157                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1158                         talloc_free(lines);
1159                         return -1;
1160                 }
1161                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1162                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1163                         talloc_free(lines);
1164                         return -1;
1165                 }
1166         }
1167
1168
1169         talloc_free(lines);
1170         return 0;
1171 }
1172
1173 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1174                               const char *iface,
1175                               const char *ip)
1176 {
1177         struct ctdb_vnn *svnn;
1178         struct ctdb_iface *cur = NULL;
1179         bool ok;
1180         int ret;
1181
1182         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1183         CTDB_NO_MEMORY(ctdb, svnn);
1184
1185         svnn->ifaces = talloc_array(svnn, const char *, 2);
1186         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1187         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1188         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1189         svnn->ifaces[1] = NULL;
1190
1191         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1192         if (!ok) {
1193                 talloc_free(svnn);
1194                 return -1;
1195         }
1196
1197         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1198         if (ret != 0) {
1199                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1200                                    "for single_ip[%s]\n",
1201                                    svnn->ifaces[0],
1202                                    ctdb_addr_to_str(&svnn->public_address)));
1203                 talloc_free(svnn);
1204                 return -1;
1205         }
1206
1207         /* assume the single public ip interface is initially "good" */
1208         cur = ctdb_find_iface(ctdb, iface);
1209         if (cur == NULL) {
1210                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1211                 return -1;
1212         }
1213         cur->link_up = true;
1214
1215         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1216         if (ret != 0) {
1217                 talloc_free(svnn);
1218                 return -1;
1219         }
1220
1221         ctdb->single_ip_vnn = svnn;
1222         return 0;
1223 }
1224
1225 struct ctdb_public_ip_list {
1226         struct ctdb_public_ip_list *next;
1227         uint32_t pnn;
1228         ctdb_sock_addr addr;
1229 };
1230
1231 /* Given a physical node, return the number of
1232    public addresses that is currently assigned to this node.
1233 */
1234 static int node_ip_coverage(struct ctdb_context *ctdb, 
1235         int32_t pnn,
1236         struct ctdb_public_ip_list *ips)
1237 {
1238         int num=0;
1239
1240         for (;ips;ips=ips->next) {
1241                 if (ips->pnn == pnn) {
1242                         num++;
1243                 }
1244         }
1245         return num;
1246 }
1247
1248
1249 /* Can the given node host the given IP: is the public IP known to the
1250  * node and is NOIPHOST unset?
1251 */
1252 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1253                              struct ctdb_ipflags ipflags,
1254                              struct ctdb_public_ip_list *ip)
1255 {
1256         struct ctdb_all_public_ips *public_ips;
1257         int i;
1258
1259         if (ipflags.noiphost) {
1260                 return false;
1261         }
1262
1263         public_ips = ctdb->nodes[pnn]->available_public_ips;
1264
1265         if (public_ips == NULL) {
1266                 return false;
1267         }
1268
1269         for (i=0; i<public_ips->num; i++) {
1270                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1271                         /* yes, this node can serve this public ip */
1272                         return true;
1273                 }
1274         }
1275
1276         return false;
1277 }
1278
1279 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1280                                  struct ctdb_ipflags ipflags,
1281                                  struct ctdb_public_ip_list *ip)
1282 {
1283         if (ipflags.noiptakeover) {
1284                 return false;
1285         }
1286
1287         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1288 }
1289
1290 /* search the node lists list for a node to takeover this ip.
1291    pick the node that currently are serving the least number of ips
1292    so that the ips get spread out evenly.
1293 */
1294 static int find_takeover_node(struct ctdb_context *ctdb, 
1295                 struct ctdb_ipflags *ipflags,
1296                 struct ctdb_public_ip_list *ip,
1297                 struct ctdb_public_ip_list *all_ips)
1298 {
1299         int pnn, min=0, num;
1300         int i, numnodes;
1301
1302         numnodes = talloc_array_length(ipflags);
1303         pnn    = -1;
1304         for (i=0; i<numnodes; i++) {
1305                 /* verify that this node can serve this ip */
1306                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1307                         /* no it couldnt   so skip to the next node */
1308                         continue;
1309                 }
1310
1311                 num = node_ip_coverage(ctdb, i, all_ips);
1312                 /* was this the first node we checked ? */
1313                 if (pnn == -1) {
1314                         pnn = i;
1315                         min  = num;
1316                 } else {
1317                         if (num < min) {
1318                                 pnn = i;
1319                                 min  = num;
1320                         }
1321                 }
1322         }       
1323         if (pnn == -1) {
1324                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1325                         ctdb_addr_to_str(&ip->addr)));
1326
1327                 return -1;
1328         }
1329
1330         ip->pnn = pnn;
1331         return 0;
1332 }
1333
1334 #define IP_KEYLEN       4
1335 static uint32_t *ip_key(ctdb_sock_addr *ip)
1336 {
1337         static uint32_t key[IP_KEYLEN];
1338
1339         bzero(key, sizeof(key));
1340
1341         switch (ip->sa.sa_family) {
1342         case AF_INET:
1343                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1344                 break;
1345         case AF_INET6: {
1346                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1347                 key[0]  = htonl(s6_a32[0]);
1348                 key[1]  = htonl(s6_a32[1]);
1349                 key[2]  = htonl(s6_a32[2]);
1350                 key[3]  = htonl(s6_a32[3]);
1351                 break;
1352         }
1353         default:
1354                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1355                 return key;
1356         }
1357
1358         return key;
1359 }
1360
1361 static void *add_ip_callback(void *parm, void *data)
1362 {
1363         struct ctdb_public_ip_list *this_ip = parm; 
1364         struct ctdb_public_ip_list *prev_ip = data; 
1365
1366         if (prev_ip == NULL) {
1367                 return parm;
1368         }
1369         if (this_ip->pnn == -1) {
1370                 this_ip->pnn = prev_ip->pnn;
1371         }
1372
1373         return parm;
1374 }
1375
1376 static int getips_count_callback(void *param, void *data)
1377 {
1378         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1379         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1380
1381         new_ip->next = *ip_list;
1382         *ip_list     = new_ip;
1383         return 0;
1384 }
1385
1386 static struct ctdb_public_ip_list *
1387 create_merged_ip_list(struct ctdb_context *ctdb)
1388 {
1389         int i, j;
1390         struct ctdb_public_ip_list *ip_list;
1391         struct ctdb_all_public_ips *public_ips;
1392
1393         if (ctdb->ip_tree != NULL) {
1394                 talloc_free(ctdb->ip_tree);
1395                 ctdb->ip_tree = NULL;
1396         }
1397         ctdb->ip_tree = trbt_create(ctdb, 0);
1398
1399         for (i=0;i<ctdb->num_nodes;i++) {
1400                 public_ips = ctdb->nodes[i]->known_public_ips;
1401
1402                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1403                         continue;
1404                 }
1405
1406                 /* there were no public ips for this node */
1407                 if (public_ips == NULL) {
1408                         continue;
1409                 }               
1410
1411                 for (j=0;j<public_ips->num;j++) {
1412                         struct ctdb_public_ip_list *tmp_ip; 
1413
1414                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1415                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1416                         /* Do not use information about IP addresses hosted
1417                          * on other nodes, it may not be accurate */
1418                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1419                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1420                         } else {
1421                                 tmp_ip->pnn = -1;
1422                         }
1423                         tmp_ip->addr = public_ips->ips[j].addr;
1424                         tmp_ip->next = NULL;
1425
1426                         trbt_insertarray32_callback(ctdb->ip_tree,
1427                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1428                                 add_ip_callback,
1429                                 tmp_ip);
1430                 }
1431         }
1432
1433         ip_list = NULL;
1434         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1435
1436         return ip_list;
1437 }
1438
1439 /* 
1440  * This is the length of the longtest common prefix between the IPs.
1441  * It is calculated by XOR-ing the 2 IPs together and counting the
1442  * number of leading zeroes.  The implementation means that all
1443  * addresses end up being 128 bits long.
1444  *
1445  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1446  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1447  * lots of nodes and IP addresses?
1448  */
1449 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1450 {
1451         uint32_t ip1_k[IP_KEYLEN];
1452         uint32_t *t;
1453         int i;
1454         uint32_t x;
1455
1456         uint32_t distance = 0;
1457
1458         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1459         t = ip_key(ip2);
1460         for (i=0; i<IP_KEYLEN; i++) {
1461                 x = ip1_k[i] ^ t[i];
1462                 if (x == 0) {
1463                         distance += 32;
1464                 } else {
1465                         /* Count number of leading zeroes. 
1466                          * FIXME? This could be optimised...
1467                          */
1468                         while ((x & (1 << 31)) == 0) {
1469                                 x <<= 1;
1470                                 distance += 1;
1471                         }
1472                 }
1473         }
1474
1475         return distance;
1476 }
1477
1478 /* Calculate the IP distance for the given IP relative to IPs on the
1479    given node.  The ips argument is generally the all_ips variable
1480    used in the main part of the algorithm.
1481  */
1482 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1483                                   struct ctdb_public_ip_list *ips,
1484                                   int pnn)
1485 {
1486         struct ctdb_public_ip_list *t;
1487         uint32_t d;
1488
1489         uint32_t sum = 0;
1490
1491         for (t=ips; t != NULL; t=t->next) {
1492                 if (t->pnn != pnn) {
1493                         continue;
1494                 }
1495
1496                 /* Optimisation: We never calculate the distance
1497                  * between an address and itself.  This allows us to
1498                  * calculate the effect of removing an address from a
1499                  * node by simply calculating the distance between
1500                  * that address and all of the exitsing addresses.
1501                  * Moreover, we assume that we're only ever dealing
1502                  * with addresses from all_ips so we can identify an
1503                  * address via a pointer rather than doing a more
1504                  * expensive address comparison. */
1505                 if (&(t->addr) == ip) {
1506                         continue;
1507                 }
1508
1509                 d = ip_distance(ip, &(t->addr));
1510                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1511         }
1512
1513         return sum;
1514 }
1515
1516 /* Return the LCP2 imbalance metric for addresses currently assigned
1517    to the given node.
1518  */
1519 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1520 {
1521         struct ctdb_public_ip_list *t;
1522
1523         uint32_t imbalance = 0;
1524
1525         for (t=all_ips; t!=NULL; t=t->next) {
1526                 if (t->pnn != pnn) {
1527                         continue;
1528                 }
1529                 /* Pass the rest of the IPs rather than the whole
1530                    all_ips input list.
1531                 */
1532                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1533         }
1534
1535         return imbalance;
1536 }
1537
1538 /* Allocate any unassigned IPs just by looping through the IPs and
1539  * finding the best node for each.
1540  */
1541 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1542                                       struct ctdb_ipflags *ipflags,
1543                                       struct ctdb_public_ip_list *all_ips)
1544 {
1545         struct ctdb_public_ip_list *tmp_ip;
1546
1547         /* loop over all ip's and find a physical node to cover for 
1548            each unassigned ip.
1549         */
1550         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1551                 if (tmp_ip->pnn == -1) {
1552                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1553                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1554                                         ctdb_addr_to_str(&tmp_ip->addr)));
1555                         }
1556                 }
1557         }
1558 }
1559
1560 /* Basic non-deterministic rebalancing algorithm.
1561  */
1562 static void basic_failback(struct ctdb_context *ctdb,
1563                            struct ctdb_ipflags *ipflags,
1564                            struct ctdb_public_ip_list *all_ips,
1565                            int num_ips)
1566 {
1567         int i, numnodes;
1568         int maxnode, maxnum, minnode, minnum, num, retries;
1569         struct ctdb_public_ip_list *tmp_ip;
1570
1571         numnodes = talloc_array_length(ipflags);
1572         retries = 0;
1573
1574 try_again:
1575         maxnum=0;
1576         minnum=0;
1577
1578         /* for each ip address, loop over all nodes that can serve
1579            this ip and make sure that the difference between the node
1580            serving the most and the node serving the least ip's are
1581            not greater than 1.
1582         */
1583         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1584                 if (tmp_ip->pnn == -1) {
1585                         continue;
1586                 }
1587
1588                 /* Get the highest and lowest number of ips's served by any 
1589                    valid node which can serve this ip.
1590                 */
1591                 maxnode = -1;
1592                 minnode = -1;
1593                 for (i=0; i<numnodes; i++) {
1594                         /* only check nodes that can actually serve this ip */
1595                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1596                                 /* no it couldnt   so skip to the next node */
1597                                 continue;
1598                         }
1599
1600                         num = node_ip_coverage(ctdb, i, all_ips);
1601                         if (maxnode == -1) {
1602                                 maxnode = i;
1603                                 maxnum  = num;
1604                         } else {
1605                                 if (num > maxnum) {
1606                                         maxnode = i;
1607                                         maxnum  = num;
1608                                 }
1609                         }
1610                         if (minnode == -1) {
1611                                 minnode = i;
1612                                 minnum  = num;
1613                         } else {
1614                                 if (num < minnum) {
1615                                         minnode = i;
1616                                         minnum  = num;
1617                                 }
1618                         }
1619                 }
1620                 if (maxnode == -1) {
1621                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1622                                 ctdb_addr_to_str(&tmp_ip->addr)));
1623
1624                         continue;
1625                 }
1626
1627                 /* if the spread between the smallest and largest coverage by
1628                    a node is >=2 we steal one of the ips from the node with
1629                    most coverage to even things out a bit.
1630                    try to do this a limited number of times since we dont
1631                    want to spend too much time balancing the ip coverage.
1632                 */
1633                 if ( (maxnum > minnum+1)
1634                      && (retries < (num_ips + 5)) ){
1635                         struct ctdb_public_ip_list *tmp;
1636
1637                         /* Reassign one of maxnode's VNNs */
1638                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1639                                 if (tmp->pnn == maxnode) {
1640                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1641                                         retries++;
1642                                         goto try_again;;
1643                                 }
1644                         }
1645                 }
1646         }
1647 }
1648
1649 static void lcp2_init(struct ctdb_context *tmp_ctx,
1650                       struct ctdb_ipflags *ipflags,
1651                       struct ctdb_public_ip_list *all_ips,
1652                       uint32_t *force_rebalance_nodes,
1653                       uint32_t **lcp2_imbalances,
1654                       bool **rebalance_candidates)
1655 {
1656         int i, numnodes;
1657         struct ctdb_public_ip_list *tmp_ip;
1658
1659         numnodes = talloc_array_length(ipflags);
1660
1661         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1662         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1663         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1664         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1665
1666         for (i=0; i<numnodes; i++) {
1667                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1668                 /* First step: assume all nodes are candidates */
1669                 (*rebalance_candidates)[i] = true;
1670         }
1671
1672         /* 2nd step: if a node has IPs assigned then it must have been
1673          * healthy before, so we remove it from consideration.  This
1674          * is overkill but is all we have because we don't maintain
1675          * state between takeover runs.  An alternative would be to
1676          * keep state and invalidate it every time the recovery master
1677          * changes.
1678          */
1679         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1680                 if (tmp_ip->pnn != -1) {
1681                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1682                 }
1683         }
1684
1685         /* 3rd step: if a node is forced to re-balance then
1686            we allow failback onto the node */
1687         if (force_rebalance_nodes == NULL) {
1688                 return;
1689         }
1690         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1691                 uint32_t pnn = force_rebalance_nodes[i];
1692                 if (pnn >= numnodes) {
1693                         DEBUG(DEBUG_ERR,
1694                               (__location__ "unknown node %u\n", pnn));
1695                         continue;
1696                 }
1697
1698                 DEBUG(DEBUG_NOTICE,
1699                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1700                 (*rebalance_candidates)[pnn] = true;
1701         }
1702 }
1703
1704 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1705  * the IP/node combination that will cost the least.
1706  */
1707 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1708                                      struct ctdb_ipflags *ipflags,
1709                                      struct ctdb_public_ip_list *all_ips,
1710                                      uint32_t *lcp2_imbalances)
1711 {
1712         struct ctdb_public_ip_list *tmp_ip;
1713         int dstnode, numnodes;
1714
1715         int minnode;
1716         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1717         struct ctdb_public_ip_list *minip;
1718
1719         bool should_loop = true;
1720         bool have_unassigned = true;
1721
1722         numnodes = talloc_array_length(ipflags);
1723
1724         while (have_unassigned && should_loop) {
1725                 should_loop = false;
1726
1727                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1728                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1729
1730                 minnode = -1;
1731                 mindsum = 0;
1732                 minip = NULL;
1733
1734                 /* loop over each unassigned ip. */
1735                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1736                         if (tmp_ip->pnn != -1) {
1737                                 continue;
1738                         }
1739
1740                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1741                                 /* only check nodes that can actually takeover this ip */
1742                                 if (!can_node_takeover_ip(ctdb, dstnode,
1743                                                           ipflags[dstnode],
1744                                                           tmp_ip)) {
1745                                         /* no it couldnt   so skip to the next node */
1746                                         continue;
1747                                 }
1748
1749                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1750                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1751                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1752                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1753                                                    dstnode,
1754                                                    dstimbl - lcp2_imbalances[dstnode]));
1755
1756
1757                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1758                                         minnode = dstnode;
1759                                         minimbl = dstimbl;
1760                                         mindsum = dstdsum;
1761                                         minip = tmp_ip;
1762                                         should_loop = true;
1763                                 }
1764                         }
1765                 }
1766
1767                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1768
1769                 /* If we found one then assign it to the given node. */
1770                 if (minnode != -1) {
1771                         minip->pnn = minnode;
1772                         lcp2_imbalances[minnode] = minimbl;
1773                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1774                                           ctdb_addr_to_str(&(minip->addr)),
1775                                           minnode,
1776                                           mindsum));
1777                 }
1778
1779                 /* There might be a better way but at least this is clear. */
1780                 have_unassigned = false;
1781                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1782                         if (tmp_ip->pnn == -1) {
1783                                 have_unassigned = true;
1784                         }
1785                 }
1786         }
1787
1788         /* We know if we have an unassigned addresses so we might as
1789          * well optimise.
1790          */
1791         if (have_unassigned) {
1792                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1793                         if (tmp_ip->pnn == -1) {
1794                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1795                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1796                         }
1797                 }
1798         }
1799 }
1800
1801 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1802  * to move IPs from, determines the best IP/destination node
1803  * combination to move from the source node.
1804  */
1805 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1806                                     struct ctdb_ipflags *ipflags,
1807                                     struct ctdb_public_ip_list *all_ips,
1808                                     int srcnode,
1809                                     uint32_t *lcp2_imbalances,
1810                                     bool *rebalance_candidates)
1811 {
1812         int dstnode, mindstnode, numnodes;
1813         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1814         uint32_t minsrcimbl, mindstimbl;
1815         struct ctdb_public_ip_list *minip;
1816         struct ctdb_public_ip_list *tmp_ip;
1817
1818         /* Find an IP and destination node that best reduces imbalance. */
1819         srcimbl = 0;
1820         minip = NULL;
1821         minsrcimbl = 0;
1822         mindstnode = -1;
1823         mindstimbl = 0;
1824
1825         numnodes = talloc_array_length(ipflags);
1826
1827         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1828         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1829                            srcnode, lcp2_imbalances[srcnode]));
1830
1831         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1832                 /* Only consider addresses on srcnode. */
1833                 if (tmp_ip->pnn != srcnode) {
1834                         continue;
1835                 }
1836
1837                 /* What is this IP address costing the source node? */
1838                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1839                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1840
1841                 /* Consider this IP address would cost each potential
1842                  * destination node.  Destination nodes are limited to
1843                  * those that are newly healthy, since we don't want
1844                  * to do gratuitous failover of IPs just to make minor
1845                  * balance improvements.
1846                  */
1847                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1848                         if (!rebalance_candidates[dstnode]) {
1849                                 continue;
1850                         }
1851
1852                         /* only check nodes that can actually takeover this ip */
1853                         if (!can_node_takeover_ip(ctdb, dstnode,
1854                                                   ipflags[dstnode], tmp_ip)) {
1855                                 /* no it couldnt   so skip to the next node */
1856                                 continue;
1857                         }
1858
1859                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1860                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1861                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1862                                            srcnode, -srcdsum,
1863                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1864                                            dstnode, dstdsum));
1865
1866                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1867                             (dstdsum < srcdsum) &&                      \
1868                             ((mindstnode == -1) ||                              \
1869                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1870
1871                                 minip = tmp_ip;
1872                                 minsrcimbl = srcimbl;
1873                                 mindstnode = dstnode;
1874                                 mindstimbl = dstimbl;
1875                         }
1876                 }
1877         }
1878         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1879
1880         if (mindstnode != -1) {
1881                 /* We found a move that makes things better... */
1882                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1883                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1884                                   ctdb_addr_to_str(&(minip->addr)),
1885                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1886
1887
1888                 lcp2_imbalances[srcnode] = minsrcimbl;
1889                 lcp2_imbalances[mindstnode] = mindstimbl;
1890                 minip->pnn = mindstnode;
1891
1892                 return true;
1893         }
1894
1895         return false;
1896         
1897 }
1898
/* A node number paired with its LCP2 imbalance, for sorting. */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.
 *
 * Orders entries by descending imbalance: returns -1 if a has the
 * larger imbalance, 1 if b has, and 0 if they are equal.  The pnn
 * field is not compared.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
        return (lhs->imbalance < rhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
1917
1918 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1919  * node with the highest LCP2 imbalance, and then determines the best
1920  * IP/destination node combination to move from the source node.
1921  */
1922 static void lcp2_failback(struct ctdb_context *ctdb,
1923                           struct ctdb_ipflags *ipflags,
1924                           struct ctdb_public_ip_list *all_ips,
1925                           uint32_t *lcp2_imbalances,
1926                           bool *rebalance_candidates)
1927 {
1928         int i, numnodes;
1929         struct lcp2_imbalance_pnn * lips;
1930         bool again;
1931
1932         numnodes = talloc_array_length(ipflags);
1933
1934 try_again:
1935         /* Put the imbalances and nodes into an array, sort them and
1936          * iterate through candidates.  Usually the 1st one will be
1937          * used, so this doesn't cost much...
1938          */
1939         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1940         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1941         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1942         for (i=0; i<numnodes; i++) {
1943                 lips[i].imbalance = lcp2_imbalances[i];
1944                 lips[i].pnn = i;
1945                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1946         }
1947         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1948               lcp2_cmp_imbalance_pnn);
1949
1950         again = false;
1951         for (i=0; i<numnodes; i++) {
1952                 /* This means that all nodes had 0 or 1 addresses, so
1953                  * can't be imbalanced.
1954                  */
1955                 if (lips[i].imbalance == 0) {
1956                         break;
1957                 }
1958
1959                 if (lcp2_failback_candidate(ctdb,
1960                                             ipflags,
1961                                             all_ips,
1962                                             lips[i].pnn,
1963                                             lcp2_imbalances,
1964                                             rebalance_candidates)) {
1965                         again = true;
1966                         break;
1967                 }
1968         }
1969
1970         talloc_free(lips);
1971         if (again) {
1972                 goto try_again;
1973         }
1974 }
1975
1976 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1977                                     struct ctdb_ipflags *ipflags,
1978                                     struct ctdb_public_ip_list *all_ips)
1979 {
1980         struct ctdb_public_ip_list *tmp_ip;
1981
1982         /* verify that the assigned nodes can serve that public ip
1983            and set it to -1 if not
1984         */
1985         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1986                 if (tmp_ip->pnn == -1) {
1987                         continue;
1988                 }
1989                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1990                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1991                         /* this node can not serve this ip. */
1992                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1993                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1994                                            tmp_ip->pnn));
1995                         tmp_ip->pnn = -1;
1996                 }
1997         }
1998 }
1999
2000 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2001                                        struct ctdb_ipflags *ipflags,
2002                                        struct ctdb_public_ip_list *all_ips)
2003 {
2004         struct ctdb_public_ip_list *tmp_ip;
2005         int i, numnodes;
2006
2007         numnodes = talloc_array_length(ipflags);
2008
2009         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2010        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2011         *  always be allocated the same way for a specific set of
2012         *  available/unavailable nodes.
2013         */
2014
2015         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2016                 tmp_ip->pnn = i % numnodes;
2017         }
2018
2019         /* IP failback doesn't make sense with deterministic
2020          * IPs, since the modulo step above implicitly fails
2021          * back IPs to their "home" node.
2022          */
2023         if (1 == ctdb->tunable.no_ip_failback) {
2024                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2025         }
2026
2027         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2028
2029         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2030
2031         /* No failback here! */
2032 }
2033
2034 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2035                                           struct ctdb_ipflags *ipflags,
2036                                           struct ctdb_public_ip_list *all_ips)
2037 {
2038         /* This should be pushed down into basic_failback. */
2039         struct ctdb_public_ip_list *tmp_ip;
2040         int num_ips = 0;
2041         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2042                 num_ips++;
2043         }
2044
2045         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2046
2047         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2048
2049         /* If we don't want IPs to fail back then don't rebalance IPs. */
2050         if (1 == ctdb->tunable.no_ip_failback) {
2051                 return;
2052         }
2053
2054         /* Now, try to make sure the ip adresses are evenly distributed
2055            across the nodes.
2056         */
2057         basic_failback(ctdb, ipflags, all_ips, num_ips);
2058 }
2059
2060 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2061                           struct ctdb_ipflags *ipflags,
2062                           struct ctdb_public_ip_list *all_ips,
2063                           uint32_t *force_rebalance_nodes)
2064 {
2065         uint32_t *lcp2_imbalances;
2066         bool *rebalance_candidates;
2067         int numnodes, num_rebalance_candidates, i;
2068
2069         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2070
2071         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2072
2073         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2074                   &lcp2_imbalances, &rebalance_candidates);
2075
2076         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2077
2078         /* If we don't want IPs to fail back then don't rebalance IPs. */
2079         if (1 == ctdb->tunable.no_ip_failback) {
2080                 goto finished;
2081         }
2082
2083         /* It is only worth continuing if we have suitable target
2084          * nodes to transfer IPs to.  This check is much cheaper than
2085          * continuing on...
2086          */
2087         numnodes = talloc_array_length(ipflags);
2088         num_rebalance_candidates = 0;
2089         for (i=0; i<numnodes; i++) {
2090                 if (rebalance_candidates[i]) {
2091                         num_rebalance_candidates++;
2092                 }
2093         }
2094         if (num_rebalance_candidates == 0) {
2095                 goto finished;
2096         }
2097
2098         /* Now, try to make sure the ip adresses are evenly distributed
2099            across the nodes.
2100         */
2101         lcp2_failback(ctdb, ipflags, all_ips,
2102                       lcp2_imbalances, rebalance_candidates);
2103
2104 finished:
2105         talloc_free(tmp_ctx);
2106 }
2107
2108 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2109 {
2110         int i;
2111
2112         for (i=0;i<nodemap->num;i++) {
2113                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2114                         /* Found one completely healthy node */
2115                         return false;
2116                 }
2117         }
2118
2119         return true;
2120 }
2121
2122 /* The calculation part of the IP allocation algorithm. */
2123 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2124                                    struct ctdb_ipflags *ipflags,
2125                                    struct ctdb_public_ip_list **all_ips_p,
2126                                    uint32_t *force_rebalance_nodes)
2127 {
2128         /* since nodes only know about those public addresses that
2129            can be served by that particular node, no single node has
2130            a full list of all public addresses that exist in the cluster.
2131            Walk over all node structures and create a merged list of
2132            all public addresses that exist in the cluster.
2133
2134            keep the tree of ips around as ctdb->ip_tree
2135         */
2136         *all_ips_p = create_merged_ip_list(ctdb);
2137
2138         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2139                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2140         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2141                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2142         } else {
2143                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2144         }
2145
2146         /* at this point ->pnn is the node which will own each IP
2147            or -1 if there is no node that can cover this ip
2148         */
2149
2150         return;
2151 }
2152
/* State shared between get_tunable_from_nodes() and its callbacks. */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages */
        const char *tunable;
        /* talloc array indexed by PNN that receives the values */
        uint32_t *out;
        /* Set by the callbacks when the results must be discarded */
        bool fatal;
};
2158
2159 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2160                                  int32_t res, TDB_DATA outdata,
2161                                  void *callback)
2162 {
2163         struct get_tunable_callback_data *cd =
2164                 (struct get_tunable_callback_data *)callback;
2165         int size;
2166
2167         if (res != 0) {
2168                 /* Already handled in fail callback */
2169                 return;
2170         }
2171
2172         if (outdata.dsize != sizeof(uint32_t)) {
2173                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2174                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2175                                  (int)outdata.dsize));
2176                 cd->fatal = true;
2177                 return;
2178         }
2179
2180         size = talloc_array_length(cd->out);
2181         if (pnn >= size) {
2182                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2183                                  cd->tunable, pnn, size));
2184                 return;
2185         }
2186
2187                 
2188         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2189 }
2190
2191 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2192                                        int32_t res, TDB_DATA outdata,
2193                                        void *callback)
2194 {
2195         struct get_tunable_callback_data *cd =
2196                 (struct get_tunable_callback_data *)callback;
2197
2198         switch (res) {
2199         case -ETIME:
2200                 DEBUG(DEBUG_ERR,
2201                       ("Timed out getting tunable \"%s\" from node %d\n",
2202                        cd->tunable, pnn));
2203                 cd->fatal = true;
2204                 break;
2205         case -EINVAL:
2206         case -1:
2207                 DEBUG(DEBUG_WARNING,
2208                       ("Tunable \"%s\" not implemented on node %d\n",
2209                        cd->tunable, pnn));
2210                 break;
2211         default:
2212                 DEBUG(DEBUG_ERR,
2213                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2214                        cd->tunable, pnn));
2215                 cd->fatal = true;
2216         }
2217 }
2218
2219 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2220                                         TALLOC_CTX *tmp_ctx,
2221                                         struct ctdb_node_map *nodemap,
2222                                         const char *tunable,
2223                                         uint32_t default_value)
2224 {
2225         TDB_DATA data;
2226         struct ctdb_control_get_tunable *t;
2227         uint32_t *nodes;
2228         uint32_t *tvals;
2229         struct get_tunable_callback_data callback_data;
2230         int i;
2231
2232         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2233         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2234         for (i=0; i<nodemap->num; i++) {
2235                 tvals[i] = default_value;
2236         }
2237                 
2238         callback_data.out = tvals;
2239         callback_data.tunable = tunable;
2240         callback_data.fatal = false;
2241
2242         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2243         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2244         t = (struct ctdb_control_get_tunable *)data.dptr;
2245         t->length = strlen(tunable)+1;
2246         memcpy(t->name, tunable, t->length);
2247         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2248         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2249                                       nodes, 0, TAKEOVER_TIMEOUT(),
2250                                       false, data,
2251                                       get_tunable_callback,
2252                                       get_tunable_fail_callback,
2253                                       &callback_data) != 0) {
2254                 if (callback_data.fatal) {
2255                         talloc_free(tvals);
2256                         tvals = NULL;
2257                 }
2258         }
2259         talloc_free(nodes);
2260         talloc_free(data.dptr);
2261
2262         return tvals;
2263 }
2264
/* State shared between get_runstate_from_nodes() and its callbacks. */
struct get_runstate_callback_data {
        /* talloc array indexed by PNN that receives the runstates */
        enum ctdb_runstate *out;
        /* Set by the callbacks when the results must be discarded */
        bool fatal;
};
2269
2270 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2271                                   int32_t res, TDB_DATA outdata,
2272                                   void *callback_data)
2273 {
2274         struct get_runstate_callback_data *cd =
2275                 (struct get_runstate_callback_data *)callback_data;
2276         int size;
2277
2278         if (res != 0) {
2279                 /* Already handled in fail callback */
2280                 return;
2281         }
2282
2283         if (outdata.dsize != sizeof(uint32_t)) {
2284                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2285                                  pnn, (int)sizeof(uint32_t),
2286                                  (int)outdata.dsize));
2287                 cd->fatal = true;
2288                 return;
2289         }
2290
2291         size = talloc_array_length(cd->out);
2292         if (pnn >= size) {
2293                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2294                                  pnn, size));
2295                 return;
2296         }
2297
2298         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2299 }
2300
2301 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2302                                        int32_t res, TDB_DATA outdata,
2303                                        void *callback)
2304 {
2305         struct get_runstate_callback_data *cd =
2306                 (struct get_runstate_callback_data *)callback;
2307
2308         switch (res) {
2309         case -ETIME:
2310                 DEBUG(DEBUG_ERR,
2311                       ("Timed out getting runstate from node %d\n", pnn));
2312                 cd->fatal = true;
2313                 break;
2314         default:
2315                 DEBUG(DEBUG_WARNING,
2316                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2317                        pnn));
2318         }
2319 }
2320
2321 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2322                                                     TALLOC_CTX *tmp_ctx,
2323                                                     struct ctdb_node_map *nodemap,
2324                                                     enum ctdb_runstate default_value)
2325 {
2326         uint32_t *nodes;
2327         enum ctdb_runstate *rs;
2328         struct get_runstate_callback_data callback_data;
2329         int i;
2330
2331         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2332         CTDB_NO_MEMORY_NULL(ctdb, rs);
2333         for (i=0; i<nodemap->num; i++) {
2334                 rs[i] = default_value;
2335         }
2336
2337         callback_data.out = rs;
2338         callback_data.fatal = false;
2339
2340         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2341         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2342                                       nodes, 0, TAKEOVER_TIMEOUT(),
2343                                       true, tdb_null,
2344                                       get_runstate_callback,
2345                                       get_runstate_fail_callback,
2346                                       &callback_data) != 0) {
2347                 if (callback_data.fatal) {
2348                         free(rs);
2349                         rs = NULL;
2350                 }
2351         }
2352         talloc_free(nodes);
2353
2354         return rs;
2355 }
2356
2357 /* Set internal flags for IP allocation:
2358  *   Clear ip flags
2359  *   Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2360  *   Set NOIPHOST ip flag for each INACTIVE node
2361  *   if all nodes are disabled:
2362  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2363  *   else
2364  *     Set NOIPHOST ip flags for disabled nodes
2365  */
2366 static struct ctdb_ipflags *
2367 set_ipflags_internal(struct ctdb_context *ctdb,
2368                      TALLOC_CTX *tmp_ctx,
2369                      struct ctdb_node_map *nodemap,
2370                      uint32_t *tval_noiptakeover,
2371                      uint32_t *tval_noiphostonalldisabled,
2372                      enum ctdb_runstate *runstate)
2373 {
2374         int i;
2375         struct ctdb_ipflags *ipflags;
2376
2377         /* Clear IP flags - implicit due to talloc_zero */
2378         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2379         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2380
2381         for (i=0;i<nodemap->num;i++) {
2382                 /* Can not take IPs on node with NoIPTakeover set */
2383                 if (tval_noiptakeover[i] != 0) {
2384                         ipflags[i].noiptakeover = true;
2385                 }
2386
2387                 /* Can not host IPs on node not in RUNNING state */
2388                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2389                         ipflags[i].noiphost = true;
2390                         continue;
2391                 }
2392                 /* Can not host IPs on INACTIVE node */
2393                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2394                         ipflags[i].noiphost = true;
2395                 }
2396                 /* Remember the runstate */
2397                 ipflags[i].runstate = runstate[i];
2398         }
2399
2400         if (all_nodes_are_disabled(nodemap)) {
2401                 /* If all nodes are disabled, can not host IPs on node
2402                  * with NoIPHostOnAllDisabled set
2403                  */
2404                 for (i=0;i<nodemap->num;i++) {
2405                         if (tval_noiphostonalldisabled[i] != 0) {
2406                                 ipflags[i].noiphost = true;
2407                         }
2408                 }
2409         } else {
2410                 /* If some nodes are not disabled, then can not host
2411                  * IPs on DISABLED node
2412                  */
2413                 for (i=0;i<nodemap->num;i++) {
2414                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2415                                 ipflags[i].noiphost = true;
2416                         }
2417                 }
2418         }
2419
2420         return ipflags;
2421 }
2422
2423 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2424                                         TALLOC_CTX *tmp_ctx,
2425                                         struct ctdb_node_map *nodemap)
2426 {
2427         uint32_t *tval_noiptakeover;
2428         uint32_t *tval_noiphostonalldisabled;
2429         struct ctdb_ipflags *ipflags;
2430         enum ctdb_runstate *runstate;
2431
2432
2433         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2434                                                    "NoIPTakeover", 0);
2435         if (tval_noiptakeover == NULL) {
2436                 return NULL;
2437         }
2438
2439         tval_noiphostonalldisabled =
2440                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2441                                        "NoIPHostOnAllDisabled", 0);
2442         if (tval_noiphostonalldisabled == NULL) {
2443                 /* Caller frees tmp_ctx */
2444                 return NULL;
2445         }
2446
2447         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2448          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2449          * reasonable behaviour on a mixed cluster during upgrade.
2450          */
2451         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2452                                            CTDB_RUNSTATE_RUNNING);
2453         if (runstate == NULL) {
2454                 /* Caller frees tmp_ctx */
2455                 return NULL;
2456         }
2457
2458         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2459                                        tval_noiptakeover,
2460                                        tval_noiphostonalldisabled,
2461                                        runstate);
2462
2463         talloc_free(tval_noiptakeover);
2464         talloc_free(tval_noiphostonalldisabled);
2465         talloc_free(runstate);
2466
2467         return ipflags;
2468 }
2469
2470 struct iprealloc_callback_data {
2471         bool *retry_nodes;
2472         int retry_count;
2473         client_async_callback fail_callback;
2474         void *fail_callback_data;
2475         struct ctdb_node_map *nodemap;
2476 };
2477
2478 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2479                                         int32_t res, TDB_DATA outdata,
2480                                         void *callback)
2481 {
2482         int numnodes;
2483         struct iprealloc_callback_data *cd =
2484                 (struct iprealloc_callback_data *)callback;
2485
2486         numnodes = talloc_array_length(cd->retry_nodes);
2487         if (pnn > numnodes) {
2488                 DEBUG(DEBUG_ERR,
2489                       ("ipreallocated failure from node %d, "
2490                        "but only %d nodes in nodemap\n",
2491                        pnn, numnodes));
2492                 return;
2493         }
2494
2495         /* Can't run the "ipreallocated" event on a INACTIVE node */
2496         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2497                 DEBUG(DEBUG_WARNING,
2498                       ("ipreallocated failed on inactive node %d, ignoring\n",
2499                        pnn));
2500                 return;
2501         }
2502
2503         switch (res) {
2504         case -ETIME:
2505                 /* If the control timed out then that's a real error,
2506                  * so call the real fail callback
2507                  */
2508                 if (cd->fail_callback) {
2509                         cd->fail_callback(ctdb, pnn, res, outdata,
2510                                           cd->fail_callback_data);
2511                 } else {
2512                         DEBUG(DEBUG_WARNING,
2513                               ("iprealloc timed out but no callback registered\n"));
2514                 }
2515                 break;
2516         default:
2517                 /* If not a timeout then either the ipreallocated
2518                  * eventscript (or some setup) failed.  This might
2519                  * have failed because the IPREALLOCATED control isn't
2520                  * implemented - right now there is no way of knowing
2521                  * because the error codes are all folded down to -1.
2522                  * Consider retrying using EVENTSCRIPT control...
2523                  */
2524                 DEBUG(DEBUG_WARNING,
2525                       ("ipreallocated failure from node %d, flagging retry\n",
2526                        pnn));
2527                 cd->retry_nodes[pnn] = true;
2528                 cd->retry_count++;
2529         }
2530 }
2531
2532 struct takeover_callback_data {
2533         bool *node_failed;
2534         client_async_callback fail_callback;
2535         void *fail_callback_data;
2536         struct ctdb_node_map *nodemap;
2537 };
2538
2539 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2540                                        uint32_t node_pnn, int32_t res,
2541                                        TDB_DATA outdata, void *callback_data)
2542 {
2543         struct takeover_callback_data *cd =
2544                 talloc_get_type_abort(callback_data,
2545                                       struct takeover_callback_data);
2546         int i;
2547
2548         for (i = 0; i < cd->nodemap->num; i++) {
2549                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2550                         break;
2551                 }
2552         }
2553
2554         if (i == cd->nodemap->num) {
2555                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2556                 return;
2557         }
2558
2559         if (!cd->node_failed[i]) {
2560                 cd->node_failed[i] = true;
2561                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2562                                   cd->fail_callback_data);
2563         }
2564 }
2565
2566 /*
2567   make any IP alias changes for public addresses that are necessary 
2568  */
2569 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2570                       uint32_t *force_rebalance_nodes,
2571                       client_async_callback fail_callback, void *callback_data)
2572 {
2573         int i, j, ret;
2574         struct ctdb_public_ip ip;
2575         uint32_t *nodes;
2576         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2577         TDB_DATA data;
2578         struct timeval timeout;
2579         struct client_async_data *async_data;
2580         struct ctdb_client_control_state *state;
2581         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2582         struct ctdb_ipflags *ipflags;
2583         struct takeover_callback_data *takeover_data;
2584         struct iprealloc_callback_data iprealloc_data;
2585         bool *retry_data;
2586         bool can_host_ips;
2587
2588         /*
2589          * ip failover is completely disabled, just send out the 
2590          * ipreallocated event.
2591          */
2592         if (ctdb->tunable.disable_ip_failover != 0) {
2593                 goto ipreallocated;
2594         }
2595
2596         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2597         if (ipflags == NULL) {
2598                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2599                 talloc_free(tmp_ctx);
2600                 return -1;
2601         }
2602
2603         /* Short-circuit IP allocation if no nodes are in the RUNNING
2604          * runstate yet, since no nodes will be able to host IPs */
2605         can_host_ips = false;
2606         for (i=0; i<nodemap->num; i++) {
2607                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2608                         can_host_ips = true;
2609                 }
2610         }
2611         if (!can_host_ips) {
2612                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2613                 return 0;
2614         }
2615
2616         /* Do the IP reassignment calculations */
2617         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2618
2619         /* Now tell all nodes to release any public IPs should not
2620          * host.  This will be a NOOP on nodes that don't currently
2621          * hold the given IP.
2622          */
2623         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2624         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2625
2626         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2627                                                        bool, nodemap->num);
2628         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2629         takeover_data->fail_callback = fail_callback;
2630         takeover_data->fail_callback_data = callback_data;
2631         takeover_data->nodemap = nodemap;
2632
2633         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2634         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2635
2636         async_data->fail_callback = takeover_run_fail_callback;
2637         async_data->callback_data = takeover_data;
2638
2639         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2640
2641         /* Send a RELEASE_IP to all nodes that should not be hosting
2642          * each IP.  For each IP, all but one of these will be
2643          * redundant.  However, the redundant ones are used to tell
2644          * nodes which node should be hosting the IP so that commands
2645          * like "ctdb ip" can display a particular nodes idea of who
2646          * is hosting what. */
2647         for (i=0;i<nodemap->num;i++) {
2648                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2649                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2650                         continue;
2651                 }
2652
2653                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2654                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2655                                 /* This node should be serving this
2656                                    vnn so dont tell it to release the ip
2657                                 */
2658                                 continue;
2659                         }
2660                         ip.pnn  = tmp_ip->pnn;
2661                         ip.addr = tmp_ip->addr;
2662
2663                         timeout = TAKEOVER_TIMEOUT();
2664                         data.dsize = sizeof(ip);
2665                         data.dptr  = (uint8_t *)&ip;
2666                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2667                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2668                                                   data, async_data,
2669                                                   &timeout, NULL);
2670                         if (state == NULL) {
2671                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2672                                 talloc_free(tmp_ctx);
2673                                 return -1;
2674                         }
2675
2676                         ctdb_client_async_add(async_data, state);
2677                 }
2678         }
2679         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2680                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2681                 talloc_free(tmp_ctx);
2682                 return -1;
2683         }
2684         talloc_free(async_data);
2685
2686
2687         /* For each IP, send a TAKOVER_IP to the node that should be
2688          * hosting it.  Many of these will often be redundant (since
2689          * the allocation won't have changed) but they can be useful
2690          * to recover from inconsistencies. */
2691         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2692         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2693
2694         async_data->fail_callback = fail_callback;
2695         async_data->callback_data = callback_data;
2696
2697         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2698                 if (tmp_ip->pnn == -1) {
2699                         /* this IP won't be taken over */
2700                         continue;
2701                 }
2702
2703                 ip.pnn  = tmp_ip->pnn;
2704                 ip.addr = tmp_ip->addr;
2705
2706                 timeout = TAKEOVER_TIMEOUT();
2707                 data.dsize = sizeof(ip);
2708                 data.dptr  = (uint8_t *)&ip;
2709                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2710                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2711                                           data, async_data, &timeout, NULL);
2712                 if (state == NULL) {
2713                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2714                         talloc_free(tmp_ctx);
2715                         return -1;
2716                 }
2717
2718                 ctdb_client_async_add(async_data, state);
2719         }
2720         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2721                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2722                 talloc_free(tmp_ctx);
2723                 return -1;
2724         }
2725
2726 ipreallocated:
2727         /*
2728          * Tell all nodes to run eventscripts to process the
2729          * "ipreallocated" event.  This can do a lot of things,
2730          * including restarting services to reconfigure them if public
2731          * IPs have moved.  Once upon a time this event only used to
2732          * update natgw.
2733          */
2734         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2735         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2736         iprealloc_data.retry_nodes = retry_data;
2737         iprealloc_data.retry_count = 0;
2738         iprealloc_data.fail_callback = fail_callback;
2739         iprealloc_data.fail_callback_data = callback_data;
2740         iprealloc_data.nodemap = nodemap;
2741
2742         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2743         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2744                                         nodes, 0, TAKEOVER_TIMEOUT(),
2745                                         false, tdb_null,
2746                                         NULL, iprealloc_fail_callback,
2747                                         &iprealloc_data);
2748         if (ret != 0) {
2749                 /* If the control failed then we should retry to any
2750                  * nodes flagged by iprealloc_fail_callback using the
2751                  * EVENTSCRIPT control.  This is a best-effort at
2752                  * backward compatiblity when running a mixed cluster
2753                  * where some nodes have not yet been upgraded to
2754                  * support the IPREALLOCATED control.
2755                  */
2756                 DEBUG(DEBUG_WARNING,
2757                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2758
2759                 nodes = talloc_array(tmp_ctx, uint32_t,
2760                                      iprealloc_data.retry_count);
2761                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2762
2763                 j = 0;
2764                 for (i=0; i<nodemap->num; i++) {
2765                         if (iprealloc_data.retry_nodes[i]) {
2766                                 nodes[j] = i;
2767                                 j++;
2768                         }
2769                 }
2770
2771                 data.dptr  = discard_const("ipreallocated");
2772                 data.dsize = strlen((char *)data.dptr) + 1; 
2773                 ret = ctdb_client_async_control(ctdb,
2774                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2775                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2776                                                 false, data,
2777                                                 NULL, fail_callback,
2778                                                 callback_data);
2779                 if (ret != 0) {
2780                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2781                 }
2782         }
2783
2784         talloc_free(tmp_ctx);
2785         return ret;
2786 }
2787
2788
2789 /*
2790   destroy a ctdb_client_ip structure
2791  */
2792 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2793 {
2794         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2795                 ctdb_addr_to_str(&ip->addr),
2796                 ntohs(ip->addr.ip.sin_port),
2797                 ip->client_id));
2798
2799         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2800         return 0;
2801 }
2802
2803 /*
2804   called by a client to inform us of a TCP connection that it is managing
2805   that should tickled with an ACK when IP takeover is done
2806  */
2807 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2808                                 TDB_DATA indata)
2809 {
2810         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2811         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2812         struct ctdb_tcp_list *tcp;
2813         struct ctdb_tcp_connection t;
2814         int ret;
2815         TDB_DATA data;
2816         struct ctdb_client_ip *ip;
2817         struct ctdb_vnn *vnn;
2818         ctdb_sock_addr addr;
2819
2820         /* If we don't have public IPs, tickles are useless */
2821         if (ctdb->vnn == NULL) {
2822                 return 0;
2823         }
2824
2825         tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2826
2827         addr = tcp_sock->src;
2828         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2829         addr = tcp_sock->dest;
2830         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2831
2832         ZERO_STRUCT(addr);
2833         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2834         vnn = find_public_ip_vnn(ctdb, &addr);
2835         if (vnn == NULL) {
2836                 switch (addr.sa.sa_family) {
2837                 case AF_INET:
2838                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2839                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2840                                         ctdb_addr_to_str(&addr)));
2841                         }
2842                         break;
2843                 case AF_INET6:
2844                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2845                                 ctdb_addr_to_str(&addr)));
2846                         break;
2847                 default:
2848                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2849                 }
2850
2851                 return 0;
2852         }
2853
2854         if (vnn->pnn != ctdb->pnn) {
2855                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2856                         ctdb_addr_to_str(&addr),
2857                         client_id, client->pid));
2858                 /* failing this call will tell smbd to die */
2859                 return -1;
2860         }
2861
2862         ip = talloc(client, struct ctdb_client_ip);
2863         CTDB_NO_MEMORY(ctdb, ip);
2864
2865         ip->ctdb      = ctdb;
2866         ip->addr      = addr;
2867         ip->client_id = client_id;
2868         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2869         DLIST_ADD(ctdb->client_ip_list, ip);
2870
2871         tcp = talloc(client, struct ctdb_tcp_list);
2872         CTDB_NO_MEMORY(ctdb, tcp);
2873
2874         tcp->connection.src_addr = tcp_sock->src;
2875         tcp->connection.dst_addr = tcp_sock->dest;
2876
2877         DLIST_ADD(client->tcp_list, tcp);
2878
2879         t.src_addr = tcp_sock->src;
2880         t.dst_addr = tcp_sock->dest;
2881
2882         data.dptr = (uint8_t *)&t;
2883         data.dsize = sizeof(t);
2884
2885         switch (addr.sa.sa_family) {
2886         case AF_INET:
2887                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2888                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2889                         ctdb_addr_to_str(&tcp_sock->src),
2890                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2891                 break;
2892         case AF_INET6:
2893                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2894                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2895                         ctdb_addr_to_str(&tcp_sock->src),
2896                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2897                 break;
2898         default:
2899                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2900         }
2901
2902
2903         /* tell all nodes about this tcp connection */
2904         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2905                                        CTDB_CONTROL_TCP_ADD,
2906                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2907         if (ret != 0) {
2908                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2909                 return -1;
2910         }
2911
2912         return 0;
2913 }
2914
2915 /*
2916   find a tcp address on a list
2917  */
2918 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2919                                            struct ctdb_tcp_connection *tcp)
2920 {
2921         int i;
2922
2923         if (array == NULL) {
2924                 return NULL;
2925         }
2926
2927         for (i=0;i<array->num;i++) {
2928                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2929                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2930                         return &array->connections[i];
2931                 }
2932         }
2933         return NULL;
2934 }
2935
2936
2937
2938 /*
2939   called by a daemon to inform us of a TCP connection that one of its
2940   clients managing that should tickled with an ACK when IP takeover is
2941   done
2942  */
2943 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2944 {
2945         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2946         struct ctdb_tcp_array *tcparray;
2947         struct ctdb_tcp_connection tcp;
2948         struct ctdb_vnn *vnn;
2949
2950         /* If we don't have public IPs, tickles are useless */
2951         if (ctdb->vnn == NULL) {
2952                 return 0;
2953         }
2954
2955         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2956         if (vnn == NULL) {
2957                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2958                         ctdb_addr_to_str(&p->dst_addr)));
2959
2960                 return -1;
2961         }
2962
2963
2964         tcparray = vnn->tcp_array;
2965
2966         /* If this is the first tickle */
2967         if (tcparray == NULL) {
2968                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2969                 CTDB_NO_MEMORY(ctdb, tcparray);
2970                 vnn->tcp_array = tcparray;
2971
2972                 tcparray->num = 0;
2973                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2974                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2975
2976                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2977                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2978                 tcparray->num++;
2979
2980                 if (tcp_update_needed) {
2981                         vnn->tcp_update_needed = true;
2982                 }
2983                 return 0;
2984         }
2985
2986
2987         /* Do we already have this tickle ?*/
2988         tcp.src_addr = p->src_addr;
2989         tcp.dst_addr = p->dst_addr;
2990         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2991                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2992                         ctdb_addr_to_str(&tcp.dst_addr),
2993                         ntohs(tcp.dst_addr.ip.sin_port),
2994                         vnn->pnn));
2995                 return 0;
2996         }
2997
2998         /* A new tickle, we must add it to the array */
2999         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3000                                         struct ctdb_tcp_connection,
3001                                         tcparray->num+1);
3002         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3003
3004         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3005         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3006         tcparray->num++;
3007
3008         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3009                 ctdb_addr_to_str(&tcp.dst_addr),
3010                 ntohs(tcp.dst_addr.ip.sin_port),
3011                 vnn->pnn));
3012
3013         if (tcp_update_needed) {
3014                 vnn->tcp_update_needed = true;
3015         }
3016
3017         return 0;
3018 }
3019
3020
3021 /*
3022   called by a daemon to inform us of a TCP connection that one of its
3023   clients managing that should tickled with an ACK when IP takeover is
3024   done
3025  */
3026 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3027 {
3028         struct ctdb_tcp_connection *tcpp;
3029         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3030
3031         if (vnn == NULL) {
3032                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3033                         ctdb_addr_to_str(&conn->dst_addr)));
3034                 return;
3035         }
3036
3037         /* if the array is empty we cant remove it
3038            and we dont need to do anything
3039          */
3040         if (vnn->tcp_array == NULL) {
3041                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3042                         ctdb_addr_to_str(&conn->dst_addr),
3043                         ntohs(conn->dst_addr.ip.sin_port)));
3044                 return;
3045         }
3046
3047
3048         /* See if we know this connection
3049            if we dont know this connection  then we dont need to do anything
3050          */
3051         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3052         if (tcpp == NULL) {
3053                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3054                         ctdb_addr_to_str(&conn->dst_addr),
3055                         ntohs(conn->dst_addr.ip.sin_port)));
3056                 return;
3057         }
3058
3059
3060         /* We need to remove this entry from the array.
3061            Instead of allocating a new array and copying data to it
3062            we cheat and just copy the last entry in the existing array
3063            to the entry that is to be removed and just shring the 
3064            ->num field
3065          */
3066         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3067         vnn->tcp_array->num--;
3068
3069         /* If we deleted the last entry we also need to remove the entire array
3070          */
3071         if (vnn->tcp_array->num == 0) {
3072                 talloc_free(vnn->tcp_array);
3073                 vnn->tcp_array = NULL;
3074         }               
3075
3076         vnn->tcp_update_needed = true;
3077
3078         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3079                 ctdb_addr_to_str(&conn->src_addr),
3080                 ntohs(conn->src_addr.ip.sin_port)));
3081 }
3082
3083
3084 /*
3085   called by a daemon to inform us of a TCP connection that one of its
3086   clients used are no longer needed in the tickle database
3087  */
3088 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3089 {
3090         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3091
3092         /* If we don't have public IPs, tickles are useless */
3093         if (ctdb->vnn == NULL) {
3094                 return 0;
3095         }
3096
3097         ctdb_remove_tcp_connection(ctdb, conn);
3098
3099         return 0;
3100 }
3101
3102
3103 /*
3104   Called when another daemon starts - causes all tickles for all
3105   public addresses we are serving to be sent to the new node on the
3106   next check.  This actually causes the next scheduled call to
3107   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3108   doesn't require careful error handling.
3109  */
3110 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3111 {
3112         struct ctdb_vnn *vnn;
3113
3114         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3115                            (unsigned long) pnn));
3116
3117         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3118                 vnn->tcp_update_needed = true;
3119         }
3120
3121         return 0;
3122 }
3123
3124
3125 /*
3126   called when a client structure goes away - hook to remove
3127   elements from the tcp_list in all daemons
3128  */
3129 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3130 {
3131         while (client->tcp_list) {
3132                 struct ctdb_tcp_list *tcp = client->tcp_list;
3133                 DLIST_REMOVE(client->tcp_list, tcp);
3134                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3135         }
3136 }
3137
3138
3139 /*
3140   release all IPs on shutdown
3141  */
3142 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3143 {
3144         struct ctdb_vnn *vnn;
3145         int count = 0;
3146
3147         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3148                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3149                         ctdb_vnn_unassign_iface(ctdb, vnn);
3150                         continue;
3151                 }
3152                 if (!vnn->iface) {
3153                         continue;
3154                 }
3155
3156                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3157                                     ctdb_addr_to_str(&vnn->public_address),
3158                                     vnn->public_netmask_bits,
3159                                     ctdb_vnn_iface_string(vnn)));
3160
3161                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3162                                   ctdb_vnn_iface_string(vnn),
3163                                   ctdb_addr_to_str(&vnn->public_address),
3164                                   vnn->public_netmask_bits);
3165                 release_kill_clients(ctdb, &vnn->public_address);
3166                 ctdb_vnn_unassign_iface(ctdb, vnn);
3167                 count++;
3168         }
3169
3170         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3171 }
3172
3173
3174 /*
3175   get list of public IPs
3176  */
3177 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3178                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3179 {
3180         int i, num, len;
3181         struct ctdb_all_public_ips *ips;
3182         struct ctdb_vnn *vnn;
3183         bool only_available = false;
3184
3185         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3186                 only_available = true;
3187         }
3188
3189         /* count how many public ip structures we have */
3190         num = 0;
3191         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3192                 num++;
3193         }
3194
3195         len = offsetof(struct ctdb_all_public_ips, ips) + 
3196                 num*sizeof(struct ctdb_public_ip);
3197         ips = talloc_zero_size(outdata, len);
3198         CTDB_NO_MEMORY(ctdb, ips);
3199
3200         i = 0;
3201         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3202                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3203                         continue;
3204                 }
3205                 ips->ips[i].pnn  = vnn->pnn;
3206                 ips->ips[i].addr = vnn->public_address;
3207                 i++;
3208         }
3209         ips->num = i;
3210         len = offsetof(struct ctdb_all_public_ips, ips) +
3211                 i*sizeof(struct ctdb_public_ip);
3212
3213         outdata->dsize = len;
3214         outdata->dptr  = (uint8_t *)ips;
3215
3216         return 0;
3217 }
3218
3219
3220 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3221                                         struct ctdb_req_control *c,
3222                                         TDB_DATA indata,
3223                                         TDB_DATA *outdata)
3224 {
3225         int i, num, len;
3226         ctdb_sock_addr *addr;
3227         struct ctdb_control_public_ip_info *info;
3228         struct ctdb_vnn *vnn;
3229
3230         addr = (ctdb_sock_addr *)indata.dptr;
3231
3232         vnn = find_public_ip_vnn(ctdb, addr);
3233         if (vnn == NULL) {
3234                 /* if it is not a public ip   it could be our 'single ip' */
3235                 if (ctdb->single_ip_vnn) {
3236                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3237                                 vnn = ctdb->single_ip_vnn;
3238                         }
3239                 }
3240         }
3241         if (vnn == NULL) {
3242                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3243                                  "'%s'not a public address\n",
3244                                  ctdb_addr_to_str(addr)));
3245                 return -1;
3246         }
3247
3248         /* count how many public ip structures we have */
3249         num = 0;
3250         for (;vnn->ifaces[num];) {
3251                 num++;
3252         }
3253
3254         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3255                 num*sizeof(struct ctdb_control_iface_info);
3256         info = talloc_zero_size(outdata, len);
3257         CTDB_NO_MEMORY(ctdb, info);
3258
3259         info->ip.addr = vnn->public_address;
3260         info->ip.pnn = vnn->pnn;
3261         info->active_idx = 0xFFFFFFFF;
3262
3263         for (i=0; vnn->ifaces[i]; i++) {
3264                 struct ctdb_iface *cur;
3265
3266                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3267                 if (cur == NULL) {
3268                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3269                                            vnn->ifaces[i]));
3270                         return -1;
3271                 }
3272                 if (vnn->iface == cur) {
3273                         info->active_idx = i;
3274                 }
3275                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3276                 info->ifaces[i].link_state = cur->link_up;
3277                 info->ifaces[i].references = cur->references;
3278         }
3279         info->num = i;
3280         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3281                 i*sizeof(struct ctdb_control_iface_info);
3282
3283         outdata->dsize = len;
3284         outdata->dptr  = (uint8_t *)info;
3285
3286         return 0;
3287 }
3288
3289 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3290                                 struct ctdb_req_control *c,
3291                                 TDB_DATA *outdata)
3292 {
3293         int i, num, len;
3294         struct ctdb_control_get_ifaces *ifaces;
3295         struct ctdb_iface *cur;
3296
3297         /* count how many public ip structures we have */
3298         num = 0;
3299         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3300                 num++;
3301         }
3302
3303         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3304                 num*sizeof(struct ctdb_control_iface_info);
3305         ifaces = talloc_zero_size(outdata, len);
3306         CTDB_NO_MEMORY(ctdb, ifaces);
3307
3308         i = 0;
3309         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3310                 strcpy(ifaces->ifaces[i].name, cur->name);
3311                 ifaces->ifaces[i].link_state = cur->link_up;
3312                 ifaces->ifaces[i].references = cur->references;
3313                 i++;
3314         }
3315         ifaces->num = i;
3316         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3317                 i*sizeof(struct ctdb_control_iface_info);
3318
3319         outdata->dsize = len;
3320         outdata->dptr  = (uint8_t *)ifaces;
3321
3322         return 0;
3323 }
3324
3325 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3326                                     struct ctdb_req_control *c,
3327                                     TDB_DATA indata)
3328 {
3329         struct ctdb_control_iface_info *info;
3330         struct ctdb_iface *iface;
3331         bool link_up = false;
3332
3333         info = (struct ctdb_control_iface_info *)indata.dptr;
3334
3335         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3336                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3337                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3338                                   len, len, info->name));
3339                 return -1;
3340         }
3341
3342         switch (info->link_state) {
3343         case 0:
3344                 link_up = false;
3345                 break;
3346         case 1:
3347                 link_up = true;
3348                 break;
3349         default:
3350                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3351                                   (unsigned int)info->link_state));
3352                 return -1;
3353         }
3354
3355         if (info->references != 0) {
3356                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3357                                   (unsigned int)info->references));
3358                 return -1;
3359         }
3360
3361         iface = ctdb_find_iface(ctdb, info->name);
3362         if (iface == NULL) {
3363                 return -1;
3364         }
3365
3366         if (link_up == iface->link_up) {
3367                 return 0;
3368         }
3369
3370         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3371               ("iface[%s] has changed it's link status %s => %s\n",
3372                iface->name,
3373                iface->link_up?"up":"down",
3374                link_up?"up":"down"));
3375
3376         iface->link_up = link_up;
3377         return 0;
3378 }
3379
3380
3381 /* 
3382    structure containing the listening socket and the list of tcp connections
3383    that the ctdb daemon is to kill
3384 */
3385 struct ctdb_kill_tcp {
3386         struct ctdb_vnn *vnn;
3387         struct ctdb_context *ctdb;
3388         int capture_fd;
3389         struct fd_event *fde;
3390         trbt_tree_t *connections;
3391         void *private_data;
3392 };
3393
3394 /*
3395   a tcp connection that is to be killed
3396  */
3397 struct ctdb_killtcp_con {
3398         ctdb_sock_addr src_addr;
3399         ctdb_sock_addr dst_addr;
3400         int count;
3401         struct ctdb_kill_tcp *killtcp;
3402 };
3403
3404 /* this function is used to create a key to represent this socketpair
3405    in the killtcp tree.
3406    this key is used to insert and lookup matching socketpairs that are
3407    to be tickled and RST
3408 */
3409 #define KILLTCP_KEYLEN  10
3410 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3411 {
3412         static uint32_t key[KILLTCP_KEYLEN];
3413
3414         bzero(key, sizeof(key));
3415
3416         if (src->sa.sa_family != dst->sa.sa_family) {
3417                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3418                 return key;
3419         }
3420         
3421         switch (src->sa.sa_family) {
3422         case AF_INET:
3423                 key[0]  = dst->ip.sin_addr.s_addr;
3424                 key[1]  = src->ip.sin_addr.s_addr;
3425                 key[2]  = dst->ip.sin_port;
3426                 key[3]  = src->ip.sin_port;
3427                 break;
3428         case AF_INET6: {
3429                 uint32_t *dst6_addr32 =
3430                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3431                 uint32_t *src6_addr32 =
3432                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3433                 key[0]  = dst6_addr32[3];
3434                 key[1]  = src6_addr32[3];
3435                 key[2]  = dst6_addr32[2];
3436                 key[3]  = src6_addr32[2];
3437                 key[4]  = dst6_addr32[1];
3438                 key[5]  = src6_addr32[1];
3439                 key[6]  = dst6_addr32[0];
3440                 key[7]  = src6_addr32[0];
3441                 key[8]  = dst->ip6.sin6_port;
3442                 key[9]  = src->ip6.sin6_port;
3443                 break;
3444         }
3445         default:
3446                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3447                 return key;
3448         }
3449
3450         return key;
3451 }
3452
3453 /*
3454   called when we get a read event on the raw socket
3455  */
3456 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3457                                 uint16_t flags, void *private_data)
3458 {
3459         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3460         struct ctdb_killtcp_con *con;
3461         ctdb_sock_addr src, dst;
3462         uint32_t ack_seq, seq;
3463
3464         if (!(flags & EVENT_FD_READ)) {
3465                 return;
3466         }
3467
3468         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3469                                 killtcp->private_data,
3470                                 &src, &dst,
3471                                 &ack_seq, &seq) != 0) {
3472                 /* probably a non-tcp ACK packet */
3473                 return;
3474         }
3475
3476         /* check if we have this guy in our list of connections
3477            to kill
3478         */
3479         con = trbt_lookuparray32(killtcp->connections, 
3480                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3481         if (con == NULL) {
3482                 /* no this was some other packet we can just ignore */
3483                 return;
3484         }
3485
3486         /* This one has been tickled !
3487            now reset him and remove him from the list.
3488          */
3489         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3490                 ntohs(con->dst_addr.ip.sin_port),
3491                 ctdb_addr_to_str(&con->src_addr),
3492                 ntohs(con->src_addr.ip.sin_port)));
3493
3494         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3495         talloc_free(con);
3496 }
3497
3498
3499 /* when traversing the list of all tcp connections to send tickle acks to
3500    (so that we can capture the ack coming back and kill the connection
3501     by a RST)
3502    this callback is called for each connection we are currently trying to kill
3503 */
3504 static int tickle_connection_traverse(void *param, void *data)
3505 {
3506         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3507
3508         /* have tried too many times, just give up */
3509         if (con->count >= 5) {
3510                 /* can't delete in traverse: reparent to delete_cons */
3511                 talloc_steal(param, con);
3512                 return 0;
3513         }
3514
3515         /* othervise, try tickling it again */
3516         con->count++;
3517         ctdb_sys_send_tcp(
3518                 (ctdb_sock_addr *)&con->dst_addr,
3519                 (ctdb_sock_addr *)&con->src_addr,
3520                 0, 0, 0);
3521         return 0;
3522 }
3523
3524
3525 /* 
3526    called every second until all sentenced connections have been reset
3527  */
3528 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3529                                               struct timeval t, void *private_data)
3530 {
3531         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3532         void *delete_cons = talloc_new(NULL);
3533
3534         /* loop over all connections sending tickle ACKs */
3535         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3536
3537         /* now we've finished traverse, it's safe to do deletion. */
3538         talloc_free(delete_cons);
3539
3540         /* If there are no more connections to kill we can remove the
3541            entire killtcp structure
3542          */
3543         if ( (killtcp->connections == NULL) || 
3544              (killtcp->connections->root == NULL) ) {
3545                 talloc_free(killtcp);
3546                 return;
3547         }
3548
3549         /* try tickling them again in a seconds time
3550          */
3551         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3552                         ctdb_tickle_sentenced_connections, killtcp);
3553 }
3554
3555 /*
3556   destroy the killtcp structure
3557  */
3558 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3559 {
3560         struct ctdb_vnn *tmpvnn;
3561
3562         /* verify that this vnn is still active */
3563         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3564                 if (tmpvnn == killtcp->vnn) {
3565                         break;
3566                 }
3567         }
3568
3569         if (tmpvnn == NULL) {
3570                 return 0;
3571         }
3572
3573         if (killtcp->vnn->killtcp != killtcp) {
3574                 return 0;
3575         }
3576
3577         killtcp->vnn->killtcp = NULL;
3578
3579         return 0;
3580 }
3581
3582
/* Insert callback for the killtcp tree: always answer with the new
   connection structure ('parm'), so that it unconditionally replaces
   whatever was stored under the key before ('data').

   The old structure, if there was one, is deliberately not freed
   here: per the original author's note it has been talloc_stolen by
   the same tree node and goes away together with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;     /* previous entry is intentionally ignored */

        return parm;
}
3594
3595 /*
3596   add a tcp socket to the list of connections we want to RST
3597  */
3598 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3599                                        ctdb_sock_addr *s,
3600                                        ctdb_sock_addr *d)
3601 {
3602         ctdb_sock_addr src, dst;
3603         struct ctdb_kill_tcp *killtcp;
3604         struct ctdb_killtcp_con *con;
3605         struct ctdb_vnn *vnn;
3606
3607         ctdb_canonicalize_ip(s, &src);
3608         ctdb_canonicalize_ip(d, &dst);
3609
3610         vnn = find_public_ip_vnn(ctdb, &dst);
3611         if (vnn == NULL) {
3612                 vnn = find_public_ip_vnn(ctdb, &src);
3613         }
3614         if (vnn == NULL) {
3615                 /* if it is not a public ip   it could be our 'single ip' */
3616                 if (ctdb->single_ip_vnn) {
3617                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3618                                 vnn = ctdb->single_ip_vnn;
3619                         }
3620                 }
3621         }
3622         if (vnn == NULL) {
3623                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3624                 return -1;
3625         }
3626
3627         killtcp = vnn->killtcp;
3628         
3629         /* If this is the first connection to kill we must allocate
3630            a new structure
3631          */
3632         if (killtcp == NULL) {
3633                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3634                 CTDB_NO_MEMORY(ctdb, killtcp);
3635
3636                 killtcp->vnn         = vnn;
3637                 killtcp->ctdb        = ctdb;
3638                 killtcp->capture_fd  = -1;
3639                 killtcp->connections = trbt_create(killtcp, 0);
3640
3641                 vnn->killtcp         = killtcp;
3642                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3643         }
3644
3645
3646
3647         /* create a structure that describes this connection we want to
3648            RST and store it in killtcp->connections
3649         */
3650         con = talloc(killtcp, struct ctdb_killtcp_con);
3651         CTDB_NO_MEMORY(ctdb, con);
3652         con->src_addr = src;
3653         con->dst_addr = dst;
3654         con->count    = 0;
3655         con->killtcp  = killtcp;
3656
3657
3658         trbt_insertarray32_callback(killtcp->connections,
3659                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3660                         add_killtcp_callback, con);
3661
3662         /* 
3663            If we dont have a socket to listen on yet we must create it
3664          */
3665         if (killtcp->capture_fd == -1) {
3666                 const char *iface = ctdb_vnn_iface_string(vnn);
3667                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3668                 if (killtcp->capture_fd == -1) {
3669                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3670                                           "socket on iface '%s' for killtcp (%s)\n",
3671                                           iface, strerror(errno)));
3672                         goto failed;
3673                 }
3674         }
3675
3676
3677         if (killtcp->fde == NULL) {
3678                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3679                                             EVENT_FD_READ,
3680                                             capture_tcp_handler, killtcp);
3681                 tevent_fd_set_auto_close(killtcp->fde);
3682
3683                 /* We also need to set up some events to tickle all these connections
3684                    until they are all reset
3685                 */
3686                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3687                                 ctdb_tickle_sentenced_connections, killtcp);
3688         }
3689
3690         /* tickle him once now */
3691         ctdb_sys_send_tcp(
3692                 &con->dst_addr,
3693                 &con->src_addr,
3694                 0, 0, 0);
3695
3696         return 0;
3697
3698 failed:
3699         talloc_free(vnn->killtcp);
3700         vnn->killtcp = NULL;
3701         return -1;
3702 }
3703
3704 /*
3705   kill a TCP connection.
3706  */
3707 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3708 {
3709         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3710
3711         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3712 }
3713
3714 /*
3715   called by a daemon to inform us of the entire list of TCP tickles for
3716   a particular public address.
3717   this control should only be sent by the node that is currently serving
3718   that public address.
3719  */
3720 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3721 {
3722         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3723         struct ctdb_tcp_array *tcparray;
3724         struct ctdb_vnn *vnn;
3725
3726         /* We must at least have tickles.num or else we cant verify the size
3727            of the received data blob
3728          */
3729         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3730                                         tickles.connections)) {
3731                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3732                 return -1;
3733         }
3734
3735         /* verify that the size of data matches what we expect */
3736         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3737                                 tickles.connections)
3738                          + sizeof(struct ctdb_tcp_connection)
3739                                  * list->tickles.num) {
3740                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3741                 return -1;
3742         }
3743
3744         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3745                            ctdb_addr_to_str(&list->addr)));
3746
3747         vnn = find_public_ip_vnn(ctdb, &list->addr);
3748         if (vnn == NULL) {
3749                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3750                         ctdb_addr_to_str(&list->addr)));
3751
3752                 return 1;
3753         }
3754
3755         /* remove any old ticklelist we might have */
3756         talloc_free(vnn->tcp_array);
3757         vnn->tcp_array = NULL;
3758
3759         tcparray = talloc(vnn, struct ctdb_tcp_array);
3760         CTDB_NO_MEMORY(ctdb, tcparray);
3761
3762         tcparray->num = list->tickles.num;
3763
3764         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3765         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3766
3767         memcpy(tcparray->connections, &list->tickles.connections[0],
3768                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3769
3770         /* We now have a new fresh tickle list array for this vnn */
3771         vnn->tcp_array = tcparray;
3772
3773         return 0;
3774 }
3775
3776 /*
3777   called to return the full list of tickles for the puclic address associated 
3778   with the provided vnn
3779  */
3780 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3781 {
3782         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3783         struct ctdb_control_tcp_tickle_list *list;
3784         struct ctdb_tcp_array *tcparray;
3785         int num;
3786         struct ctdb_vnn *vnn;
3787
3788         vnn = find_public_ip_vnn(ctdb, addr);
3789         if (vnn == NULL) {
3790                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3791                         ctdb_addr_to_str(addr)));
3792
3793                 return 1;
3794         }
3795
3796         tcparray = vnn->tcp_array;
3797         if (tcparray) {
3798                 num = tcparray->num;
3799         } else {
3800                 num = 0;
3801         }
3802
3803         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3804                                 tickles.connections)
3805                         + sizeof(struct ctdb_tcp_connection) * num;
3806
3807         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3808         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3809         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3810
3811         list->addr = *addr;
3812         list->tickles.num = num;
3813         if (num) {
3814                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3815                         sizeof(struct ctdb_tcp_connection) * num);
3816         }
3817
3818         return 0;
3819 }
3820
3821
3822 /*
3823   set the list of all tcp tickles for a public address
3824  */
3825 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3826                                             ctdb_sock_addr *addr,
3827                                             struct ctdb_tcp_array *tcparray)
3828 {
3829         int ret, num;
3830         TDB_DATA data;
3831         struct ctdb_control_tcp_tickle_list *list;
3832
3833         if (tcparray) {
3834                 num = tcparray->num;
3835         } else {
3836                 num = 0;
3837         }
3838
3839         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3840                                 tickles.connections) +
3841                         sizeof(struct ctdb_tcp_connection) * num;
3842         data.dptr = talloc_size(ctdb, data.dsize);
3843         CTDB_NO_MEMORY(ctdb, data.dptr);
3844
3845         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3846         list->addr = *addr;
3847         list->tickles.num = num;
3848         if (tcparray) {
3849                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3850         }
3851
3852         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3853                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3854                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3855         if (ret != 0) {
3856                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3857                 return -1;
3858         }
3859
3860         talloc_free(data.dptr);
3861
3862         return ret;
3863 }
3864
3865
3866 /*
3867   perform tickle updates if required
3868  */
3869 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3870                                 struct timed_event *te, 
3871                                 struct timeval t, void *private_data)
3872 {
3873         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3874         int ret;
3875         struct ctdb_vnn *vnn;
3876
3877         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3878                 /* we only send out updates for public addresses that 
3879                    we have taken over
3880                  */
3881                 if (ctdb->pnn != vnn->pnn) {
3882                         continue;
3883                 }
3884                 /* We only send out the updates if we need to */
3885                 if (!vnn->tcp_update_needed) {
3886                         continue;
3887                 }
3888                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3889                                                        &vnn->public_address,
3890                                                        vnn->tcp_array);
3891                 if (ret != 0) {
3892                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3893                                 ctdb_addr_to_str(&vnn->public_address)));
3894                 } else {
3895                         DEBUG(DEBUG_INFO,
3896                               ("Sent tickle update for public address %s\n",
3897                                ctdb_addr_to_str(&vnn->public_address)));
3898                         vnn->tcp_update_needed = false;
3899                 }
3900         }
3901
3902         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3903                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3904                              ctdb_update_tcp_tickles, ctdb);
3905 }               
3906         
3907
3908 /*
3909   start periodic update of tcp tickles
3910  */
3911 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3912 {
3913         ctdb->tickle_update_context = talloc_new(ctdb);
3914
3915         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3916                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3917                              ctdb_update_tcp_tickles, ctdb);
3918 }
3919
3920
3921
3922
3923 struct control_gratious_arp {
3924         struct ctdb_context *ctdb;
3925         ctdb_sock_addr addr;
3926         const char *iface;
3927         int count;
3928 };
3929
3930 /*
3931   send a control_gratuitous arp
3932  */
3933 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3934                                   struct timeval t, void *private_data)
3935 {
3936         int ret;
3937         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3938                                                         struct control_gratious_arp);
3939
3940         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3941         if (ret != 0) {
3942                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3943                                  arp->iface, strerror(errno)));
3944         }
3945
3946
3947         arp->count++;
3948         if (arp->count == CTDB_ARP_REPEAT) {
3949                 talloc_free(arp);
3950                 return;
3951         }
3952
3953         event_add_timed(arp->ctdb->ev, arp, 
3954                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3955                         send_gratious_arp, arp);
3956 }
3957
3958
3959 /*
3960   send a gratious arp 
3961  */
3962 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3963 {
3964         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3965         struct control_gratious_arp *arp;
3966
3967         /* verify the size of indata */
3968         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3969                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3970                                  (unsigned)indata.dsize, 
3971                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3972                 return -1;
3973         }
3974         if (indata.dsize != 
3975                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3976                 + gratious_arp->len ) ){
3977
3978                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3979                         "but should be %u bytes\n", 
3980                          (unsigned)indata.dsize, 
3981                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3982                 return -1;
3983         }
3984
3985
3986         arp = talloc(ctdb, struct control_gratious_arp);
3987         CTDB_NO_MEMORY(ctdb, arp);
3988
3989         arp->ctdb  = ctdb;
3990         arp->addr   = gratious_arp->addr;
3991         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3992         CTDB_NO_MEMORY(ctdb, arp->iface);
3993         arp->count = 0;
3994         
3995         event_add_timed(arp->ctdb->ev, arp, 
3996                         timeval_zero(), send_gratious_arp, arp);
3997
3998         return 0;
3999 }
4000
4001 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4002 {
4003         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4004         int ret;
4005
4006         /* verify the size of indata */
4007         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4008                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4009                 return -1;
4010         }
4011         if (indata.dsize != 
4012                 ( offsetof(struct ctdb_control_ip_iface, iface)
4013                 + pub->len ) ){
4014
4015                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4016                         "but should be %u bytes\n", 
4017                          (unsigned)indata.dsize, 
4018                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4019                 return -1;
4020         }
4021
4022         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4023
4024         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4025
4026         if (ret != 0) {
4027                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4028                 return -1;
4029         }
4030
4031         return 0;
4032 }
4033
/* State handed to delete_ip_callback() when deleting a locally hosted IP. */
struct delete_ip_callback_state {
        /* the delete request that is answered once the release finishes */
        struct ctdb_req_control *c;
};
4037
4038 /*
4039   called when releaseip event finishes for del_public_address
4040  */
4041 static void delete_ip_callback(struct ctdb_context *ctdb,
4042                                int32_t status, TDB_DATA data,
4043                                const char *errormsg,
4044                                void *private_data)
4045 {
4046         struct delete_ip_callback_state *state =
4047                 talloc_get_type(private_data, struct delete_ip_callback_state);
4048
4049         /* If release failed then fail. */
4050         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4051         talloc_free(private_data);
4052 }
4053
4054 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4055                                         struct ctdb_req_control *c,
4056                                         TDB_DATA indata, bool *async_reply)
4057 {
4058         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4059         struct ctdb_vnn *vnn;
4060
4061         /* verify the size of indata */
4062         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4063                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4064                 return -1;
4065         }
4066         if (indata.dsize != 
4067                 ( offsetof(struct ctdb_control_ip_iface, iface)
4068                 + pub->len ) ){
4069
4070                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4071                         "but should be %u bytes\n", 
4072                          (unsigned)indata.dsize, 
4073                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4074                 return -1;
4075         }
4076
4077         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4078
4079         /* walk over all public addresses until we find a match */
4080         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4081                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4082                         if (vnn->pnn == ctdb->pnn) {
4083                                 struct delete_ip_callback_state *state;
4084                                 struct ctdb_public_ip *ip;
4085                                 TDB_DATA data;
4086                                 int ret;
4087
4088                                 vnn->delete_pending = true;
4089
4090                                 state = talloc(ctdb,
4091                                                struct delete_ip_callback_state);
4092                                 CTDB_NO_MEMORY(ctdb, state);
4093                                 state->c = c;
4094
4095                                 ip = talloc(state, struct ctdb_public_ip);
4096                                 if (ip == NULL) {
4097                                         DEBUG(DEBUG_ERR,
4098                                               (__location__ " Out of memory\n"));
4099                                         talloc_free(state);
4100                                         return -1;
4101                                 }
4102                                 ip->pnn = -1;
4103                                 ip->addr = pub->addr;
4104
4105                                 data.dsize = sizeof(struct ctdb_public_ip);
4106                                 data.dptr = (unsigned char *)ip;
4107
4108                                 ret = ctdb_daemon_send_control(ctdb,
4109                                                                ctdb_get_pnn(ctdb),
4110                                                                0,
4111                                                                CTDB_CONTROL_RELEASE_IP,
4112                                                                0, 0,
4113                                                                data,
4114                                                                delete_ip_callback,
4115                                                                state);
4116                                 if (ret == -1) {
4117                                         DEBUG(DEBUG_ERR,
4118                                               (__location__ "Unable to send "
4119                                                "CTDB_CONTROL_RELEASE_IP\n"));
4120                                         talloc_free(state);
4121                                         return -1;
4122                                 }
4123
4124                                 state->c = talloc_steal(state, c);
4125                                 *async_reply = true;
4126                         } else {
4127                                 /* This IP is not hosted on the
4128                                  * current node so just delete it
4129                                  * now. */
4130                                 do_delete_ip(ctdb, vnn);
4131                         }
4132
4133                         return 0;
4134                 }
4135         }
4136
4137         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4138                          ctdb_addr_to_str(&pub->addr)));
4139         return -1;
4140 }
4141
4142
/* State handed to ctdb_ipreallocated_callback(). */
struct ipreallocated_callback_state {
        /* the request that is answered once the event script has finished */
        struct ctdb_req_control *c;
};
4146
4147 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4148                                         int status, void *p)
4149 {
4150         struct ipreallocated_callback_state *state =
4151                 talloc_get_type(p, struct ipreallocated_callback_state);
4152
4153         if (status != 0) {
4154                 DEBUG(DEBUG_ERR,
4155                       (" \"ipreallocated\" event script failed (status %d)\n",
4156                        status));
4157                 if (status == -ETIME) {
4158                         ctdb_ban_self(ctdb);
4159                 }
4160         }
4161
4162         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4163         talloc_free(state);
4164 }
4165
4166 /* A control to run the ipreallocated event */
4167 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4168                                    struct ctdb_req_control *c,
4169                                    bool *async_reply)
4170 {
4171         int ret;
4172         struct ipreallocated_callback_state *state;
4173
4174         state = talloc(ctdb, struct ipreallocated_callback_state);
4175         CTDB_NO_MEMORY(ctdb, state);
4176
4177         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4178
4179         ret = ctdb_event_script_callback(ctdb, state,
4180                                          ctdb_ipreallocated_callback, state,
4181                                          CTDB_EVENT_IPREALLOCATED,
4182                                          "%s", "");
4183
4184         if (ret != 0) {
4185                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4186                 talloc_free(state);
4187                 return -1;
4188         }
4189
4190         /* tell the control that we will be reply asynchronously */
4191         state->c    = talloc_steal(state, c);
4192         *async_reply = true;
4193
4194         return 0;
4195 }
4196
4197
4198 /* This function is called from the recovery daemon to verify that a remote
4199    node has the expected ip allocation.
4200    This is verified against ctdb->ip_tree
4201 */
4202 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4203                                 struct ctdb_all_public_ips *ips,
4204                                 uint32_t pnn)
4205 {
4206         struct ctdb_public_ip_list *tmp_ip; 
4207         int i;
4208
4209         if (ctdb->ip_tree == NULL) {
4210                 /* dont know the expected allocation yet, assume remote node
4211                    is correct. */
4212                 return 0;
4213         }
4214
4215         if (ips == NULL) {
4216                 return 0;
4217         }
4218
4219         for (i=0; i<ips->num; i++) {
4220                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4221                 if (tmp_ip == NULL) {
4222                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4223                         return -1;
4224                 }
4225
4226                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4227                         continue;
4228                 }
4229
4230                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4231                         DEBUG(DEBUG_ERR,
4232                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4233                                pnn,
4234                                ctdb_addr_to_str(&ips->ips[i].addr),
4235                                ips->ips[i].pnn, tmp_ip->pnn));
4236                         return -1;
4237                 }
4238         }
4239
4240         return 0;
4241 }
4242
4243 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4244 {
4245         struct ctdb_public_ip_list *tmp_ip; 
4246
4247         if (ctdb->ip_tree == NULL) {
4248                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4249                 return -1;
4250         }
4251
4252         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4253         if (tmp_ip == NULL) {
4254                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4255                 return -1;
4256         }
4257
4258         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4259         tmp_ip->pnn = ip->pnn;
4260
4261         return 0;
4262 }
4263
4264
/* State of one in-progress "reload public ips" request. */
struct ctdb_reloadips_handle {
        /* daemon context */
        struct ctdb_context *ctdb;
        /* request to answer when the handle is destroyed; NULL if none */
        struct ctdb_req_control *c;
        /* status sent in that answer */
        int status;
        /* pipe to the child: the parent reads fd[0], the child writes fd[1] */
        int fd[2];
        /* pid of the forked child, killed when the handle is destroyed */
        pid_t child;
        /* read event on fd[0] */
        struct fd_event *fde;
};
4273
4274 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4275 {
4276         if (h == h->ctdb->reload_ips) {
4277                 h->ctdb->reload_ips = NULL;
4278         }
4279         if (h->c != NULL) {
4280                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4281                 h->c = NULL;
4282         }
4283         ctdb_kill(h->ctdb, h->child, SIGKILL);
4284         return 0;
4285 }
4286
4287 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4288                                 struct timed_event *te,
4289                                 struct timeval t, void *private_data)
4290 {
4291         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4292
4293         talloc_free(h);
4294 }       
4295
4296 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4297                              uint16_t flags, void *private_data)
4298 {
4299         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4300
4301         char res;
4302         int ret;
4303
4304         ret = sys_read(h->fd[0], &res, 1);
4305         if (ret < 1 || res != 0) {
4306                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4307                 res = 1;
4308         }
4309         h->status = res;
4310
4311         talloc_free(h);
4312 }
4313
4314 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4315 {
4316         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4317         struct ctdb_all_public_ips *ips;
4318         struct ctdb_vnn *vnn;
4319         struct client_async_data *async_data;
4320         struct timeval timeout;
4321         TDB_DATA data;
4322         struct ctdb_client_control_state *state;
4323         bool first_add;
4324         int i, ret;
4325
4326         CTDB_NO_MEMORY(ctdb, mem_ctx);
4327
4328         /* Read IPs from local node */
4329         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4330                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4331         if (ret != 0) {
4332                 DEBUG(DEBUG_ERR,
4333                       ("Unable to fetch public IPs from local node\n"));
4334                 talloc_free(mem_ctx);
4335                 return -1;
4336         }
4337
4338         /* Read IPs file - this is safe since this is a child process */
4339         ctdb->vnn = NULL;
4340         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4341                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4342                 talloc_free(mem_ctx);
4343                 return -1;
4344         }
4345
4346         async_data = talloc_zero(mem_ctx, struct client_async_data);
4347         CTDB_NO_MEMORY(ctdb, async_data);
4348
4349         /* Compare IPs between node and file for IPs to be deleted */
4350         for (i = 0; i < ips->num; i++) {
4351                 /* */
4352                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4353                         if (ctdb_same_ip(&vnn->public_address,
4354                                          &ips->ips[i].addr)) {
4355                                 /* IP is still in file */
4356                                 break;
4357                         }
4358                 }
4359
4360                 if (vnn == NULL) {
4361                         /* Delete IP ips->ips[i] */
4362                         struct ctdb_control_ip_iface *pub;
4363
4364                         DEBUG(DEBUG_NOTICE,
4365                               ("IP %s no longer configured, deleting it\n",
4366                                ctdb_addr_to_str(&ips->ips[i].addr)));
4367
4368                         pub = talloc_zero(mem_ctx,
4369                                           struct ctdb_control_ip_iface);
4370                         CTDB_NO_MEMORY(ctdb, pub);
4371
4372                         pub->addr  = ips->ips[i].addr;
4373                         pub->mask  = 0;
4374                         pub->len   = 0;
4375
4376                         timeout = TAKEOVER_TIMEOUT();
4377
4378                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4379                                               iface) + pub->len;
4380                         data.dptr = (uint8_t *)pub;
4381
4382                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4383                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4384                                                   0, data, async_data,
4385                                                   &timeout, NULL);
4386                         if (state == NULL) {
4387                                 DEBUG(DEBUG_ERR,
4388                                       (__location__
4389                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4390                                 goto failed;
4391                         }
4392
4393                         ctdb_client_async_add(async_data, state);
4394                 }
4395         }
4396
4397         /* Compare IPs between node and file for IPs to be added */
4398         first_add = true;
4399         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4400                 for (i = 0; i < ips->num; i++) {
4401                         if (ctdb_same_ip(&vnn->public_address,
4402                                          &ips->ips[i].addr)) {
4403                                 /* IP already on node */
4404                                 break;
4405                         }
4406                 }
4407                 if (i == ips->num) {
4408                         /* Add IP ips->ips[i] */
4409                         struct ctdb_control_ip_iface *pub;
4410                         const char *ifaces = NULL;
4411                         uint32_t len;
4412                         int iface = 0;
4413
4414                         DEBUG(DEBUG_NOTICE,
4415                               ("New IP %s configured, adding it\n",
4416                                ctdb_addr_to_str(&vnn->public_address)));
4417                         if (first_add) {
4418                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4419
4420                                 data.dsize = sizeof(pnn);
4421                                 data.dptr  = (uint8_t *)&pnn;
4422
4423                                 ret = ctdb_client_send_message(
4424                                         ctdb,
4425                                         CTDB_BROADCAST_CONNECTED,
4426                                         CTDB_SRVID_REBALANCE_NODE,
4427                                         data);
4428                                 if (ret != 0) {
4429                                         DEBUG(DEBUG_WARNING,
4430                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4431                                 }
4432
4433                                 first_add = false;
4434                         }
4435
4436                         ifaces = vnn->ifaces[0];
4437                         iface = 1;
4438                         while (vnn->ifaces[iface] != NULL) {
4439                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4440                                                          vnn->ifaces[iface]);
4441                                 iface++;
4442                         }
4443
4444                         len   = strlen(ifaces) + 1;
4445                         pub = talloc_zero_size(mem_ctx,
4446                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4447                         CTDB_NO_MEMORY(ctdb, pub);
4448
4449                         pub->addr  = vnn->public_address;
4450                         pub->mask  = vnn->public_netmask_bits;
4451                         pub->len   = len;
4452                         memcpy(&pub->iface[0], ifaces, pub->len);
4453
4454                         timeout = TAKEOVER_TIMEOUT();
4455
4456                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4457                                               iface) + pub->len;
4458                         data.dptr = (uint8_t *)pub;
4459
4460                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4461                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4462                                                   0, data, async_data,
4463                                                   &timeout, NULL);
4464                         if (state == NULL) {
4465                                 DEBUG(DEBUG_ERR,
4466                                       (__location__
4467                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4468                                 goto failed;
4469                         }
4470
4471                         ctdb_client_async_add(async_data, state);
4472                 }
4473         }
4474
4475         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4476                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4477                 goto failed;
4478         }
4479
4480         talloc_free(mem_ctx);
4481         return 0;
4482
4483 failed:
4484         talloc_free(mem_ctx);
4485         return -1;
4486 }
4487
4488 /* This control is sent to force the node to re-read the public addresses file
4489    and drop any addresses we should nnot longer host, and add new addresses
4490    that we are now able to host
4491 */
4492 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4493 {
4494         struct ctdb_reloadips_handle *h;
4495         pid_t parent = getpid();
4496
4497         if (ctdb->reload_ips != NULL) {
4498                 talloc_free(ctdb->reload_ips);
4499                 ctdb->reload_ips = NULL;
4500         }
4501
4502         h = talloc(ctdb, struct ctdb_reloadips_handle);
4503         CTDB_NO_MEMORY(ctdb, h);
4504         h->ctdb     = ctdb;
4505         h->c        = NULL;
4506         h->status   = -1;
4507         
4508         if (pipe(h->fd) == -1) {
4509                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4510                 talloc_free(h);
4511                 return -1;
4512         }
4513
4514         h->child = ctdb_fork(ctdb);
4515         if (h->child == (pid_t)-1) {
4516                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4517                 close(h->fd[0]);
4518                 close(h->fd[1]);
4519                 talloc_free(h);
4520                 return -1;
4521         }
4522
4523         /* child process */
4524         if (h->child == 0) {
4525                 signed char res = 0;
4526
4527                 close(h->fd[0]);
4528                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4529
4530                 ctdb_set_process_name("ctdb_reloadips");
4531                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4532                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4533                         res = -1;
4534                 } else {
4535                         res = ctdb_reloadips_child(ctdb);
4536                         if (res != 0) {
4537                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4538                         }
4539                 }
4540
4541                 sys_write(h->fd[1], &res, 1);
4542                 /* make sure we die when our parent dies */
4543                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4544                         sleep(5);
4545                 }
4546                 _exit(0);
4547         }
4548
4549         h->c             = talloc_steal(h, c);
4550
4551         close(h->fd[1]);
4552         set_close_on_exec(h->fd[0]);
4553
4554         talloc_set_destructor(h, ctdb_reloadips_destructor);
4555
4556
4557         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4558                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4559                         (void *)h);
4560         tevent_fd_set_auto_close(h->fde);
4561
4562         event_add_timed(ctdb->ev, h,
4563                         timeval_current_ofs(120, 0),
4564                         ctdb_reloadips_timeout_event, h);
4565
4566         /* we reply later */
4567         *async_reply = true;
4568         return 0;
4569 }