recoverd: Call takeover fail callback only once per node
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Absolute time "now + ctdb->tunable.takeover_timeout" (second argument 0).
 * Relies on a variable named "ctdb" being in scope where it is expanded. */
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
32
/* Gratuitous ARP pacing used by ctdb_control_send_arp(): the handler
 * re-arms itself with timeval_current_ofs(CTDB_ARP_INTERVAL, 100000) and
 * stops once it has run CTDB_ARP_REPEAT times. */
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT   3
35
36 /* Flags used in IP allocation algorithms. */
37 struct ctdb_ipflags {
        /* NOTE(review): presumably "this node must not take over
         * additional IPs" - not referenced in this part of the file,
         * confirm against the allocation code. */
38         bool noiptakeover;
        /* NOTE(review): presumably "this node must not host any IPs" -
         * confirm against the allocation code. */
39         bool noiphost;
40 };
41
/* One local network interface, kept on the ctdb->ifaces list. */
42 struct ctdb_iface {
        /* linkage for the DLIST_* macros (ctdb->ifaces) */
43         struct ctdb_iface *prev, *next;
        /* interface name; talloc child of this structure */
44         const char *name;
        /* only interfaces with link_up set are picked by
         * ctdb_vnn_best_iface() / ctdb_vnn_available() */
45         bool link_up;
        /* number of vnns currently assigned to this interface;
         * bumped in ctdb_vnn_assign_iface(), dropped in
         * ctdb_vnn_unassign_iface() */
46         uint32_t references;
47 };
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
/*
  Make sure an interface called "iface" is present on ctdb->ifaces.

  Returns 0 when the interface already exists or was added, and -1 when
  the name could not be duplicated (out of memory).  Failure to
  allocate the structure itself is handled by CTDB_NO_MEMORY_FATAL.
 */
static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
{
        struct ctdb_iface *i;
        char *name;

        /* Verify that we don't have an entry for this interface yet */
        for (i = ctdb->ifaces; i != NULL; i = i->next) {
                if (strcmp(i->name, iface) == 0) {
                        return 0;
                }
        }

        /* create a new structure for this interface */
        i = talloc_zero(ctdb, struct ctdb_iface);
        CTDB_NO_MEMORY_FATAL(ctdb, i);

        name = talloc_strdup(i, iface);
        if (name == NULL) {
                /*
                 * Do not leave a half-built, nameless structure
                 * hanging off ctdb: release it before reporting the
                 * allocation failure.
                 */
                talloc_free(i);
                CTDB_NO_MEMORY(ctdb, name);
                return -1;
        }
        i->name = name;

        /*
         * If link_up defaults to true then IPs can be allocated to a
         * node during the first recovery.  However, then an interface
         * could have its link marked down during the startup event,
         * causing the IP to move almost immediately.  If link_up
         * defaults to false then, during normal operation, IPs added
         * to a new interface can't be assigned until a monitor cycle
         * has occurred and marked the new interfaces up.  This makes
         * IP allocation unpredictable.  The following is a neat
         * compromise: early in startup link_up defaults to false, so
         * IPs can't be assigned, and after startup IPs can be
         * assigned immediately.
         */
        i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);

        DLIST_ADD(ctdb->ifaces, i);

        return 0;
}
93
/*
  Does the NULL-terminated vnn->ifaces name array contain "name"?
 */
static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
                                        const char *name)
{
        int idx = 0;

        while (vnn->ifaces[idx] != NULL) {
                if (strcmp(name, vnn->ifaces[idx]) == 0) {
                        return true;
                }
                idx++;
        }

        return false;
}
107
/* If any interfaces now have no possible IPs then delete them.  This
 * implementation is naive (i.e. simple) rather than clever
 * (i.e. complex).  Given that this is run on delip and that operation
 * is rare, this doesn't need to be efficient - it needs to be
 * foolproof.  One alternative is reference counting, where the logic
 * is distributed and can, therefore, be broken in multiple places.
 * Another alternative is to build a red-black tree of interfaces that
 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
 * once) and then walking ctdb->ifaces once and deleting those not in
 * the tree.  Let's go to one of those if the naive implementation
 * causes problems...  :-)
 *
 * Only interfaces named in "vnn" are candidates.  An interface is kept
 * if it is the first interface of ctdb->single_ip_vnn or if any vnn on
 * ctdb->vnn names it.  Orphans are unlinked from ctdb->ifaces and
 * reparented onto mem_ctx, so they go away when the caller frees
 * mem_ctx.
 */
static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
                                        struct ctdb_vnn *vnn,
                                        TALLOC_CTX *mem_ctx)
{
        struct ctdb_iface *i, *next;

        /* For each interface, check if there's an IP using it. */
        for (i = ctdb->ifaces; i != NULL; i = next) {
                struct ctdb_vnn *tv;
                bool found;

                /*
                 * Remember the successor now: DLIST_REMOVE() below
                 * resets the links of the element it unlinks, so
                 * following i->next afterwards would end the walk
                 * early and leave later orphans in place.
                 */
                next = i->next;

                /* Only consider interfaces named in the given VNN. */
                if (!vnn_has_interface_with_name(vnn, i->name)) {
                        continue;
                }

                /* Is the "single IP" on this interface? */
                if ((ctdb->single_ip_vnn != NULL) &&
                    (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
                    (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
                        /* Found, next interface please... */
                        continue;
                }

                /* Search for a vnn with this interface. */
                found = false;
                for (tv = ctdb->vnn; tv != NULL; tv = tv->next) {
                        if (vnn_has_interface_with_name(tv, i->name)) {
                                found = true;
                                break;
                        }
                }

                if (!found) {
                        /* None of the VNNs are using this interface. */
                        DLIST_REMOVE(ctdb->ifaces, i);
                        /* Caller will free mem_ctx when convenient. */
                        talloc_steal(mem_ctx, i);
                }
        }
}
160
161
/*
  Look up an interface on ctdb->ifaces by name; NULL if there is none.
 */
static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
                                          const char *iface)
{
        struct ctdb_iface *cur = ctdb->ifaces;

        while (cur != NULL && strcmp(cur->name, iface) != 0) {
                cur = cur->next;
        }

        return cur;
}
175
/*
  Pick the interface to use for a vnn: among the interfaces named in
  vnn->ifaces that are known to ctdb and have their link up, the one
  with the fewest references (the earliest listed wins a tie).
  Returns NULL when no such interface exists.
 */
static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
                                              struct ctdb_vnn *vnn)
{
        struct ctdb_iface *best = NULL;
        int n;

        for (n = 0; vnn->ifaces[n] != NULL; n++) {
                struct ctdb_iface *cand = ctdb_find_iface(ctdb, vnn->ifaces[n]);

                if (cand == NULL || !cand->link_up) {
                        continue;
                }

                if (best == NULL || cand->references < best->references) {
                        best = cand;
                }
        }

        return best;
}
207
/*
  Assign the best available interface to a vnn.

  If the vnn already has an interface nothing changes and 0 is
  returned.  Otherwise the interface chosen by ctdb_vnn_best_iface()
  gains a reference, becomes vnn->iface, and vnn->pnn is set to the
  local pnn.  Returns -1 when no usable interface exists.
 */
static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
                                     struct ctdb_vnn *vnn)
{
        struct ctdb_iface *best = NULL;

        if (vnn->iface) {
                DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
                                   "still assigned to iface '%s'\n",
                                   ctdb_addr_to_str(&vnn->public_address),
                                   ctdb_vnn_iface_string(vnn)));
                return 0;
        }

        best = ctdb_vnn_best_iface(ctdb, vnn);
        if (best == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
                                  "cannot assign to iface any iface\n",
                                  ctdb_addr_to_str(&vnn->public_address)));
                return -1;
        }

        vnn->iface = best;
        best->references++;
        vnn->pnn = ctdb->pnn;

        /* references is a uint32_t: print it as unsigned */
        DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
                           "now assigned to iface '%s' refs[%u]\n",
                           ctdb_addr_to_str(&vnn->public_address),
                           ctdb_vnn_iface_string(vnn),
                           (unsigned)best->references));
        return 0;
}
240
/*
  Detach a vnn from its interface.

  Logs the old assignment, drops the interface's reference (if there
  is an interface), clears vnn->iface and, when the vnn was recorded as
  held by this node, resets vnn->pnn to -1.
 */
static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
                                    struct ctdb_vnn *vnn)
{
        /* references is a uint32_t: print it as unsigned */
        DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
                           "now unassigned (old iface '%s' refs[%u])\n",
                           ctdb_addr_to_str(&vnn->public_address),
                           ctdb_vnn_iface_string(vnn),
                           (unsigned)(vnn->iface ? vnn->iface->references : 0)));
        if (vnn->iface) {
                vnn->iface->references--;
        }
        vnn->iface = NULL;
        if (vnn->pnn == ctdb->pnn) {
                vnn->pnn = -1;
        }
}
257
/*
  Can this vnn be hosted locally right now?  True when its assigned
  interface has link, or when any interface named in vnn->ifaces is
  known to ctdb and has link.
 */
static bool ctdb_vnn_available(struct ctdb_context *ctdb,
                               struct ctdb_vnn *vnn)
{
        int n;

        if (vnn->iface != NULL && vnn->iface->link_up) {
                return true;
        }

        for (n = 0; vnn->ifaces[n] != NULL; n++) {
                struct ctdb_iface *cand = ctdb_find_iface(ctdb, vnn->ifaces[n]);

                if (cand != NULL && cand->link_up) {
                        return true;
                }
        }

        return false;
}
282
/* State for one series of gratuitous ARPs / tickle ACKs; created in
 * ctdb_announce_vnn_iface() and consumed by ctdb_control_send_arp(). */
283 struct ctdb_takeover_arp {
        /* daemon context; supplies the event context for re-arming */
284         struct ctdb_context *ctdb;
        /* number of rounds sent so far; freed at CTDB_ARP_REPEAT */
285         uint32_t count;
        /* address being announced (copy of vnn->public_address) */
286         ctdb_sock_addr addr;
        /* connections to send tickle ACKs for; may be NULL */
287         struct ctdb_tcp_array *tcparray;
        /* vnn being announced; provides the interface name and the
         * takeover_ctx the timed events are attached to */
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
        /* list linkage - presumably for the DLIST_* macros; the users
         * are not in this part of the file */
296         struct ctdb_tcp_list *prev, *next;
        /* the tracked TCP connection */
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
        /* linkage on ctdb->client_ip_list (walked in release_kill_clients) */
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
        /* address the client registered; compared with the IP being released */
306         ctdb_sock_addr addr;
        /* id passed to ctdb_reqid_find() to locate the struct ctdb_client */
307         uint32_t client_id;
308 };
309
310
/*
  Timed-event handler: send one gratuitous ARP for arp->addr on the
  vnn's interface and one tickle ACK per recorded TCP connection.
  Frees the state after CTDB_ARP_REPEAT rounds, otherwise schedules
  itself again.
 */
static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
                                  struct timeval t, void *private_data)
{
        struct ctdb_takeover_arp *arp =
                talloc_get_type(private_data, struct ctdb_takeover_arp);
        const char *iface = ctdb_vnn_iface_string(arp->vnn);
        struct ctdb_tcp_array *conns;
        int n, ret;

        ret = ctdb_sys_send_arp(&arp->addr, iface);
        if (ret != 0) {
                DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
                                  iface, strerror(errno)));
        }

        conns = arp->tcparray;
        for (n = 0; conns != NULL && n < conns->num; n++) {
                struct ctdb_tcp_connection *tcon = &conns->connections[n];

                DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
                        (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
                        ctdb_addr_to_str(&tcon->src_addr),
                        (unsigned)ntohs(tcon->src_addr.ip.sin_port)));

                ret = ctdb_sys_send_tcp(&tcon->src_addr, &tcon->dst_addr,
                                        0, 0, 0);
                if (ret != 0) {
                        DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
                                ctdb_addr_to_str(&tcon->src_addr)));
                }
        }

        arp->count++;

        /* last round done: drop the state instead of re-arming */
        if (arp->count == CTDB_ARP_REPEAT) {
                talloc_free(arp);
                return;
        }

        event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
                        timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
                        ctdb_control_send_arp, arp);
}
361
/*
  Start announcing a vnn's address: allocate the ARP state under
  vnn->takeover_ctx (creating that context on first use), hand over
  the vnn's TCP connection array for tickling, and schedule
  ctdb_control_send_arp() to run immediately.
  Returns 0 on success, -1 if an allocation fails.
 */
static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
                                       struct ctdb_vnn *vnn)
{
        struct ctdb_takeover_arp *job;

        if (vnn->takeover_ctx == NULL) {
                vnn->takeover_ctx = talloc_new(vnn);
                if (vnn->takeover_ctx == NULL) {
                        return -1;
                }
        }

        job = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
        if (job == NULL) {
                return -1;
        }

        job->ctdb = ctdb;
        job->addr = vnn->public_address;
        job->vnn  = vnn;

        if (vnn->tcp_array != NULL) {
                /* the known tcp connections for this IP move to the
                   ARP state, which sends tickle acks for them */
                job->tcparray = talloc_steal(job, vnn->tcp_array);

                vnn->tcp_array = NULL;
                vnn->tcp_update_needed = true;
        }

        event_add_timed(job->ctdb->ev, vnn->takeover_ctx,
                        timeval_zero(), ctdb_control_send_arp, job);

        return 0;
}
399
/* Private data handed to release_ip_callback(). */
400 struct takeover_callback_state {
        /* NOTE(review): presumably the control request to reply to,
         * as in the takeip/updateip states - confirm in the callback */
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/* Private data for ctdb_do_takeip_callback(); allocated as a talloc
 * child of the vnn in ctdb_do_takeip(). */
406 struct ctdb_do_takeip_state {
        /* control request answered when the takeip event finishes */
407         struct ctdb_req_control *c;
        /* vnn being taken over; its update_in_flight flag is cleared
         * by the state's destructor */
408         struct ctdb_vnn *vnn;
409 };
410
/*
  Completion handler for the takeip event.

  On failure: ban ourselves if the event timed out, answer the control
  with the event status and flag the local node unhealthy.  On success:
  announce the address (only when do_checkpublicip is set), send a
  CTDB_SRVID_TAKE_IP message carrying the address string to the local
  node and answer the control with 0.  The state is freed on every
  path.
 */
static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
                                    void *private_data)
{
        struct ctdb_do_takeip_state *state =
                talloc_get_type(private_data, struct ctdb_do_takeip_state);
        TDB_DATA data;

        if (status != 0) {
                struct ctdb_node *self = ctdb->nodes[ctdb->pnn];

                if (status == -ETIME) {
                        ctdb_ban_self(ctdb);
                }
                DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
                                 ctdb_addr_to_str(&state->vnn->public_address),
                                 ctdb_vnn_iface_string(state->vnn)));
                ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);

                self->flags |= NODE_FLAGS_UNHEALTHY;
                talloc_free(state);
                return;
        }

        if (ctdb->do_checkpublicip &&
            ctdb_announce_vnn_iface(ctdb, state->vnn) != 0) {
                ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
                talloc_free(state);
                return;
        }

        data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
        data.dsize = strlen((char *)data.dptr) + 1;
        DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));

        ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);

        /* the control succeeded */
        ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
        talloc_free(state);
}
461
/*
  talloc destructor for the takeip state: the vnn no longer has an
  update in flight once the state goes away.
 */
static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
{
        struct ctdb_vnn *vnn = state->vnn;

        vnn->update_in_flight = false;
        return 0;
}
467
/*
  Take over an ip address.

  Refuses when another update for the vnn is already in flight.
  Otherwise assigns an interface, marks the update as in flight (reset
  by the state destructor) and runs the CTDB_EVENT_TAKE_IP event
  script with "<iface> <ip> <netmask bits>"; the control is answered
  from ctdb_do_takeip_callback().  Returns 0 once the event has been
  started, -1 on any failure.
 */
static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
                              struct ctdb_req_control *c,
                              struct ctdb_vnn *vnn)
{
        struct ctdb_do_takeip_state *state;
        int ret;

        if (vnn->update_in_flight) {
                DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
                                    "update for this IP already in flight\n",
                                    ctdb_addr_to_str(&vnn->public_address),
                                    vnn->public_netmask_bits));
                return -1;
        }

        if (ctdb_vnn_assign_iface(ctdb, vnn) != 0) {
                DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
                                 "assign a usable interface\n",
                                 ctdb_addr_to_str(&vnn->public_address),
                                 vnn->public_netmask_bits));
                return -1;
        }

        state = talloc(vnn, struct ctdb_do_takeip_state);
        CTDB_NO_MEMORY(ctdb, state);

        /* the request must outlive this call: reparent it onto ctdb */
        state->c   = talloc_steal(ctdb, c);
        state->vnn = vnn;

        vnn->update_in_flight = true;
        talloc_set_destructor(state, ctdb_takeip_destructor);

        DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
                            ctdb_addr_to_str(&vnn->public_address),
                            vnn->public_netmask_bits,
                            ctdb_vnn_iface_string(vnn)));

        ret = ctdb_event_script_callback(ctdb, state,
                                         ctdb_do_takeip_callback, state,
                                         false,
                                         CTDB_EVENT_TAKE_IP,
                                         "%s %s %u",
                                         ctdb_vnn_iface_string(vnn),
                                         ctdb_addr_to_str(&vnn->public_address),
                                         vnn->public_netmask_bits);
        if (ret == 0) {
                return 0;
        }

        DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
                ctdb_addr_to_str(&vnn->public_address),
                ctdb_vnn_iface_string(vnn)));
        talloc_free(state);
        return -1;
}
530
/* Private data for ctdb_do_updateip_callback(); allocated as a talloc
 * child of the vnn in ctdb_do_updateip(). */
531 struct ctdb_do_updateip_state {
        /* control request answered when the updateip event finishes */
532         struct ctdb_req_control *c;
        /* interface the address was on before the move; restored as
         * vnn->iface if the event fails */
533         struct ctdb_iface *old;
        /* vnn being moved */
534         struct ctdb_vnn *vnn;
535 };
536
/*
  Completion handler for the updateip event.

  On failure: ban ourselves if the event timed out, put the vnn back
  on its old interface and answer the control with the event status.
  On success: announce the address (only when do_checkpublicip is set)
  and answer the control with 0.  The state is freed on every path.
 */
static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
                                      void *private_data)
{
        struct ctdb_do_updateip_state *state =
                talloc_get_type(private_data, struct ctdb_do_updateip_state);

        if (status != 0) {
                if (status == -ETIME) {
                        ctdb_ban_self(ctdb);
                }
                DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
                        ctdb_addr_to_str(&state->vnn->public_address),
                        state->old->name,
                        ctdb_vnn_iface_string(state->vnn)));

                /*
                 * Best we can do: fall back to the old interface
                 * and leave the repair to the next run.
                 */
                ctdb_vnn_unassign_iface(ctdb, state->vnn);
                state->vnn->iface = state->old;
                state->vnn->iface->references++;

                ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
                talloc_free(state);
                return;
        }

        if (ctdb->do_checkpublicip &&
            ctdb_announce_vnn_iface(ctdb, state->vnn) != 0) {
                ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
                talloc_free(state);
                return;
        }

        /* the control succeeded */
        ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
        talloc_free(state);
}
585
/*
  talloc destructor for the updateip state: the vnn no longer has an
  update in flight once the state goes away.
 */
static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
{
        struct ctdb_vnn *vnn = state->vnn;

        vnn->update_in_flight = false;
        return 0;
}
591
/*
  Update (move) an ip address to the currently best interface.

  Refuses when another update for the vnn is already in flight.  The
  vnn is unassigned and reassigned; if it lands on an interface with
  the same name as before, the control is answered with success right
  away and no event script runs.  Otherwise the CTDB_EVENT_UPDATE_IP
  event script is started with "<old iface> <new iface> <ip>
  <netmask bits>" and the control is answered from
  ctdb_do_updateip_callback().  Returns 0 on success, -1 on failure.
 */
static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
                                struct ctdb_req_control *c,
                                struct ctdb_vnn *vnn)
{
        struct ctdb_iface *old = vnn->iface;
        struct ctdb_do_updateip_state *state;
        const char *new_name;
        int ret;

        if (vnn->update_in_flight) {
                DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
                                    "update for this IP already in flight\n",
                                    ctdb_addr_to_str(&vnn->public_address),
                                    vnn->public_netmask_bits));
                return -1;
        }

        ctdb_vnn_unassign_iface(ctdb, vnn);
        if (ctdb_vnn_assign_iface(ctdb, vnn) != 0) {
                DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
                                 "assin a usable interface (old iface '%s')\n",
                                 ctdb_addr_to_str(&vnn->public_address),
                                 vnn->public_netmask_bits,
                                 old->name));
                return -1;
        }

        new_name = ctdb_vnn_iface_string(vnn);
        if (old->name != NULL && new_name != NULL &&
            strcmp(old->name, new_name) == 0) {
                /* Moving an address onto the interface it is already
                 * on is harmless: skip the eventscripts and report
                 * success straight away.
                 */
                ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
                return 0;
        }

        state = talloc(vnn, struct ctdb_do_updateip_state);
        CTDB_NO_MEMORY(ctdb, state);

        /* the request must outlive this call: reparent it onto ctdb */
        state->c   = talloc_steal(ctdb, c);
        state->old = old;
        state->vnn = vnn;

        vnn->update_in_flight = true;
        talloc_set_destructor(state, ctdb_updateip_destructor);

        DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
                            "interface %s to %s\n",
                            ctdb_addr_to_str(&vnn->public_address),
                            vnn->public_netmask_bits,
                            old->name,
                            new_name));

        ret = ctdb_event_script_callback(ctdb, state,
                                         ctdb_do_updateip_callback, state,
                                         false,
                                         CTDB_EVENT_UPDATE_IP,
                                         "%s %s %s %u",
                                         state->old->name,
                                         new_name,
                                         ctdb_addr_to_str(&vnn->public_address),
                                         vnn->public_netmask_bits);
        if (ret == 0) {
                return 0;
        }

        DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
                         ctdb_addr_to_str(&vnn->public_address),
                         old->name, new_name));
        talloc_free(state);
        return -1;
}
671
/*
  Find the vnn on ctdb->vnn whose public address matches addr.
  Returns NULL if the address is not known as a public address.
 */
static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
{
        struct ctdb_vnn *cur = ctdb->vnn;

        while (cur != NULL && !ctdb_same_ip(&cur->public_address, addr)) {
                cur = cur->next;
        }

        return cur;
}
688
/*
  Control handler: take over an ip address.

  indata carries a struct ctdb_public_ip.  Depending on whether the
  kernel already has the address (checked only when do_checkpublicip
  is set) and on the state of the vnn's interface, this starts a
  takeip event, starts an updateip event, or does nothing.
  *async_reply is set to true only when an event was started and the
  reply will be sent later.  Returns 0 on success or when there is
  nothing to do, -1 on error.
 */
int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
                                 struct ctdb_req_control *c,
                                 TDB_DATA indata,
                                 bool *async_reply)
{
        struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
        struct ctdb_iface *best_iface;
        struct ctdb_vnn *vnn;
        bool have_ip = false;
        bool do_updateip = false;
        int ret;

        if (pip->pnn != ctdb->pnn) {
                DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
                                 "with pnn %d, but we're node %d\n",
                                 ctdb_addr_to_str(&pip->addr),
                                 pip->pnn, ctdb->pnn));
                return -1;
        }

        /* look the address up in our vnn list */
        vnn = find_public_ip_vnn(ctdb, &pip->addr);
        if (vnn == NULL) {
                DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
                        ctdb_addr_to_str(&pip->addr)));
                return 0;
        }

        if (ctdb->do_checkpublicip) {
                have_ip = ctdb_sys_have_ip(&pip->addr);
        }

        best_iface = ctdb_vnn_best_iface(ctdb, vnn);
        if (best_iface == NULL) {
                DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
                                 "a usable interface (old %s, have_ip %d)\n",
                                 ctdb_addr_to_str(&vnn->public_address),
                                 vnn->public_netmask_bits,
                                 ctdb_vnn_iface_string(vnn),
                                 have_ip));
                return -1;
        }

        /* best_iface is known to be non-NULL from here on */
        if (vnn->iface == NULL && vnn->pnn == -1 && have_ip) {
                DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
                have_ip = false;
        }

        if (have_ip && vnn->iface == NULL) {
                DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
                                 ctdb_addr_to_str(&vnn->public_address)));
                return 0;
        }

        if (have_ip && vnn->pnn != ctdb->pnn && vnn->pnn != -1) {
                DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
                                  "and we have it on iface[%s], but it was assigned to node %d"
                                  "and we are node %d, banning ourself\n",
                                 ctdb_addr_to_str(&vnn->public_address),
                                 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
                ctdb_ban_self(ctdb);
                return -1;
        }

        if (have_ip && vnn->pnn == -1) {
                vnn->pnn = ctdb->pnn;
                DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
                                  "and we already have it on iface[%s], update local daemon\n",
                                 ctdb_addr_to_str(&vnn->public_address),
                                  ctdb_vnn_iface_string(vnn)));
                return 0;
        }

        /*
         * A move to the best interface is wanted when the current one
         * has lost link, or when the rebalance actually gains
         * something.
         */
        if (vnn->iface != NULL && vnn->iface != best_iface) {
                do_updateip = !vnn->iface->link_up ||
                        (vnn->iface->references > (best_iface->references + 1));
        }

        if (!have_ip) {
                /* The kernel lacks the address: a full takeip replaces
                 * any pending move, starting from a clean assignment. */
                if (do_updateip) {
                        ctdb_vnn_unassign_iface(ctdb, vnn);
                }
                ret = ctdb_do_takeip(ctdb, c, vnn);
                if (ret != 0) {
                        return -1;
                }
        } else if (do_updateip) {
                ret = ctdb_do_updateip(ctdb, c, vnn);
                if (ret != 0) {
                        return -1;
                }
        } else {
                /*
                 * The interface is up and the kernel knows the ip
                 * => do nothing
                 */
                DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
                        ctdb_addr_to_str(&pip->addr),
                        vnn->public_netmask_bits,
                        ctdb_vnn_iface_string(vnn)));
                return 0;
        }

        /* tell ctdb_control.c that we will be replying asynchronously */
        *async_reply = true;

        return 0;
}
813
814 /*
815   takeover an ip address old v4 style
816  */
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
818                                 struct ctdb_req_control *c,
819                                 TDB_DATA indata, 
820                                 bool *async_reply)
821 {
822         TDB_DATA data;
823         
824         data.dsize = sizeof(struct ctdb_public_ip);
825         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826         CTDB_NO_MEMORY(ctdb, data.dptr);
827         
828         memcpy(data.dptr, indata.dptr, indata.dsize);
829         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
830 }
831
/*
  SIGKILL every client on ctdb->client_ip_list that is registered with
  the IP being released (entries whose client pid is 0 are skipped).
 */
static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
{
        struct ctdb_client_ip *entry;

        DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
                ctdb_addr_to_str(addr)));

        for (entry = ctdb->client_ip_list; entry != NULL; entry = entry->next) {
                ctdb_sock_addr entry_addr = entry->addr;
                struct ctdb_client *client;

                DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
                        entry->client_id,
                        ctdb_addr_to_str(&entry->addr)));

                if (!ctdb_same_ip(&entry_addr, addr)) {
                        continue;
                }

                client = ctdb_reqid_find(ctdb, entry->client_id,
                                         struct ctdb_client);
                DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
                        entry->client_id,
                        ctdb_addr_to_str(&entry->addr),
                        client->pid));

                if (client->pid == 0) {
                        continue;
                }

                DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
                        (unsigned)client->pid,
                        ctdb_addr_to_str(addr),
                        entry->client_id));
                ctdb_kill(ctdb, client->pid, SIGKILL);
        }
}
869
870 /*
871   called when releaseip event finishes
872  */
873 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
874                                 void *private_data)
875 {
876         struct takeover_callback_state *state = 
877                 talloc_get_type(private_data, struct takeover_callback_state);
878         TDB_DATA data;
879
880         if (status == -ETIME) {
881                 ctdb_ban_self(ctdb);
882         }
883
884         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
885                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
886                                   ctdb_addr_to_str(state->addr)));
887                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
888                 talloc_free(state);
889                 return;
890         }
891
892         /* send a message to all clients of this node telling them
893            that the cluster has been reconfigured and they should
894            release any sockets on this IP */
895         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
896         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
897         data.dsize = strlen((char *)data.dptr)+1;
898
899         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
900
901         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
902
903         /* kill clients that have registered with this IP */
904         release_kill_clients(ctdb, state->addr);
905
906         ctdb_vnn_unassign_iface(ctdb, state->vnn);
907
908         /* the control succeeded */
909         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
910         talloc_free(state);
911 }
912
913 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
914 {
915         state->vnn->update_in_flight = false;
916         return 0;
917 }
918
919 /*
920   release an ip address
921  */
922 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
923                                 struct ctdb_req_control *c,
924                                 TDB_DATA indata, 
925                                 bool *async_reply)
926 {
927         int ret;
928         struct takeover_callback_state *state;
929         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
930         struct ctdb_vnn *vnn;
931         char *iface;
932
933         /* update our vnn list */
934         vnn = find_public_ip_vnn(ctdb, &pip->addr);
935         if (vnn == NULL) {
936                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
937                         ctdb_addr_to_str(&pip->addr)));
938                 return 0;
939         }
940         vnn->pnn = pip->pnn;
941
942         /* stop any previous arps */
943         talloc_free(vnn->takeover_ctx);
944         vnn->takeover_ctx = NULL;
945
946         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
947          * lazy multicast to drop an IP from any node that isn't the
948          * intended new node.  The following causes makes ctdbd ignore
949          * a release for any address it doesn't host.
950          */
951         if (ctdb->do_checkpublicip) {
952                 if (!ctdb_sys_have_ip(&pip->addr)) {
953                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
954                                 ctdb_addr_to_str(&pip->addr),
955                                 vnn->public_netmask_bits,
956                                 ctdb_vnn_iface_string(vnn)));
957                         ctdb_vnn_unassign_iface(ctdb, vnn);
958                         return 0;
959                 }
960         } else {
961                 if (vnn->iface == NULL) {
962                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
963                                            ctdb_addr_to_str(&pip->addr),
964                                            vnn->public_netmask_bits));
965                         return 0;
966                 }
967         }
968
969         /* There is a potential race between take_ip and us because we
970          * update the VNN via a callback that run when the
971          * eventscripts have been run.  Avoid the race by allowing one
972          * update to be in flight at a time.
973          */
974         if (vnn->update_in_flight) {
975                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
976                                     "update for this IP already in flight\n",
977                                     ctdb_addr_to_str(&vnn->public_address),
978                                     vnn->public_netmask_bits));
979                 return -1;
980         }
981
982         if (ctdb->do_checkpublicip) {
983                 iface = ctdb_sys_find_ifname(&pip->addr);
984                 if (iface == NULL) {
985                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
986                         return 0;
987                 }
988                 if (vnn->iface == NULL) {
989                         DEBUG(DEBUG_WARNING,
990                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
991                                ctdb_addr_to_str(&pip->addr),
992                                iface));
993                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
994                         DEBUG(DEBUG_WARNING,
995                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
996                                ctdb_addr_to_str(&pip->addr),
997                                iface,
998                                ctdb_vnn_iface_string(vnn)));
999                         /* Should we fix vnn->iface?  If we do, what
1000                          * happens to reference counts?
1001                          */
1002                 }
1003         } else {
1004                 iface = strdup(ctdb_vnn_iface_string(vnn));
1005         }
1006
1007         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1008                 ctdb_addr_to_str(&pip->addr),
1009                 vnn->public_netmask_bits,
1010                 iface,
1011                 pip->pnn));
1012
1013         state = talloc(ctdb, struct takeover_callback_state);
1014         CTDB_NO_MEMORY(ctdb, state);
1015
1016         state->c = talloc_steal(state, c);
1017         state->addr = talloc(state, ctdb_sock_addr);       
1018         CTDB_NO_MEMORY(ctdb, state->addr);
1019         *state->addr = pip->addr;
1020         state->vnn   = vnn;
1021
1022         vnn->update_in_flight = true;
1023         talloc_set_destructor(state, ctdb_releaseip_destructor);
1024
1025         ret = ctdb_event_script_callback(ctdb, 
1026                                          state, release_ip_callback, state,
1027                                          false,
1028                                          CTDB_EVENT_RELEASE_IP,
1029                                          "%s %s %u",
1030                                          iface,
1031                                          ctdb_addr_to_str(&pip->addr),
1032                                          vnn->public_netmask_bits);
1033         free(iface);
1034         if (ret != 0) {
1035                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1036                         ctdb_addr_to_str(&pip->addr),
1037                         ctdb_vnn_iface_string(vnn)));
1038                 talloc_free(state);
1039                 return -1;
1040         }
1041
1042         /* tell the control that we will be reply asynchronously */
1043         *async_reply = true;
1044         return 0;
1045 }
1046
1047 /*
1048   release an ip address old v4 style
1049  */
1050 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1051                                 struct ctdb_req_control *c,
1052                                 TDB_DATA indata, 
1053                                 bool *async_reply)
1054 {
1055         TDB_DATA data;
1056         
1057         data.dsize = sizeof(struct ctdb_public_ip);
1058         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1059         CTDB_NO_MEMORY(ctdb, data.dptr);
1060         
1061         memcpy(data.dptr, indata.dptr, indata.dsize);
1062         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1063 }
1064
1065
1066 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1067                                    ctdb_sock_addr *addr,
1068                                    unsigned mask, const char *ifaces,
1069                                    bool check_address)
1070 {
1071         struct ctdb_vnn      *vnn;
1072         uint32_t num = 0;
1073         char *tmp;
1074         const char *iface;
1075         int i;
1076         int ret;
1077
1078         tmp = strdup(ifaces);
1079         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1080                 if (!ctdb_sys_check_iface_exists(iface)) {
1081                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1082                         free(tmp);
1083                         return -1;
1084                 }
1085         }
1086         free(tmp);
1087
1088         /* Verify that we dont have an entry for this ip yet */
1089         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1090                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1091                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1092                                 ctdb_addr_to_str(addr)));
1093                         return -1;
1094                 }               
1095         }
1096
1097         /* create a new vnn structure for this ip address */
1098         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1099         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1100         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1101         tmp = talloc_strdup(vnn, ifaces);
1102         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1103         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1104                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1105                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1106                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1107                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1108                 num++;
1109         }
1110         talloc_free(tmp);
1111         vnn->ifaces[num] = NULL;
1112         vnn->public_address      = *addr;
1113         vnn->public_netmask_bits = mask;
1114         vnn->pnn                 = -1;
1115         if (check_address) {
1116                 if (ctdb_sys_have_ip(addr)) {
1117                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1118                         vnn->pnn = ctdb->pnn;
1119                 }
1120         }
1121
1122         for (i=0; vnn->ifaces[i]; i++) {
1123                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1124                 if (ret != 0) {
1125                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1126                                            "for public_address[%s]\n",
1127                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1128                         talloc_free(vnn);
1129                         return -1;
1130                 }
1131         }
1132
1133         DLIST_ADD(ctdb->vnn, vnn);
1134
1135         return 0;
1136 }
1137
1138 /*
1139   setup the event script directory
1140 */
1141 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1142 {
1143         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1144         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1145         return 0;
1146 }
1147
1148 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1149                                   struct timeval t, void *private_data)
1150 {
1151         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1152                                                         struct ctdb_context);
1153         struct ctdb_vnn *vnn;
1154
1155         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1156                 int i;
1157
1158                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1159                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1160                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1161                                         vnn->ifaces[i],
1162                                         ctdb_addr_to_str(&vnn->public_address)));
1163                         }
1164                 }
1165         }
1166
1167         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1168                 timeval_current_ofs(30, 0), 
1169                 ctdb_check_interfaces_event, ctdb);
1170 }
1171
1172
1173 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1174 {
1175         if (ctdb->check_public_ifaces_ctx != NULL) {
1176                 talloc_free(ctdb->check_public_ifaces_ctx);
1177                 ctdb->check_public_ifaces_ctx = NULL;
1178         }
1179
1180         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1181         if (ctdb->check_public_ifaces_ctx == NULL) {
1182                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1183         }
1184
1185         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1186                 timeval_current_ofs(30, 0), 
1187                 ctdb_check_interfaces_event, ctdb);
1188
1189         return 0;
1190 }
1191
1192
1193 /*
1194   setup the public address lists from a file
1195 */
1196 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1197 {
1198         char **lines;
1199         int nlines;
1200         int i;
1201
1202         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1203         if (lines == NULL) {
1204                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1205                 return -1;
1206         }
1207         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1208                 nlines--;
1209         }
1210
1211         for (i=0;i<nlines;i++) {
1212                 unsigned mask;
1213                 ctdb_sock_addr addr;
1214                 const char *addrstr;
1215                 const char *ifaces;
1216                 char *tok, *line;
1217
1218                 line = lines[i];
1219                 while ((*line == ' ') || (*line == '\t')) {
1220                         line++;
1221                 }
1222                 if (*line == '#') {
1223                         continue;
1224                 }
1225                 if (strcmp(line, "") == 0) {
1226                         continue;
1227                 }
1228                 tok = strtok(line, " \t");
1229                 addrstr = tok;
1230                 tok = strtok(NULL, " \t");
1231                 if (tok == NULL) {
1232                         if (NULL == ctdb->default_public_interface) {
1233                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1234                                          i+1));
1235                                 talloc_free(lines);
1236                                 return -1;
1237                         }
1238                         ifaces = ctdb->default_public_interface;
1239                 } else {
1240                         ifaces = tok;
1241                 }
1242
1243                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1244                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1245                         talloc_free(lines);
1246                         return -1;
1247                 }
1248                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1249                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1250                         talloc_free(lines);
1251                         return -1;
1252                 }
1253         }
1254
1255
1256         talloc_free(lines);
1257         return 0;
1258 }
1259
1260 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1261                               const char *iface,
1262                               const char *ip)
1263 {
1264         struct ctdb_vnn *svnn;
1265         struct ctdb_iface *cur = NULL;
1266         bool ok;
1267         int ret;
1268
1269         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1270         CTDB_NO_MEMORY(ctdb, svnn);
1271
1272         svnn->ifaces = talloc_array(svnn, const char *, 2);
1273         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1274         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1275         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1276         svnn->ifaces[1] = NULL;
1277
1278         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1279         if (!ok) {
1280                 talloc_free(svnn);
1281                 return -1;
1282         }
1283
1284         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1285         if (ret != 0) {
1286                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1287                                    "for single_ip[%s]\n",
1288                                    svnn->ifaces[0],
1289                                    ctdb_addr_to_str(&svnn->public_address)));
1290                 talloc_free(svnn);
1291                 return -1;
1292         }
1293
1294         /* assume the single public ip interface is initially "good" */
1295         cur = ctdb_find_iface(ctdb, iface);
1296         if (cur == NULL) {
1297                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1298                 return -1;
1299         }
1300         cur->link_up = true;
1301
1302         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1303         if (ret != 0) {
1304                 talloc_free(svnn);
1305                 return -1;
1306         }
1307
1308         ctdb->single_ip_vnn = svnn;
1309         return 0;
1310 }
1311
1312 /* Given a physical node, return the number of
1313    public addresses that is currently assigned to this node.
1314 */
1315 static int node_ip_coverage(struct ctdb_context *ctdb, 
1316         int32_t pnn,
1317         struct ctdb_public_ip_list *ips)
1318 {
1319         int num=0;
1320
1321         for (;ips;ips=ips->next) {
1322                 if (ips->pnn == pnn) {
1323                         num++;
1324                 }
1325         }
1326         return num;
1327 }
1328
1329
1330 /* Can the given node host the given IP: is the public IP known to the
1331  * node and is NOIPHOST unset?
1332 */
1333 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1334                              struct ctdb_ipflags ipflags,
1335                              struct ctdb_public_ip_list *ip)
1336 {
1337         struct ctdb_all_public_ips *public_ips;
1338         int i;
1339
1340         if (ipflags.noiphost) {
1341                 return false;
1342         }
1343
1344         public_ips = ctdb->nodes[pnn]->available_public_ips;
1345
1346         if (public_ips == NULL) {
1347                 return false;
1348         }
1349
1350         for (i=0; i<public_ips->num; i++) {
1351                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1352                         /* yes, this node can serve this public ip */
1353                         return true;
1354                 }
1355         }
1356
1357         return false;
1358 }
1359
1360 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1361                                  struct ctdb_ipflags ipflags,
1362                                  struct ctdb_public_ip_list *ip)
1363 {
1364         if (ipflags.noiptakeover) {
1365                 return false;
1366         }
1367
1368         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1369 }
1370
1371 /* search the node lists list for a node to takeover this ip.
1372    pick the node that currently are serving the least number of ips
1373    so that the ips get spread out evenly.
1374 */
1375 static int find_takeover_node(struct ctdb_context *ctdb, 
1376                 struct ctdb_ipflags *ipflags,
1377                 struct ctdb_public_ip_list *ip,
1378                 struct ctdb_public_ip_list *all_ips)
1379 {
1380         int pnn, min=0, num;
1381         int i, numnodes;
1382
1383         numnodes = talloc_array_length(ipflags);
1384         pnn    = -1;
1385         for (i=0; i<numnodes; i++) {
1386                 /* verify that this node can serve this ip */
1387                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1388                         /* no it couldnt   so skip to the next node */
1389                         continue;
1390                 }
1391
1392                 num = node_ip_coverage(ctdb, i, all_ips);
1393                 /* was this the first node we checked ? */
1394                 if (pnn == -1) {
1395                         pnn = i;
1396                         min  = num;
1397                 } else {
1398                         if (num < min) {
1399                                 pnn = i;
1400                                 min  = num;
1401                         }
1402                 }
1403         }       
1404         if (pnn == -1) {
1405                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1406                         ctdb_addr_to_str(&ip->addr)));
1407
1408                 return -1;
1409         }
1410
1411         ip->pnn = pnn;
1412         return 0;
1413 }
1414
1415 #define IP_KEYLEN       4
1416 static uint32_t *ip_key(ctdb_sock_addr *ip)
1417 {
1418         static uint32_t key[IP_KEYLEN];
1419
1420         bzero(key, sizeof(key));
1421
1422         switch (ip->sa.sa_family) {
1423         case AF_INET:
1424                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1425                 break;
1426         case AF_INET6: {
1427                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1428                 key[0]  = htonl(s6_a32[0]);
1429                 key[1]  = htonl(s6_a32[1]);
1430                 key[2]  = htonl(s6_a32[2]);
1431                 key[3]  = htonl(s6_a32[3]);
1432                 break;
1433         }
1434         default:
1435                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1436                 return key;
1437         }
1438
1439         return key;
1440 }
1441
1442 static void *add_ip_callback(void *parm, void *data)
1443 {
1444         struct ctdb_public_ip_list *this_ip = parm; 
1445         struct ctdb_public_ip_list *prev_ip = data; 
1446
1447         if (prev_ip == NULL) {
1448                 return parm;
1449         }
1450         if (this_ip->pnn == -1) {
1451                 this_ip->pnn = prev_ip->pnn;
1452         }
1453
1454         return parm;
1455 }
1456
1457 static int getips_count_callback(void *param, void *data)
1458 {
1459         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1460         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1461
1462         new_ip->next = *ip_list;
1463         *ip_list     = new_ip;
1464         return 0;
1465 }
1466
1467 static struct ctdb_public_ip_list *
1468 create_merged_ip_list(struct ctdb_context *ctdb)
1469 {
1470         int i, j;
1471         struct ctdb_public_ip_list *ip_list;
1472         struct ctdb_all_public_ips *public_ips;
1473
1474         if (ctdb->ip_tree != NULL) {
1475                 talloc_free(ctdb->ip_tree);
1476                 ctdb->ip_tree = NULL;
1477         }
1478         ctdb->ip_tree = trbt_create(ctdb, 0);
1479
1480         for (i=0;i<ctdb->num_nodes;i++) {
1481                 public_ips = ctdb->nodes[i]->known_public_ips;
1482
1483                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1484                         continue;
1485                 }
1486
1487                 /* there were no public ips for this node */
1488                 if (public_ips == NULL) {
1489                         continue;
1490                 }               
1491
1492                 for (j=0;j<public_ips->num;j++) {
1493                         struct ctdb_public_ip_list *tmp_ip; 
1494
1495                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1496                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1497                         /* Do not use information about IP addresses hosted
1498                          * on other nodes, it may not be accurate */
1499                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1500                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1501                         } else {
1502                                 tmp_ip->pnn = -1;
1503                         }
1504                         tmp_ip->addr = public_ips->ips[j].addr;
1505                         tmp_ip->next = NULL;
1506
1507                         trbt_insertarray32_callback(ctdb->ip_tree,
1508                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1509                                 add_ip_callback,
1510                                 tmp_ip);
1511                 }
1512         }
1513
1514         ip_list = NULL;
1515         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1516
1517         return ip_list;
1518 }
1519
1520 /* 
1521  * This is the length of the longtest common prefix between the IPs.
1522  * It is calculated by XOR-ing the 2 IPs together and counting the
1523  * number of leading zeroes.  The implementation means that all
1524  * addresses end up being 128 bits long.
1525  *
1526  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1527  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1528  * lots of nodes and IP addresses?
1529  */
1530 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1531 {
1532         uint32_t ip1_k[IP_KEYLEN];
1533         uint32_t *t;
1534         int i;
1535         uint32_t x;
1536
1537         uint32_t distance = 0;
1538
1539         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1540         t = ip_key(ip2);
1541         for (i=0; i<IP_KEYLEN; i++) {
1542                 x = ip1_k[i] ^ t[i];
1543                 if (x == 0) {
1544                         distance += 32;
1545                 } else {
1546                         /* Count number of leading zeroes. 
1547                          * FIXME? This could be optimised...
1548                          */
1549                         while ((x & (1 << 31)) == 0) {
1550                                 x <<= 1;
1551                                 distance += 1;
1552                         }
1553                 }
1554         }
1555
1556         return distance;
1557 }
1558
1559 /* Calculate the IP distance for the given IP relative to IPs on the
1560    given node.  The ips argument is generally the all_ips variable
1561    used in the main part of the algorithm.
1562  */
1563 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1564                                   struct ctdb_public_ip_list *ips,
1565                                   int pnn)
1566 {
1567         struct ctdb_public_ip_list *t;
1568         uint32_t d;
1569
1570         uint32_t sum = 0;
1571
1572         for (t=ips; t != NULL; t=t->next) {
1573                 if (t->pnn != pnn) {
1574                         continue;
1575                 }
1576
1577                 /* Optimisation: We never calculate the distance
1578                  * between an address and itself.  This allows us to
1579                  * calculate the effect of removing an address from a
1580                  * node by simply calculating the distance between
1581                  * that address and all of the exitsing addresses.
1582                  * Moreover, we assume that we're only ever dealing
1583                  * with addresses from all_ips so we can identify an
1584                  * address via a pointer rather than doing a more
1585                  * expensive address comparison. */
1586                 if (&(t->addr) == ip) {
1587                         continue;
1588                 }
1589
1590                 d = ip_distance(ip, &(t->addr));
1591                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1592         }
1593
1594         return sum;
1595 }
1596
1597 /* Return the LCP2 imbalance metric for addresses currently assigned
1598    to the given node.
1599  */
1600 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1601 {
1602         struct ctdb_public_ip_list *t;
1603
1604         uint32_t imbalance = 0;
1605
1606         for (t=all_ips; t!=NULL; t=t->next) {
1607                 if (t->pnn != pnn) {
1608                         continue;
1609                 }
1610                 /* Pass the rest of the IPs rather than the whole
1611                    all_ips input list.
1612                 */
1613                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1614         }
1615
1616         return imbalance;
1617 }
1618
1619 /* Allocate any unassigned IPs just by looping through the IPs and
1620  * finding the best node for each.
1621  */
1622 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1623                                       struct ctdb_ipflags *ipflags,
1624                                       struct ctdb_public_ip_list *all_ips)
1625 {
1626         struct ctdb_public_ip_list *tmp_ip;
1627
1628         /* loop over all ip's and find a physical node to cover for 
1629            each unassigned ip.
1630         */
1631         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1632                 if (tmp_ip->pnn == -1) {
1633                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1634                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1635                                         ctdb_addr_to_str(&tmp_ip->addr)));
1636                         }
1637                 }
1638         }
1639 }
1640
1641 /* Basic non-deterministic rebalancing algorithm.
1642  */
1643 static void basic_failback(struct ctdb_context *ctdb,
1644                            struct ctdb_ipflags *ipflags,
1645                            struct ctdb_public_ip_list *all_ips,
1646                            int num_ips)
1647 {
1648         int i, numnodes;
1649         int maxnode, maxnum, minnode, minnum, num, retries;
1650         struct ctdb_public_ip_list *tmp_ip;
1651
1652         numnodes = talloc_array_length(ipflags);
1653         retries = 0;
1654
1655 try_again:
1656         maxnum=0;
1657         minnum=0;
1658
1659         /* for each ip address, loop over all nodes that can serve
1660            this ip and make sure that the difference between the node
1661            serving the most and the node serving the least ip's are
1662            not greater than 1.
1663         */
1664         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1665                 if (tmp_ip->pnn == -1) {
1666                         continue;
1667                 }
1668
1669                 /* Get the highest and lowest number of ips's served by any 
1670                    valid node which can serve this ip.
1671                 */
1672                 maxnode = -1;
1673                 minnode = -1;
1674                 for (i=0; i<numnodes; i++) {
1675                         /* only check nodes that can actually serve this ip */
1676                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1677                                 /* no it couldnt   so skip to the next node */
1678                                 continue;
1679                         }
1680
1681                         num = node_ip_coverage(ctdb, i, all_ips);
1682                         if (maxnode == -1) {
1683                                 maxnode = i;
1684                                 maxnum  = num;
1685                         } else {
1686                                 if (num > maxnum) {
1687                                         maxnode = i;
1688                                         maxnum  = num;
1689                                 }
1690                         }
1691                         if (minnode == -1) {
1692                                 minnode = i;
1693                                 minnum  = num;
1694                         } else {
1695                                 if (num < minnum) {
1696                                         minnode = i;
1697                                         minnum  = num;
1698                                 }
1699                         }
1700                 }
1701                 if (maxnode == -1) {
1702                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1703                                 ctdb_addr_to_str(&tmp_ip->addr)));
1704
1705                         continue;
1706                 }
1707
1708                 /* if the spread between the smallest and largest coverage by
1709                    a node is >=2 we steal one of the ips from the node with
1710                    most coverage to even things out a bit.
1711                    try to do this a limited number of times since we dont
1712                    want to spend too much time balancing the ip coverage.
1713                 */
1714                 if ( (maxnum > minnum+1)
1715                      && (retries < (num_ips + 5)) ){
1716                         struct ctdb_public_ip_list *tmp;
1717
1718                         /* Reassign one of maxnode's VNNs */
1719                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1720                                 if (tmp->pnn == maxnode) {
1721                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1722                                         retries++;
1723                                         goto try_again;;
1724                                 }
1725                         }
1726                 }
1727         }
1728 }
1729
1730 struct ctdb_rebalancenodes {
1731         struct ctdb_rebalancenodes *next;
1732         uint32_t pnn;
1733 };
1734 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1735
1736
1737 /* set this flag to force the node to be rebalanced even if it just didnt
1738    become healthy again.
1739 */
1740 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1741 {
1742         struct ctdb_rebalancenodes *rebalance;
1743
1744         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1745                 if (rebalance->pnn == pnn) {
1746                         return;
1747                 }
1748         }
1749
1750         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1751         rebalance->pnn = pnn;
1752         rebalance->next = force_rebalance_list;
1753         force_rebalance_list = rebalance;
1754 }
1755
1756 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1757  * that we can unit test it.
1758  */
1759 static void lcp2_init(struct ctdb_context *tmp_ctx,
1760                       struct ctdb_ipflags *ipflags,
1761                       struct ctdb_public_ip_list *all_ips,
1762                       uint32_t **lcp2_imbalances,
1763                       bool **rebalance_candidates)
1764 {
1765         int i, numnodes;
1766         struct ctdb_public_ip_list *tmp_ip;
1767
1768         numnodes = talloc_array_length(ipflags);
1769
1770         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1771         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1772         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1773         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1774
1775         for (i=0; i<numnodes; i++) {
1776                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1777                 /* First step: assume all nodes are candidates */
1778                 (*rebalance_candidates)[i] = true;
1779         }
1780
1781         /* 2nd step: if a node has IPs assigned then it must have been
1782          * healthy before, so we remove it from consideration.  This
1783          * is overkill but is all we have because we don't maintain
1784          * state between takeover runs.  An alternative would be to
1785          * keep state and invalidate it every time the recovery master
1786          * changes.
1787          */
1788         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1789                 if (tmp_ip->pnn != -1) {
1790                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1791                 }
1792         }
1793
1794         /* 3rd step: if a node is forced to re-balance then
1795            we allow failback onto the node */
1796         while (force_rebalance_list != NULL) {
1797                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1798
1799                 if (force_rebalance_list->pnn <= numnodes) {
1800                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1801                 }
1802
1803                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1804                 talloc_free(force_rebalance_list);
1805                 force_rebalance_list = next;
1806         }
1807 }
1808
1809 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1810  * the IP/node combination that will cost the least.
1811  */
1812 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1813                                      struct ctdb_ipflags *ipflags,
1814                                      struct ctdb_public_ip_list *all_ips,
1815                                      uint32_t *lcp2_imbalances)
1816 {
1817         struct ctdb_public_ip_list *tmp_ip;
1818         int dstnode, numnodes;
1819
1820         int minnode;
1821         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1822         struct ctdb_public_ip_list *minip;
1823
1824         bool should_loop = true;
1825         bool have_unassigned = true;
1826
1827         numnodes = talloc_array_length(ipflags);
1828
1829         while (have_unassigned && should_loop) {
1830                 should_loop = false;
1831
1832                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1833                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1834
1835                 minnode = -1;
1836                 mindsum = 0;
1837                 minip = NULL;
1838
1839                 /* loop over each unassigned ip. */
1840                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1841                         if (tmp_ip->pnn != -1) {
1842                                 continue;
1843                         }
1844
1845                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1846                                 /* only check nodes that can actually takeover this ip */
1847                                 if (!can_node_takeover_ip(ctdb, dstnode,
1848                                                           ipflags[dstnode],
1849                                                           tmp_ip)) {
1850                                         /* no it couldnt   so skip to the next node */
1851                                         continue;
1852                                 }
1853
1854                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1855                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1856                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1857                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1858                                                    dstnode,
1859                                                    dstimbl - lcp2_imbalances[dstnode]));
1860
1861
1862                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1863                                         minnode = dstnode;
1864                                         minimbl = dstimbl;
1865                                         mindsum = dstdsum;
1866                                         minip = tmp_ip;
1867                                         should_loop = true;
1868                                 }
1869                         }
1870                 }
1871
1872                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1873
1874                 /* If we found one then assign it to the given node. */
1875                 if (minnode != -1) {
1876                         minip->pnn = minnode;
1877                         lcp2_imbalances[minnode] = minimbl;
1878                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1879                                           ctdb_addr_to_str(&(minip->addr)),
1880                                           minnode,
1881                                           mindsum));
1882                 }
1883
1884                 /* There might be a better way but at least this is clear. */
1885                 have_unassigned = false;
1886                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1887                         if (tmp_ip->pnn == -1) {
1888                                 have_unassigned = true;
1889                         }
1890                 }
1891         }
1892
1893         /* We know if we have an unassigned addresses so we might as
1894          * well optimise.
1895          */
1896         if (have_unassigned) {
1897                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1898                         if (tmp_ip->pnn == -1) {
1899                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1900                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1901                         }
1902                 }
1903         }
1904 }
1905
1906 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1907  * to move IPs from, determines the best IP/destination node
1908  * combination to move from the source node.
1909  */
1910 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1911                                     struct ctdb_ipflags *ipflags,
1912                                     struct ctdb_public_ip_list *all_ips,
1913                                     int srcnode,
1914                                     uint32_t candimbl,
1915                                     uint32_t *lcp2_imbalances,
1916                                     bool *rebalance_candidates)
1917 {
1918         int dstnode, mindstnode, numnodes;
1919         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1920         uint32_t minsrcimbl, mindstimbl;
1921         struct ctdb_public_ip_list *minip;
1922         struct ctdb_public_ip_list *tmp_ip;
1923
1924         /* Find an IP and destination node that best reduces imbalance. */
1925         srcimbl = 0;
1926         minip = NULL;
1927         minsrcimbl = 0;
1928         mindstnode = -1;
1929         mindstimbl = 0;
1930
1931         numnodes = talloc_array_length(ipflags);
1932
1933         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1934         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1935
1936         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1937                 /* Only consider addresses on srcnode. */
1938                 if (tmp_ip->pnn != srcnode) {
1939                         continue;
1940                 }
1941
1942                 /* What is this IP address costing the source node? */
1943                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1944                 srcimbl = candimbl - srcdsum;
1945
1946                 /* Consider this IP address would cost each potential
1947                  * destination node.  Destination nodes are limited to
1948                  * those that are newly healthy, since we don't want
1949                  * to do gratuitous failover of IPs just to make minor
1950                  * balance improvements.
1951                  */
1952                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1953                         if (!rebalance_candidates[dstnode]) {
1954                                 continue;
1955                         }
1956
1957                         /* only check nodes that can actually takeover this ip */
1958                         if (!can_node_takeover_ip(ctdb, dstnode,
1959                                                   ipflags[dstnode], tmp_ip)) {
1960                                 /* no it couldnt   so skip to the next node */
1961                                 continue;
1962                         }
1963
1964                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1965                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1966                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1967                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1968                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1969                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1970
1971                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1972                             ((mindstnode == -1) ||                              \
1973                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1974
1975                                 minip = tmp_ip;
1976                                 minsrcimbl = srcimbl;
1977                                 mindstnode = dstnode;
1978                                 mindstimbl = dstimbl;
1979                         }
1980                 }
1981         }
1982         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1983
1984         if (mindstnode != -1) {
1985                 /* We found a move that makes things better... */
1986                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1987                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1988                                   ctdb_addr_to_str(&(minip->addr)),
1989                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1990
1991
1992                 lcp2_imbalances[srcnode] = srcimbl;
1993                 lcp2_imbalances[mindstnode] = mindstimbl;
1994                 minip->pnn = mindstnode;
1995
1996                 return true;
1997         }
1998
1999         return false;
2000         
2001 }
2002
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.  Orders
 * elements by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.  The pnn field
 * is not compared.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2021
2022 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2023  * node with the highest LCP2 imbalance, and then determines the best
2024  * IP/destination node combination to move from the source node.
2025  */
2026 static void lcp2_failback(struct ctdb_context *ctdb,
2027                           struct ctdb_ipflags *ipflags,
2028                           struct ctdb_public_ip_list *all_ips,
2029                           uint32_t *lcp2_imbalances,
2030                           bool *rebalance_candidates)
2031 {
2032         int i, num_rebalance_candidates, numnodes;
2033         struct lcp2_imbalance_pnn * lips;
2034         bool again;
2035
2036         numnodes = talloc_array_length(ipflags);
2037
2038 try_again:
2039
2040         /* It is only worth continuing if we have suitable target
2041          * nodes to transfer IPs to.  This check is much cheaper than
2042          * continuing on...
2043          */
2044         num_rebalance_candidates = 0;
2045         for (i=0; i<numnodes; i++) {
2046                 if (rebalance_candidates[i]) {
2047                         num_rebalance_candidates++;
2048                 }
2049         }
2050         if (num_rebalance_candidates == 0) {
2051                 return;
2052         }
2053
2054         /* Put the imbalances and nodes into an array, sort them and
2055          * iterate through candidates.  Usually the 1st one will be
2056          * used, so this doesn't cost much...
2057          */
2058         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2059         for (i=0; i<numnodes; i++) {
2060                 lips[i].imbalance = lcp2_imbalances[i];
2061                 lips[i].pnn = i;
2062         }
2063         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2064               lcp2_cmp_imbalance_pnn);
2065
2066         again = false;
2067         for (i=0; i<numnodes; i++) {
2068                 /* This means that all nodes had 0 or 1 addresses, so
2069                  * can't be imbalanced.
2070                  */
2071                 if (lips[i].imbalance == 0) {
2072                         break;
2073                 }
2074
2075                 if (lcp2_failback_candidate(ctdb,
2076                                             ipflags,
2077                                             all_ips,
2078                                             lips[i].pnn,
2079                                             lips[i].imbalance,
2080                                             lcp2_imbalances,
2081                                             rebalance_candidates)) {
2082                         again = true;
2083                         break;
2084                 }
2085         }
2086
2087         talloc_free(lips);
2088         if (again) {
2089                 goto try_again;
2090         }
2091 }
2092
2093 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2094                                     struct ctdb_ipflags *ipflags,
2095                                     struct ctdb_public_ip_list *all_ips)
2096 {
2097         struct ctdb_public_ip_list *tmp_ip;
2098
2099         /* verify that the assigned nodes can serve that public ip
2100            and set it to -1 if not
2101         */
2102         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2103                 if (tmp_ip->pnn == -1) {
2104                         continue;
2105                 }
2106                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2107                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2108                         /* this node can not serve this ip. */
2109                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2110                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2111                                            tmp_ip->pnn));
2112                         tmp_ip->pnn = -1;
2113                 }
2114         }
2115 }
2116
2117 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2118                                        struct ctdb_ipflags *ipflags,
2119                                        struct ctdb_public_ip_list *all_ips)
2120 {
2121         struct ctdb_public_ip_list *tmp_ip;
2122         int i, numnodes;
2123
2124         numnodes = talloc_array_length(ipflags);
2125
2126         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2127        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2128         *  always be allocated the same way for a specific set of
2129         *  available/unavailable nodes.
2130         */
2131
2132         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2133                 tmp_ip->pnn = i % numnodes;
2134         }
2135
2136         /* IP failback doesn't make sense with deterministic
2137          * IPs, since the modulo step above implicitly fails
2138          * back IPs to their "home" node.
2139          */
2140         if (1 == ctdb->tunable.no_ip_failback) {
2141                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2142         }
2143
2144         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2145
2146         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2147
2148         /* No failback here! */
2149 }
2150
2151 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2152                                           struct ctdb_ipflags *ipflags,
2153                                           struct ctdb_public_ip_list *all_ips)
2154 {
2155         /* This should be pushed down into basic_failback. */
2156         struct ctdb_public_ip_list *tmp_ip;
2157         int num_ips = 0;
2158         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2159                 num_ips++;
2160         }
2161
2162         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2163
2164         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2165
2166         /* If we don't want IPs to fail back then don't rebalance IPs. */
2167         if (1 == ctdb->tunable.no_ip_failback) {
2168                 return;
2169         }
2170
2171         /* Now, try to make sure the ip adresses are evenly distributed
2172            across the nodes.
2173         */
2174         basic_failback(ctdb, ipflags, all_ips, num_ips);
2175 }
2176
2177 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2178                           struct ctdb_ipflags *ipflags,
2179                           struct ctdb_public_ip_list *all_ips)
2180 {
2181         uint32_t *lcp2_imbalances;
2182         bool *rebalance_candidates;
2183
2184         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2185
2186         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2187
2188         lcp2_init(tmp_ctx, ipflags, all_ips,
2189                   &lcp2_imbalances, &rebalance_candidates);
2190
2191         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2192
2193         /* If we don't want IPs to fail back then don't rebalance IPs. */
2194         if (1 == ctdb->tunable.no_ip_failback) {
2195                 goto finished;
2196         }
2197
2198         /* Now, try to make sure the ip adresses are evenly distributed
2199            across the nodes.
2200         */
2201         lcp2_failback(ctdb, ipflags, all_ips,
2202                       lcp2_imbalances, rebalance_candidates);
2203
2204 finished:
2205         talloc_free(tmp_ctx);
2206 }
2207
2208 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2209 {
2210         int i, num_healthy;
2211
2212         /* Count how many completely healthy nodes we have */
2213         num_healthy = 0;
2214         for (i=0;i<nodemap->num;i++) {
2215                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2216                         num_healthy++;
2217                 }
2218         }
2219
2220         return num_healthy == 0;
2221 }
2222
2223 /* The calculation part of the IP allocation algorithm. */
2224 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2225                                    struct ctdb_ipflags *ipflags,
2226                                    struct ctdb_public_ip_list **all_ips_p)
2227 {
2228         /* since nodes only know about those public addresses that
2229            can be served by that particular node, no single node has
2230            a full list of all public addresses that exist in the cluster.
2231            Walk over all node structures and create a merged list of
2232            all public addresses that exist in the cluster.
2233
2234            keep the tree of ips around as ctdb->ip_tree
2235         */
2236         *all_ips_p = create_merged_ip_list(ctdb);
2237
2238         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2239                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p);
2240         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2241                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2242         } else {
2243                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2244         }
2245
2246         /* at this point ->pnn is the node which will own each IP
2247            or -1 if there is no node that can cover this ip
2248         */
2249
2250         return;
2251 }
2252
2253 struct get_tunable_callback_data {
2254         const char *tunable;
2255         uint32_t *out;
2256         bool fatal;
2257 };
2258
2259 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2260                                  int32_t res, TDB_DATA outdata,
2261                                  void *callback)
2262 {
2263         struct get_tunable_callback_data *cd =
2264                 (struct get_tunable_callback_data *)callback;
2265         int size;
2266
2267         if (res != 0) {
2268                 /* Already handled in fail callback */
2269                 return;
2270         }
2271
2272         if (outdata.dsize != sizeof(uint32_t)) {
2273                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2274                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2275                                  (int)outdata.dsize));
2276                 cd->fatal = true;
2277                 return;
2278         }
2279
2280         size = talloc_array_length(cd->out);
2281         if (pnn >= size) {
2282                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2283                                  cd->tunable, pnn, size));
2284                 return;
2285         }
2286
2287                 
2288         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2289 }
2290
2291 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2292                                        int32_t res, TDB_DATA outdata,
2293                                        void *callback)
2294 {
2295         struct get_tunable_callback_data *cd =
2296                 (struct get_tunable_callback_data *)callback;
2297
2298         switch (res) {
2299         case -ETIME:
2300                 DEBUG(DEBUG_ERR,
2301                       ("Timed out getting tunable \"%s\" from node %d\n",
2302                        cd->tunable, pnn));
2303                 cd->fatal = true;
2304                 break;
2305         case -EINVAL:
2306         case -1:
2307                 DEBUG(DEBUG_WARNING,
2308                       ("Tunable \"%s\" not implemented on node %d\n",
2309                        cd->tunable, pnn));
2310                 break;
2311         default:
2312                 DEBUG(DEBUG_ERR,
2313                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2314                        cd->tunable, pnn));
2315                 cd->fatal = true;
2316         }
2317 }
2318
2319 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2320                                         TALLOC_CTX *tmp_ctx,
2321                                         struct ctdb_node_map *nodemap,
2322                                         const char *tunable,
2323                                         uint32_t default_value)
2324 {
2325         TDB_DATA data;
2326         struct ctdb_control_get_tunable *t;
2327         uint32_t *nodes;
2328         uint32_t *tvals;
2329         struct get_tunable_callback_data callback_data;
2330         int i;
2331
2332         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2333         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2334         for (i=0; i<nodemap->num; i++) {
2335                 tvals[i] = default_value;
2336         }
2337                 
2338         callback_data.out = tvals;
2339         callback_data.tunable = tunable;
2340         callback_data.fatal = false;
2341
2342         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2343         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2344         t = (struct ctdb_control_get_tunable *)data.dptr;
2345         t->length = strlen(tunable)+1;
2346         memcpy(t->name, tunable, t->length);
2347         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2348         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2349                                       nodes, 0, TAKEOVER_TIMEOUT(),
2350                                       false, data,
2351                                       get_tunable_callback,
2352                                       get_tunable_fail_callback,
2353                                       &callback_data) != 0) {
2354                 if (callback_data.fatal) {
2355                         talloc_free(tvals);
2356                         tvals = NULL;
2357                 }
2358         }
2359         talloc_free(nodes);
2360         talloc_free(data.dptr);
2361
2362         return tvals;
2363 }
2364
/* State shared between the GET_RUNSTATE success and fail callbacks. */
struct get_runstate_callback_data {
        /* talloc array of per-node runstates, indexed by PNN */
        enum ctdb_runstate *out;
        /* set by a callback when the failure must abort the whole query */
        bool fatal;
};
2369
2370 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2371                                   int32_t res, TDB_DATA outdata,
2372                                   void *callback_data)
2373 {
2374         struct get_runstate_callback_data *cd =
2375                 (struct get_runstate_callback_data *)callback_data;
2376         int size;
2377
2378         if (res != 0) {
2379                 /* Already handled in fail callback */
2380                 return;
2381         }
2382
2383         if (outdata.dsize != sizeof(uint32_t)) {
2384                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2385                                  pnn, (int)sizeof(uint32_t),
2386                                  (int)outdata.dsize));
2387                 cd->fatal = true;
2388                 return;
2389         }
2390
2391         size = talloc_array_length(cd->out);
2392         if (pnn >= size) {
2393                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2394                                  pnn, size));
2395                 return;
2396         }
2397
2398         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2399 }
2400
2401 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2402                                        int32_t res, TDB_DATA outdata,
2403                                        void *callback)
2404 {
2405         struct get_runstate_callback_data *cd =
2406                 (struct get_runstate_callback_data *)callback;
2407
2408         switch (res) {
2409         case -ETIME:
2410                 DEBUG(DEBUG_ERR,
2411                       ("Timed out getting runstate from node %d\n", pnn));
2412                 cd->fatal = true;
2413                 break;
2414         default:
2415                 DEBUG(DEBUG_WARNING,
2416                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2417                        pnn));
2418         }
2419 }
2420
2421 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2422                                                     TALLOC_CTX *tmp_ctx,
2423                                                     struct ctdb_node_map *nodemap,
2424                                                     enum ctdb_runstate default_value)
2425 {
2426         uint32_t *nodes;
2427         enum ctdb_runstate *rs;
2428         struct get_runstate_callback_data callback_data;
2429         int i;
2430
2431         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2432         CTDB_NO_MEMORY_NULL(ctdb, rs);
2433         for (i=0; i<nodemap->num; i++) {
2434                 rs[i] = default_value;
2435         }
2436
2437         callback_data.out = rs;
2438         callback_data.fatal = false;
2439
2440         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2441         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2442                                       nodes, 0, TAKEOVER_TIMEOUT(),
2443                                       true, tdb_null,
2444                                       get_runstate_callback,
2445                                       get_runstate_fail_callback,
2446                                       &callback_data) != 0) {
2447                 if (callback_data.fatal) {
2448                         free(rs);
2449                         rs = NULL;
2450                 }
2451         }
2452         talloc_free(nodes);
2453
2454         return rs;
2455 }
2456
2457 /* Set internal flags for IP allocation:
2458  *   Clear ip flags
2459  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2460  *   Set NOIPHOST ip flag for each INACTIVE node
2461  *   if all nodes are disabled:
2462  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2463  *   else
2464  *     Set NOIPHOST ip flags for disabled nodes
2465  */
2466 static struct ctdb_ipflags *
2467 set_ipflags_internal(struct ctdb_context *ctdb,
2468                      TALLOC_CTX *tmp_ctx,
2469                      struct ctdb_node_map *nodemap,
2470                      uint32_t *tval_noiptakeover,
2471                      uint32_t *tval_noiphostonalldisabled,
2472                      enum ctdb_runstate *runstate)
2473 {
2474         int i;
2475         struct ctdb_ipflags *ipflags;
2476
2477         /* Clear IP flags - implicit due to talloc_zero */
2478         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2479         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2480
2481         for (i=0;i<nodemap->num;i++) {
2482                 /* Can not take IPs on node with NoIPTakeover set */
2483                 if (tval_noiptakeover[i] != 0) {
2484                         ipflags[i].noiptakeover = true;
2485                 }
2486
2487                 /* Can not host IPs on node not in RUNNING state */
2488                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2489                         ipflags[i].noiphost = true;
2490                         continue;
2491                 }
2492                 /* Can not host IPs on INACTIVE node */
2493                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2494                         ipflags[i].noiphost = true;
2495                 }
2496         }
2497
2498         if (all_nodes_are_disabled(nodemap)) {
2499                 /* If all nodes are disabled, can not host IPs on node
2500                  * with NoIPHostOnAllDisabled set
2501                  */
2502                 for (i=0;i<nodemap->num;i++) {
2503                         if (tval_noiphostonalldisabled[i] != 0) {
2504                                 ipflags[i].noiphost = true;
2505                         }
2506                 }
2507         } else {
2508                 /* If some nodes are not disabled, then can not host
2509                  * IPs on DISABLED node
2510                  */
2511                 for (i=0;i<nodemap->num;i++) {
2512                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2513                                 ipflags[i].noiphost = true;
2514                         }
2515                 }
2516         }
2517
2518         return ipflags;
2519 }
2520
2521 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2522                                         TALLOC_CTX *tmp_ctx,
2523                                         struct ctdb_node_map *nodemap)
2524 {
2525         uint32_t *tval_noiptakeover;
2526         uint32_t *tval_noiphostonalldisabled;
2527         struct ctdb_ipflags *ipflags;
2528         enum ctdb_runstate *runstate;
2529
2530
2531         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2532                                                    "NoIPTakeover", 0);
2533         if (tval_noiptakeover == NULL) {
2534                 return NULL;
2535         }
2536
2537         tval_noiphostonalldisabled =
2538                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2539                                        "NoIPHostOnAllDisabled", 0);
2540         if (tval_noiphostonalldisabled == NULL) {
2541                 /* Caller frees tmp_ctx */
2542                 return NULL;
2543         }
2544
2545         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2546          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2547          * reasonable behaviour on a mixed cluster during upgrade.
2548          */
2549         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2550                                            CTDB_RUNSTATE_RUNNING);
2551         if (runstate == NULL) {
2552                 /* Caller frees tmp_ctx */
2553                 return NULL;
2554         }
2555
2556         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2557                                        tval_noiptakeover,
2558                                        tval_noiphostonalldisabled,
2559                                        runstate);
2560
2561         talloc_free(tval_noiptakeover);
2562         talloc_free(tval_noiphostonalldisabled);
2563         talloc_free(runstate);
2564
2565         return ipflags;
2566 }
2567
/* State handed to iprealloc_fail_callback() for the IPREALLOCATED control. */
struct iprealloc_callback_data {
        /* talloc bool array indexed by PNN; true marks a node to retry */
        bool *retry_nodes;
        /* incremented each time a node is flagged for retry */
        int retry_count;
        /* caller's fail callback, invoked when a control times out */
        client_async_callback fail_callback;
        /* opaque pointer passed through to fail_callback */
        void *fail_callback_data;
        /* node map consulted for the STOPPED flag of the failing node */
        struct ctdb_node_map *nodemap;
};
2575
2576 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2577                                         int32_t res, TDB_DATA outdata,
2578                                         void *callback)
2579 {
2580         int numnodes;
2581         struct iprealloc_callback_data *cd =
2582                 (struct iprealloc_callback_data *)callback;
2583
2584         switch (res) {
2585         case -ETIME:
2586                 /* If the control timed out then that's a real error,
2587                  * so call the real fail callback
2588                  */
2589                 cd->fail_callback(ctdb, pnn, res, outdata,
2590                                   cd->fail_callback_data);
2591                 break;
2592         default:
2593                 /* If not a timeout then either the ipreallocated
2594                  * eventscript (or some setup) failed.  This might
2595                  * have failed because the IPREALLOCATED control isn't
2596                  * implemented - right now there is no way of knowing
2597                  * because the error codes are all folded down to -1.
2598                  * Consider retrying using EVENTSCRIPT control...
2599                  */
2600
2601                 numnodes = talloc_array_length(cd->retry_nodes);
2602                 if (pnn > numnodes) {
2603                         DEBUG(DEBUG_ERR,
2604                               ("ipreallocated failure from node %d, but only %d nodes in nodemap\n",
2605                                pnn, numnodes));
2606                         return;
2607                 }
2608
2609                 /* Can't run the "ipreallocated" event on a STOPPED node */
2610                 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2611                         DEBUG(DEBUG_ERR,
2612                               ("ipreallocated failure from node %d, but node is stopped - not flagging a retry\n",
2613                                pnn));
2614                         return;
2615                 }
2616
2617                 DEBUG(DEBUG_WARNING,
2618                       ("ipreallocated failure from node %d, flagging retry\n",
2619                        pnn));
2620                 cd->retry_nodes[pnn] = true;
2621                 cd->retry_count++;
2622         }
2623 }
2624
/* State handed to takeover_run_fail_callback(). */
struct takeover_callback_data {
        /* bool array indexed by position in nodemap->nodes (not by PNN);
         * true once the fail callback has been invoked for that node
         */
        bool *node_failed;
        /* caller's fail callback, invoked at most once per node */
        client_async_callback fail_callback;
        /* opaque pointer passed through to fail_callback */
        void *fail_callback_data;
        /* node map used to translate a PNN into an array position */
        struct ctdb_node_map *nodemap;
};
2631
2632 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2633                                        uint32_t node_pnn, int32_t res,
2634                                        TDB_DATA outdata, void *callback_data)
2635 {
2636         struct takeover_callback_data *cd =
2637                 talloc_get_type_abort(callback_data,
2638                                       struct takeover_callback_data);
2639         int i;
2640
2641         for (i = 0; i < cd->nodemap->num; i++) {
2642                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2643                         break;
2644                 }
2645         }
2646
2647         if (i == cd->nodemap->num) {
2648                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2649                 return;
2650         }
2651
2652         if (!cd->node_failed[i]) {
2653                 cd->node_failed[i] = true;
2654                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2655                                   cd->fail_callback_data);
2656         }
2657 }
2658
/*
  make any IP alias changes for public addresses that are necessary

  Phases, in order:
    1. Unless the disable_ip_failover tunable is set: compute the IP
       flags and the new IP layout, and broadcast a message asking
       nodes to disable IP checks for takeover_timeout.
    2. Send RELEASE_IP/RELEASE_IPv4 to every connected node for each
       IP it should not hold, and wait for all replies.
    3. Send TAKEOVER_IP/TAKEOVER_IPv4 to the node assigned each IP,
       and wait for all replies.
    4. Send IPREALLOCATED to all connected nodes; if that fails, retry
       the flagged nodes with RUN_EVENTSCRIPTS "ipreallocated".

  fail_callback/callback_data are registered as the failure handler:
  wrapped by takeover_run_fail_callback in phase 2, directly in
  phase 3 and in the phase 4 retry, and via iprealloc_fail_callback
  in phase 4.

  Returns -1 if the IP flags can not be set, if a control can not be
  sent, or if waiting for the phase 2 or phase 3 replies fails.
  Returns 0 otherwise; a failed phase 4 retry is only logged.
 */
int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
                      client_async_callback fail_callback, void *callback_data)
{
        int i, j;
        struct ctdb_public_ip ip;
        struct ctdb_public_ipv4 ipv4;
        uint32_t *nodes;
        struct ctdb_public_ip_list *all_ips, *tmp_ip;
        TDB_DATA data;
        struct timeval timeout;
        struct client_async_data *async_data;
        struct ctdb_client_control_state *state;
        /* Owns every temporary allocation; freed on all return paths */
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        uint32_t disable_timeout;
        struct ctdb_ipflags *ipflags;
        struct takeover_callback_data *takeover_data;
        struct iprealloc_callback_data iprealloc_data;
        bool *retry_data;

        /*
         * ip failover is completely disabled, just send out the 
         * ipreallocated event.
         */
        if (ctdb->tunable.disable_ip_failover != 0) {
                goto ipreallocated;
        }

        ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
        if (ipflags == NULL) {
                DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

        ZERO_STRUCT(ip);

        /* Do the IP reassignment calculations */
        ctdb_takeover_run_core(ctdb, ipflags, &all_ips);

        /* The IP flags need to be cleared because they should never
         * be seen outside the IP allocation code.
         */
        /* NOTE(review): no code here clears the flags; they live on
         * tmp_ctx and go away when it is freed - confirm this comment
         * is still wanted.
         */

        /* The recovery daemon does regular sanity checks of the IPs.
         * However, sometimes it is overzealous and thinks changes are
         * required when they're already underway.  This stops the
         * checks for a while before we start moving IPs.
         */
        disable_timeout = ctdb->tunable.takeover_timeout;
        data.dptr  = (uint8_t*)&disable_timeout;
        data.dsize = sizeof(disable_timeout);
        if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
                                     CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
                /* Best effort: a failure here does not abort the run */
                DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
        }

        /* now tell all nodes to delete any alias that they should not
           have.  This will be a NOOP on nodes that don't currently
           hold the given alias */
        takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
        CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);

        takeover_data->node_failed = talloc_zero_array(tmp_ctx,
                                                       bool, nodemap->num);
        CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
        takeover_data->fail_callback = fail_callback;
        takeover_data->fail_callback_data = callback_data;
        takeover_data->nodemap = nodemap;

        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        /* Several release controls go to each node, so the wrapper is
         * used to report a failing node to the caller only once.
         */
        async_data->fail_callback = takeover_run_fail_callback;
        async_data->callback_data = takeover_data;

        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */
                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }

                for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                        if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
                                /* This node should be serving this
                                   vnn so dont tell it to release the ip
                                */
                                continue;
                        }
                        /* IPv4 addresses use the IPv4-specific control
                         * and payload; everything else uses the
                         * generic one.
                         */
                        if (tmp_ip->addr.sa.sa_family == AF_INET) {
                                ipv4.pnn = tmp_ip->pnn;
                                ipv4.sin = tmp_ip->addr.ip;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ipv4);
                                data.dptr  = (uint8_t *)&ipv4;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IPv4, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        } else {
                                ip.pnn  = tmp_ip->pnn;
                                ip.addr = tmp_ip->addr;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ip);
                                data.dptr  = (uint8_t *)&ip;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IP, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        }

                        if (state == NULL) {
                                DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
                                talloc_free(tmp_ctx);
                                return -1;
                        }
                
                        ctdb_client_async_add(async_data, state);
                }
        }
        /* All releases must be waited for before any takeover is sent */
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
        talloc_free(async_data);


        /* tell all nodes to get their own IPs */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        /* This phase registers the caller's fail callback directly,
         * without the once-per-node wrapper used for the releases.
         */
        async_data->fail_callback = fail_callback;
        async_data->callback_data = callback_data;

        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        /* this IP won't be taken over */
                        continue;
                }

                if (tmp_ip->addr.sa.sa_family == AF_INET) {
                        ipv4.pnn = tmp_ip->pnn;
                        ipv4.sin = tmp_ip->addr.ip;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ipv4);
                        data.dptr  = (uint8_t *)&ipv4;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
                                        data, async_data,
                                        &timeout, NULL);
                } else {
                        ip.pnn  = tmp_ip->pnn;
                        ip.addr = tmp_ip->addr;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ip);
                        data.dptr  = (uint8_t *)&ip;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IP, 0,
                                        data, async_data,
                                        &timeout, NULL);
                }
                if (state == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
                        talloc_free(tmp_ctx);
                        return -1;
                }
                
                ctdb_client_async_add(async_data, state);
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

ipreallocated:
        /* 
         * Tell all nodes to run eventscripts to process the
         * "ipreallocated" event.  This can do a lot of things,
         * including restarting services to reconfigure them if public
         * IPs have moved.  Once upon a time this event only used to
         * update natwg.
         */
        retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
        CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
        iprealloc_data.retry_nodes = retry_data;
        iprealloc_data.retry_count = 0;
        iprealloc_data.fail_callback = fail_callback;
        iprealloc_data.fail_callback_data = callback_data;
        iprealloc_data.nodemap = nodemap;

        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
                                      nodes, 0, TAKEOVER_TIMEOUT(),
                                      false, tdb_null,
                                      NULL, iprealloc_fail_callback,
                                      &iprealloc_data) != 0) {

                /* If the control failed then we should retry to any
                 * nodes flagged by iprealloc_fail_callback using the
                 * EVENTSCRIPT control.  This is a best-effort at
                 * backward compatiblity when running a mixed cluster
                 * where some nodes have not yet been upgraded to
                 * support the IPREALLOCATED control.
                 */
                DEBUG(DEBUG_WARNING,
                      ("Retry ipreallocated to some nodes using eventscript control\n"));

                nodes = talloc_array(tmp_ctx, uint32_t,
                                     iprealloc_data.retry_count);
                CTDB_NO_MEMORY_FATAL(ctdb, nodes);

                /* retry_nodes is indexed by PNN, so the index of each
                 * flagged slot is the PNN to retry.
                 */
                j = 0;
                for (i=0; i<nodemap->num; i++) {
                        if (iprealloc_data.retry_nodes[i]) {
                                nodes[j] = i;
                                j++;
                        }
                }

                data.dptr  = discard_const("ipreallocated");
                data.dsize = strlen((char *)data.dptr) + 1; 
                if (ctdb_client_async_control(ctdb,
                                              CTDB_CONTROL_RUN_EVENTSCRIPTS,
                                              nodes, 0, TAKEOVER_TIMEOUT(),
                                              false, data,
                                              NULL, fail_callback,
                                              callback_data) != 0) {
                        /* Logged only; the function still returns 0 */
                        DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
                }
        }

        talloc_free(tmp_ctx);
        return 0;
}
2901
2902
2903 /*
2904   destroy a ctdb_client_ip structure
2905  */
2906 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2907 {
2908         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2909                 ctdb_addr_to_str(&ip->addr),
2910                 ntohs(ip->addr.ip.sin_port),
2911                 ip->client_id));
2912
2913         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2914         return 0;
2915 }
2916
2917 /*
2918   called by a client to inform us of a TCP connection that it is managing
2919   that should tickled with an ACK when IP takeover is done
2920   we handle both the old ipv4 style of packets as well as the new ipv4/6
2921   pdus.
2922  */
2923 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2924                                 TDB_DATA indata)
2925 {
2926         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2927         struct ctdb_control_tcp *old_addr = NULL;
2928         struct ctdb_control_tcp_addr new_addr;
2929         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2930         struct ctdb_tcp_list *tcp;
2931         struct ctdb_tcp_connection t;
2932         int ret;
2933         TDB_DATA data;
2934         struct ctdb_client_ip *ip;
2935         struct ctdb_vnn *vnn;
2936         ctdb_sock_addr addr;
2937
2938         switch (indata.dsize) {
2939         case sizeof(struct ctdb_control_tcp):
2940                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2941                 ZERO_STRUCT(new_addr);
2942                 tcp_sock = &new_addr;
2943                 tcp_sock->src.ip  = old_addr->src;
2944                 tcp_sock->dest.ip = old_addr->dest;
2945                 break;
2946         case sizeof(struct ctdb_control_tcp_addr):
2947                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2948                 break;
2949         default:
2950                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2951                                  "to ctdb_control_tcp_client. size was %d but "
2952                                  "only allowed sizes are %lu and %lu\n",
2953                                  (int)indata.dsize,
2954                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2955                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2956                 return -1;
2957         }
2958
2959         addr = tcp_sock->src;
2960         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2961         addr = tcp_sock->dest;
2962         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2963
2964         ZERO_STRUCT(addr);
2965         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2966         vnn = find_public_ip_vnn(ctdb, &addr);
2967         if (vnn == NULL) {
2968                 switch (addr.sa.sa_family) {
2969                 case AF_INET:
2970                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2971                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2972                                         ctdb_addr_to_str(&addr)));
2973                         }
2974                         break;
2975                 case AF_INET6:
2976                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2977                                 ctdb_addr_to_str(&addr)));
2978                         break;
2979                 default:
2980                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2981                 }
2982
2983                 return 0;
2984         }
2985
2986         if (vnn->pnn != ctdb->pnn) {
2987                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2988                         ctdb_addr_to_str(&addr),
2989                         client_id, client->pid));
2990                 /* failing this call will tell smbd to die */
2991                 return -1;
2992         }
2993
2994         ip = talloc(client, struct ctdb_client_ip);
2995         CTDB_NO_MEMORY(ctdb, ip);
2996
2997         ip->ctdb      = ctdb;
2998         ip->addr      = addr;
2999         ip->client_id = client_id;
3000         talloc_set_destructor(ip, ctdb_client_ip_destructor);
3001         DLIST_ADD(ctdb->client_ip_list, ip);
3002
3003         tcp = talloc(client, struct ctdb_tcp_list);
3004         CTDB_NO_MEMORY(ctdb, tcp);
3005
3006         tcp->connection.src_addr = tcp_sock->src;
3007         tcp->connection.dst_addr = tcp_sock->dest;
3008
3009         DLIST_ADD(client->tcp_list, tcp);
3010
3011         t.src_addr = tcp_sock->src;
3012         t.dst_addr = tcp_sock->dest;
3013
3014         data.dptr = (uint8_t *)&t;
3015         data.dsize = sizeof(t);
3016
3017         switch (addr.sa.sa_family) {
3018         case AF_INET:
3019                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3020                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
3021                         ctdb_addr_to_str(&tcp_sock->src),
3022                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
3023                 break;
3024         case AF_INET6:
3025                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3026                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
3027                         ctdb_addr_to_str(&tcp_sock->src),
3028                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
3029                 break;
3030         default:
3031                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3032         }
3033
3034
3035         /* tell all nodes about this tcp connection */
3036         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3037                                        CTDB_CONTROL_TCP_ADD,
3038                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3039         if (ret != 0) {
3040                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3041                 return -1;
3042         }
3043
3044         return 0;
3045 }
3046
3047 /*
3048   find a tcp address on a list
3049  */
3050 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3051                                            struct ctdb_tcp_connection *tcp)
3052 {
3053         int i;
3054
3055         if (array == NULL) {
3056                 return NULL;
3057         }
3058
3059         for (i=0;i<array->num;i++) {
3060                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3061                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3062                         return &array->connections[i];
3063                 }
3064         }
3065         return NULL;
3066 }
3067
3068
3069
/*
  called by a daemon to inform us of a TCP connection that one of its
  clients is managing and that should be tickled with an ACK when IP
  takeover is done
 */
3075 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3076 {
3077         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3078         struct ctdb_tcp_array *tcparray;
3079         struct ctdb_tcp_connection tcp;
3080         struct ctdb_vnn *vnn;
3081
3082         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3083         if (vnn == NULL) {
3084                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3085                         ctdb_addr_to_str(&p->dst_addr)));
3086
3087                 return -1;
3088         }
3089
3090
3091         tcparray = vnn->tcp_array;
3092
3093         /* If this is the first tickle */
3094         if (tcparray == NULL) {
3095                 tcparray = talloc_size(ctdb->nodes, 
3096                         offsetof(struct ctdb_tcp_array, connections) +
3097                         sizeof(struct ctdb_tcp_connection) * 1);
3098                 CTDB_NO_MEMORY(ctdb, tcparray);
3099                 vnn->tcp_array = tcparray;
3100
3101                 tcparray->num = 0;
3102                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3103                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3104
3105                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3106                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3107                 tcparray->num++;
3108
3109                 if (tcp_update_needed) {
3110                         vnn->tcp_update_needed = true;
3111                 }
3112                 return 0;
3113         }
3114
3115
3116         /* Do we already have this tickle ?*/
3117         tcp.src_addr = p->src_addr;
3118         tcp.dst_addr = p->dst_addr;
3119         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
3120                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3121                         ctdb_addr_to_str(&tcp.dst_addr),
3122                         ntohs(tcp.dst_addr.ip.sin_port),
3123                         vnn->pnn));
3124                 return 0;
3125         }
3126
3127         /* A new tickle, we must add it to the array */
3128         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3129                                         struct ctdb_tcp_connection,
3130                                         tcparray->num+1);
3131         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3132
3133         vnn->tcp_array = tcparray;
3134         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3135         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3136         tcparray->num++;
3137                                 
3138         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3139                 ctdb_addr_to_str(&tcp.dst_addr),
3140                 ntohs(tcp.dst_addr.ip.sin_port),
3141                 vnn->pnn));
3142
3143         if (tcp_update_needed) {
3144                 vnn->tcp_update_needed = true;
3145         }
3146
3147         return 0;
3148 }
3149
3150
3151 /*
3152   called by a daemon to inform us of a TCP connection that one of its
3153   clients managing that should tickled with an ACK when IP takeover is
3154   done
3155  */
3156 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3157 {
3158         struct ctdb_tcp_connection *tcpp;
3159         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3160
3161         if (vnn == NULL) {
3162                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3163                         ctdb_addr_to_str(&conn->dst_addr)));
3164                 return;
3165         }
3166
3167         /* if the array is empty we cant remove it
3168            and we dont need to do anything
3169          */
3170         if (vnn->tcp_array == NULL) {
3171                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3172                         ctdb_addr_to_str(&conn->dst_addr),
3173                         ntohs(conn->dst_addr.ip.sin_port)));
3174                 return;
3175         }
3176
3177
3178         /* See if we know this connection
3179            if we dont know this connection  then we dont need to do anything
3180          */
3181         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3182         if (tcpp == NULL) {
3183                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3184                         ctdb_addr_to_str(&conn->dst_addr),
3185                         ntohs(conn->dst_addr.ip.sin_port)));
3186                 return;
3187         }
3188
3189
3190         /* We need to remove this entry from the array.
3191            Instead of allocating a new array and copying data to it
3192            we cheat and just copy the last entry in the existing array
3193            to the entry that is to be removed and just shring the 
3194            ->num field
3195          */
3196         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3197         vnn->tcp_array->num--;
3198
3199         /* If we deleted the last entry we also need to remove the entire array
3200          */
3201         if (vnn->tcp_array->num == 0) {
3202                 talloc_free(vnn->tcp_array);
3203                 vnn->tcp_array = NULL;
3204         }               
3205
3206         vnn->tcp_update_needed = true;
3207
3208         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3209                 ctdb_addr_to_str(&conn->src_addr),
3210                 ntohs(conn->src_addr.ip.sin_port)));
3211 }
3212
3213
3214 /*
3215   called by a daemon to inform us of a TCP connection that one of its
3216   clients used are no longer needed in the tickle database
3217  */
3218 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3219 {
3220         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3221
3222         ctdb_remove_tcp_connection(ctdb, conn);
3223
3224         return 0;
3225 }
3226
3227
/*
  called when a daemon restarts.

  The intent is to send all tickles for all public addresses we are
  serving to the new node immediately; this is not implemented yet, so
  the function currently does nothing and returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/* XXX here we should send all tickles we are serving to the new node */
	return 0;
}
3237
3238
3239 /*
3240   called when a client structure goes away - hook to remove
3241   elements from the tcp_list in all daemons
3242  */
3243 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3244 {
3245         while (client->tcp_list) {
3246                 struct ctdb_tcp_list *tcp = client->tcp_list;
3247                 DLIST_REMOVE(client->tcp_list, tcp);
3248                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3249         }
3250 }
3251
3252
3253 /*
3254   release all IPs on shutdown
3255  */
3256 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3257 {
3258         struct ctdb_vnn *vnn;
3259         int count = 0;
3260
3261         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3262                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3263                         ctdb_vnn_unassign_iface(ctdb, vnn);
3264                         continue;
3265                 }
3266                 if (!vnn->iface) {
3267                         continue;
3268                 }
3269
3270                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3271                                     ctdb_addr_to_str(&vnn->public_address),
3272                                     vnn->public_netmask_bits,
3273                                     ctdb_vnn_iface_string(vnn)));
3274
3275                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3276                                   ctdb_vnn_iface_string(vnn),
3277                                   ctdb_addr_to_str(&vnn->public_address),
3278                                   vnn->public_netmask_bits);
3279                 release_kill_clients(ctdb, &vnn->public_address);
3280                 ctdb_vnn_unassign_iface(ctdb, vnn);
3281                 count++;
3282         }
3283
3284         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3285 }
3286
3287
3288 /*
3289   get list of public IPs
3290  */
3291 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3292                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3293 {
3294         int i, num, len;
3295         struct ctdb_all_public_ips *ips;
3296         struct ctdb_vnn *vnn;
3297         bool only_available = false;
3298
3299         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3300                 only_available = true;
3301         }
3302
3303         /* count how many public ip structures we have */
3304         num = 0;
3305         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3306                 num++;
3307         }
3308
3309         len = offsetof(struct ctdb_all_public_ips, ips) + 
3310                 num*sizeof(struct ctdb_public_ip);
3311         ips = talloc_zero_size(outdata, len);
3312         CTDB_NO_MEMORY(ctdb, ips);
3313
3314         i = 0;
3315         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3316                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3317                         continue;
3318                 }
3319                 ips->ips[i].pnn  = vnn->pnn;
3320                 ips->ips[i].addr = vnn->public_address;
3321                 i++;
3322         }
3323         ips->num = i;
3324         len = offsetof(struct ctdb_all_public_ips, ips) +
3325                 i*sizeof(struct ctdb_public_ip);
3326
3327         outdata->dsize = len;
3328         outdata->dptr  = (uint8_t *)ips;
3329
3330         return 0;
3331 }
3332
3333
3334 /*
3335   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3336  */
3337 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3338                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3339 {
3340         int i, num, len;
3341         struct ctdb_all_public_ipsv4 *ips;
3342         struct ctdb_vnn *vnn;
3343
3344         /* count how many public ip structures we have */
3345         num = 0;
3346         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3347                 if (vnn->public_address.sa.sa_family != AF_INET) {
3348                         continue;
3349                 }
3350                 num++;
3351         }
3352
3353         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3354                 num*sizeof(struct ctdb_public_ipv4);
3355         ips = talloc_zero_size(outdata, len);
3356         CTDB_NO_MEMORY(ctdb, ips);
3357
3358         outdata->dsize = len;
3359         outdata->dptr  = (uint8_t *)ips;
3360
3361         ips->num = num;
3362         i = 0;
3363         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3364                 if (vnn->public_address.sa.sa_family != AF_INET) {
3365                         continue;
3366                 }
3367                 ips->ips[i].pnn = vnn->pnn;
3368                 ips->ips[i].sin = vnn->public_address.ip;
3369                 i++;
3370         }
3371
3372         return 0;
3373 }
3374
3375 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3376                                         struct ctdb_req_control *c,
3377                                         TDB_DATA indata,
3378                                         TDB_DATA *outdata)
3379 {
3380         int i, num, len;
3381         ctdb_sock_addr *addr;
3382         struct ctdb_control_public_ip_info *info;
3383         struct ctdb_vnn *vnn;
3384
3385         addr = (ctdb_sock_addr *)indata.dptr;
3386
3387         vnn = find_public_ip_vnn(ctdb, addr);
3388         if (vnn == NULL) {
3389                 /* if it is not a public ip   it could be our 'single ip' */
3390                 if (ctdb->single_ip_vnn) {
3391                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3392                                 vnn = ctdb->single_ip_vnn;
3393                         }
3394                 }
3395         }
3396         if (vnn == NULL) {
3397                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3398                                  "'%s'not a public address\n",
3399                                  ctdb_addr_to_str(addr)));
3400                 return -1;
3401         }
3402
3403         /* count how many public ip structures we have */
3404         num = 0;
3405         for (;vnn->ifaces[num];) {
3406                 num++;
3407         }
3408
3409         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3410                 num*sizeof(struct ctdb_control_iface_info);
3411         info = talloc_zero_size(outdata, len);
3412         CTDB_NO_MEMORY(ctdb, info);
3413
3414         info->ip.addr = vnn->public_address;
3415         info->ip.pnn = vnn->pnn;
3416         info->active_idx = 0xFFFFFFFF;
3417
3418         for (i=0; vnn->ifaces[i]; i++) {
3419                 struct ctdb_iface *cur;
3420
3421                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3422                 if (cur == NULL) {
3423                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3424                                            vnn->ifaces[i]));
3425                         return -1;
3426                 }
3427                 if (vnn->iface == cur) {
3428                         info->active_idx = i;
3429                 }
3430                 strcpy(info->ifaces[i].name, cur->name);
3431                 info->ifaces[i].link_state = cur->link_up;
3432                 info->ifaces[i].references = cur->references;
3433         }
3434         info->num = i;
3435         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3436                 i*sizeof(struct ctdb_control_iface_info);
3437
3438         outdata->dsize = len;
3439         outdata->dptr  = (uint8_t *)info;
3440
3441         return 0;
3442 }
3443
3444 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3445                                 struct ctdb_req_control *c,
3446                                 TDB_DATA *outdata)
3447 {
3448         int i, num, len;
3449         struct ctdb_control_get_ifaces *ifaces;
3450         struct ctdb_iface *cur;
3451
3452         /* count how many public ip structures we have */
3453         num = 0;
3454         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3455                 num++;
3456         }
3457
3458         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3459                 num*sizeof(struct ctdb_control_iface_info);
3460         ifaces = talloc_zero_size(outdata, len);
3461         CTDB_NO_MEMORY(ctdb, ifaces);
3462
3463         i = 0;
3464         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3465                 strcpy(ifaces->ifaces[i].name, cur->name);
3466                 ifaces->ifaces[i].link_state = cur->link_up;
3467                 ifaces->ifaces[i].references = cur->references;
3468                 i++;
3469         }
3470         ifaces->num = i;
3471         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3472                 i*sizeof(struct ctdb_control_iface_info);
3473
3474         outdata->dsize = len;
3475         outdata->dptr  = (uint8_t *)ifaces;
3476
3477         return 0;
3478 }
3479
3480 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3481                                     struct ctdb_req_control *c,
3482                                     TDB_DATA indata)
3483 {
3484         struct ctdb_control_iface_info *info;
3485         struct ctdb_iface *iface;
3486         bool link_up = false;
3487
3488         info = (struct ctdb_control_iface_info *)indata.dptr;
3489
3490         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3491                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3492                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3493                                   len, len, info->name));
3494                 return -1;
3495         }
3496
3497         switch (info->link_state) {
3498         case 0:
3499                 link_up = false;
3500                 break;
3501         case 1:
3502                 link_up = true;
3503                 break;
3504         default:
3505                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3506                                   (unsigned int)info->link_state));
3507                 return -1;
3508         }
3509
3510         if (info->references != 0) {
3511                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3512                                   (unsigned int)info->references));
3513                 return -1;
3514         }
3515
3516         iface = ctdb_find_iface(ctdb, info->name);
3517         if (iface == NULL) {
3518                 return -1;
3519         }
3520
3521         if (link_up == iface->link_up) {
3522                 return 0;
3523         }
3524
3525         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3526               ("iface[%s] has changed it's link status %s => %s\n",
3527                iface->name,
3528                iface->link_up?"up":"down",
3529                link_up?"up":"down"));
3530
3531         iface->link_up = link_up;
3532         return 0;
3533 }
3534
3535
3536 /* 
3537    structure containing the listening socket and the list of tcp connections
3538    that the ctdb daemon is to kill
3539 */
3540 struct ctdb_kill_tcp {
3541         struct ctdb_vnn *vnn;
3542         struct ctdb_context *ctdb;
3543         int capture_fd;
3544         struct fd_event *fde;
3545         trbt_tree_t *connections;
3546         void *private_data;
3547 };
3548
3549 /*
3550   a tcp connection that is to be killed
3551  */
3552 struct ctdb_killtcp_con {
3553         ctdb_sock_addr src_addr;
3554         ctdb_sock_addr dst_addr;
3555         int count;
3556         struct ctdb_kill_tcp *killtcp;
3557 };
3558
3559 /* this function is used to create a key to represent this socketpair
3560    in the killtcp tree.
3561    this key is used to insert and lookup matching socketpairs that are
3562    to be tickled and RST
3563 */
3564 #define KILLTCP_KEYLEN  10
3565 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3566 {
3567         static uint32_t key[KILLTCP_KEYLEN];
3568
3569         bzero(key, sizeof(key));
3570
3571         if (src->sa.sa_family != dst->sa.sa_family) {
3572                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3573                 return key;
3574         }
3575         
3576         switch (src->sa.sa_family) {
3577         case AF_INET:
3578                 key[0]  = dst->ip.sin_addr.s_addr;
3579                 key[1]  = src->ip.sin_addr.s_addr;
3580                 key[2]  = dst->ip.sin_port;
3581                 key[3]  = src->ip.sin_port;
3582                 break;
3583         case AF_INET6: {
3584                 uint32_t *dst6_addr32 =
3585                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3586                 uint32_t *src6_addr32 =
3587                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3588                 key[0]  = dst6_addr32[3];
3589                 key[1]  = src6_addr32[3];
3590                 key[2]  = dst6_addr32[2];
3591                 key[3]  = src6_addr32[2];
3592                 key[4]  = dst6_addr32[1];
3593                 key[5]  = src6_addr32[1];
3594                 key[6]  = dst6_addr32[0];
3595                 key[7]  = src6_addr32[0];
3596                 key[8]  = dst->ip6.sin6_port;
3597                 key[9]  = src->ip6.sin6_port;
3598                 break;
3599         }
3600         default:
3601                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3602                 return key;
3603         }
3604
3605         return key;
3606 }
3607
3608 /*
3609   called when we get a read event on the raw socket
3610  */
3611 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3612                                 uint16_t flags, void *private_data)
3613 {
3614         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3615         struct ctdb_killtcp_con *con;
3616         ctdb_sock_addr src, dst;
3617         uint32_t ack_seq, seq;
3618
3619         if (!(flags & EVENT_FD_READ)) {
3620                 return;
3621         }
3622
3623         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3624                                 killtcp->private_data,
3625                                 &src, &dst,
3626                                 &ack_seq, &seq) != 0) {
3627                 /* probably a non-tcp ACK packet */
3628                 return;
3629         }
3630
3631         /* check if we have this guy in our list of connections
3632            to kill
3633         */
3634         con = trbt_lookuparray32(killtcp->connections, 
3635                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3636         if (con == NULL) {
3637                 /* no this was some other packet we can just ignore */
3638                 return;
3639         }
3640
3641         /* This one has been tickled !
3642            now reset him and remove him from the list.
3643          */
3644         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3645                 ntohs(con->dst_addr.ip.sin_port),
3646                 ctdb_addr_to_str(&con->src_addr),
3647                 ntohs(con->src_addr.ip.sin_port)));
3648
3649         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3650         talloc_free(con);
3651 }
3652
3653
3654 /* when traversing the list of all tcp connections to send tickle acks to
3655    (so that we can capture the ack coming back and kill the connection
3656     by a RST)
3657    this callback is called for each connection we are currently trying to kill
3658 */
3659 static int tickle_connection_traverse(void *param, void *data)
3660 {
3661         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3662
3663         /* have tried too many times, just give up */
3664         if (con->count >= 5) {
3665                 /* can't delete in traverse: reparent to delete_cons */
3666                 talloc_steal(param, con);
3667                 return 0;
3668         }
3669
3670         /* othervise, try tickling it again */
3671         con->count++;
3672         ctdb_sys_send_tcp(
3673                 (ctdb_sock_addr *)&con->dst_addr,
3674                 (ctdb_sock_addr *)&con->src_addr,
3675                 0, 0, 0);
3676         return 0;
3677 }
3678
3679
3680 /* 
3681    called every second until all sentenced connections have been reset
3682  */
3683 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3684                                               struct timeval t, void *private_data)
3685 {
3686         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3687         void *delete_cons = talloc_new(NULL);
3688
3689         /* loop over all connections sending tickle ACKs */
3690         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3691
3692         /* now we've finished traverse, it's safe to do deletion. */
3693         talloc_free(delete_cons);
3694
3695         /* If there are no more connections to kill we can remove the
3696            entire killtcp structure
3697          */
3698         if ( (killtcp->connections == NULL) || 
3699              (killtcp->connections->root == NULL) ) {
3700                 talloc_free(killtcp);
3701                 return;
3702         }
3703
3704         /* try tickling them again in a seconds time
3705          */
3706         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3707                         ctdb_tickle_sentenced_connections, killtcp);
3708 }
3709
3710 /*
3711   destroy the killtcp structure
3712  */
3713 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3714 {
3715         struct ctdb_vnn *tmpvnn;
3716
3717         /* verify that this vnn is still active */
3718         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3719                 if (tmpvnn == killtcp->vnn) {
3720                         break;
3721                 }
3722         }
3723
3724         if (tmpvnn == NULL) {
3725                 return 0;
3726         }
3727
3728         if (killtcp->vnn->killtcp != killtcp) {
3729                 return 0;
3730         }
3731
3732         killtcp->vnn->killtcp = NULL;
3733
3734         return 0;
3735 }
3736
3737
/* Insert callback for the killtcp tree: unconditionally replace whatever
   connection structure is already stored with the new one (parm).

   The old structure, if any, is deliberately not freed here; according
   to the original author it is owned via talloc by the same tree node
   and goes away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3749
3750 /*
3751   add a tcp socket to the list of connections we want to RST
3752  */
3753 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3754                                        ctdb_sock_addr *s,
3755                                        ctdb_sock_addr *d)
3756 {
3757         ctdb_sock_addr src, dst;
3758         struct ctdb_kill_tcp *killtcp;
3759         struct ctdb_killtcp_con *con;
3760         struct ctdb_vnn *vnn;
3761
3762         ctdb_canonicalize_ip(s, &src);
3763         ctdb_canonicalize_ip(d, &dst);
3764
3765         vnn = find_public_ip_vnn(ctdb, &dst);
3766         if (vnn == NULL) {
3767                 vnn = find_public_ip_vnn(ctdb, &src);
3768         }
3769         if (vnn == NULL) {
3770                 /* if it is not a public ip   it could be our 'single ip' */
3771                 if (ctdb->single_ip_vnn) {
3772                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3773                                 vnn = ctdb->single_ip_vnn;
3774                         }
3775                 }
3776         }
3777         if (vnn == NULL) {
3778                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3779                 return -1;
3780         }
3781
3782         killtcp = vnn->killtcp;
3783         
3784         /* If this is the first connection to kill we must allocate
3785            a new structure
3786          */
3787         if (killtcp == NULL) {
3788                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3789                 CTDB_NO_MEMORY(ctdb, killtcp);
3790
3791                 killtcp->vnn         = vnn;
3792                 killtcp->ctdb        = ctdb;
3793                 killtcp->capture_fd  = -1;
3794                 killtcp->connections = trbt_create(killtcp, 0);
3795
3796                 vnn->killtcp         = killtcp;
3797                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3798         }
3799
3800
3801
3802         /* create a structure that describes this connection we want to
3803            RST and store it in killtcp->connections
3804         */
3805         con = talloc(killtcp, struct ctdb_killtcp_con);
3806         CTDB_NO_MEMORY(ctdb, con);
3807         con->src_addr = src;
3808         con->dst_addr = dst;
3809         con->count    = 0;
3810         con->killtcp  = killtcp;
3811
3812
3813         trbt_insertarray32_callback(killtcp->connections,
3814                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3815                         add_killtcp_callback, con);
3816
3817         /* 
3818            If we dont have a socket to listen on yet we must create it
3819          */
3820         if (killtcp->capture_fd == -1) {
3821                 const char *iface = ctdb_vnn_iface_string(vnn);
3822                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3823                 if (killtcp->capture_fd == -1) {
3824                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3825                                           "socket on iface '%s' for killtcp (%s)\n",
3826                                           iface, strerror(errno)));
3827                         goto failed;
3828                 }
3829         }
3830
3831
3832         if (killtcp->fde == NULL) {
3833                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3834                                             EVENT_FD_READ,
3835                                             capture_tcp_handler, killtcp);
3836                 tevent_fd_set_auto_close(killtcp->fde);
3837
3838                 /* We also need to set up some events to tickle all these connections
3839                    until they are all reset
3840                 */
3841                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3842                                 ctdb_tickle_sentenced_connections, killtcp);
3843         }
3844
3845         /* tickle him once now */
3846         ctdb_sys_send_tcp(
3847                 &con->dst_addr,
3848                 &con->src_addr,
3849                 0, 0, 0);
3850
3851         return 0;
3852
3853 failed:
3854         talloc_free(vnn->killtcp);
3855         vnn->killtcp = NULL;
3856         return -1;
3857 }
3858
3859 /*
3860   kill a TCP connection.
3861  */
3862 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3863 {
3864         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3865
3866         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3867 }
3868
3869 /*
3870   called by a daemon to inform us of the entire list of TCP tickles for
3871   a particular public address.
3872   this control should only be sent by the node that is currently serving
3873   that public address.
3874  */
3875 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3876 {
3877         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3878         struct ctdb_tcp_array *tcparray;
3879         struct ctdb_vnn *vnn;
3880
3881         /* We must at least have tickles.num or else we cant verify the size
3882            of the received data blob
3883          */
3884         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3885                                         tickles.connections)) {
3886                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3887                 return -1;
3888         }
3889
3890         /* verify that the size of data matches what we expect */
3891         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3892                                 tickles.connections)
3893                          + sizeof(struct ctdb_tcp_connection)
3894                                  * list->tickles.num) {
3895                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3896                 return -1;
3897         }       
3898
3899         vnn = find_public_ip_vnn(ctdb, &list->addr);
3900         if (vnn == NULL) {
3901                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3902                         ctdb_addr_to_str(&list->addr)));
3903
3904                 return 1;
3905         }
3906
3907         /* remove any old ticklelist we might have */
3908         talloc_free(vnn->tcp_array);
3909         vnn->tcp_array = NULL;
3910
3911         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3912         CTDB_NO_MEMORY(ctdb, tcparray);
3913
3914         tcparray->num = list->tickles.num;
3915
3916         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3917         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3918
3919         memcpy(tcparray->connections, &list->tickles.connections[0], 
3920                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3921
3922         /* We now have a new fresh tickle list array for this vnn */
3923         vnn->tcp_array = talloc_steal(vnn, tcparray);
3924         
3925         return 0;
3926 }
3927
3928 /*
3929   called to return the full list of tickles for the puclic address associated 
3930   with the provided vnn
3931  */
3932 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3933 {
3934         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3935         struct ctdb_control_tcp_tickle_list *list;
3936         struct ctdb_tcp_array *tcparray;
3937         int num;
3938         struct ctdb_vnn *vnn;
3939
3940         vnn = find_public_ip_vnn(ctdb, addr);
3941         if (vnn == NULL) {
3942                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3943                         ctdb_addr_to_str(addr)));
3944
3945                 return 1;
3946         }
3947
3948         tcparray = vnn->tcp_array;
3949         if (tcparray) {
3950                 num = tcparray->num;
3951         } else {
3952                 num = 0;
3953         }
3954
3955         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3956                                 tickles.connections)
3957                         + sizeof(struct ctdb_tcp_connection) * num;
3958
3959         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3960         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3961         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3962
3963         list->addr = *addr;
3964         list->tickles.num = num;
3965         if (num) {
3966                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3967                         sizeof(struct ctdb_tcp_connection) * num);
3968         }
3969
3970         return 0;
3971 }
3972
3973
3974 /*
3975   set the list of all tcp tickles for a public address
3976  */
3977 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3978                               struct timeval timeout, uint32_t destnode, 
3979                               ctdb_sock_addr *addr,
3980                               struct ctdb_tcp_array *tcparray)
3981 {
3982         int ret, num;
3983         TDB_DATA data;
3984         struct ctdb_control_tcp_tickle_list *list;
3985
3986         if (tcparray) {
3987                 num = tcparray->num;
3988         } else {
3989                 num = 0;
3990         }
3991
3992         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3993                                 tickles.connections) +
3994                         sizeof(struct ctdb_tcp_connection) * num;
3995         data.dptr = talloc_size(ctdb, data.dsize);
3996         CTDB_NO_MEMORY(ctdb, data.dptr);
3997
3998         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3999         list->addr = *addr;
4000         list->tickles.num = num;
4001         if (tcparray) {
4002                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
4003         }
4004
4005         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
4006                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4007                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4008         if (ret != 0) {
4009                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4010                 return -1;
4011         }
4012
4013         talloc_free(data.dptr);
4014
4015         return ret;
4016 }
4017
4018
4019 /*
4020   perform tickle updates if required
4021  */
4022 static void ctdb_update_tcp_tickles(struct event_context *ev, 
4023                                 struct timed_event *te, 
4024                                 struct timeval t, void *private_data)
4025 {
4026         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4027         int ret;
4028         struct ctdb_vnn *vnn;
4029
4030         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4031                 /* we only send out updates for public addresses that 
4032                    we have taken over
4033                  */
4034                 if (ctdb->pnn != vnn->pnn) {
4035                         continue;
4036                 }
4037                 /* We only send out the updates if we need to */
4038                 if (!vnn->tcp_update_needed) {
4039                         continue;
4040                 }
4041                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
4042                                 TAKEOVER_TIMEOUT(),
4043                                 CTDB_BROADCAST_CONNECTED,
4044                                 &vnn->public_address,
4045                                 vnn->tcp_array);
4046                 if (ret != 0) {
4047                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4048                                 ctdb_addr_to_str(&vnn->public_address)));
4049                 }
4050         }
4051
4052         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4053                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4054                              ctdb_update_tcp_tickles, ctdb);
4055 }               
4056         
4057
4058 /*
4059   start periodic update of tcp tickles
4060  */
4061 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4062 {
4063         ctdb->tickle_update_context = talloc_new(ctdb);
4064
4065         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4066                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4067                              ctdb_update_tcp_tickles, ctdb);
4068 }
4069
4070
4071
4072
4073 struct control_gratious_arp {
4074         struct ctdb_context *ctdb;
4075         ctdb_sock_addr addr;
4076         const char *iface;
4077         int count;
4078 };
4079
4080 /*
4081   send a control_gratuitous arp
4082  */
4083 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4084                                   struct timeval t, void *private_data)
4085 {
4086         int ret;
4087         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4088                                                         struct control_gratious_arp);
4089
4090         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4091         if (ret != 0) {
4092                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4093                                  arp->iface, strerror(errno)));
4094         }
4095
4096
4097         arp->count++;
4098         if (arp->count == CTDB_ARP_REPEAT) {
4099                 talloc_free(arp);
4100                 return;
4101         }
4102
4103         event_add_timed(arp->ctdb->ev, arp, 
4104                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4105                         send_gratious_arp, arp);
4106 }
4107
4108
4109 /*
4110   send a gratious arp 
4111  */
4112 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4113 {
4114         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4115         struct control_gratious_arp *arp;
4116
4117         /* verify the size of indata */
4118         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4119                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4120                                  (unsigned)indata.dsize, 
4121                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4122                 return -1;
4123         }
4124         if (indata.dsize != 
4125                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4126                 + gratious_arp->len ) ){
4127
4128                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4129                         "but should be %u bytes\n", 
4130                          (unsigned)indata.dsize, 
4131                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4132                 return -1;
4133         }
4134
4135
4136         arp = talloc(ctdb, struct control_gratious_arp);
4137         CTDB_NO_MEMORY(ctdb, arp);
4138
4139         arp->ctdb  = ctdb;
4140         arp->addr   = gratious_arp->addr;
4141         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4142         CTDB_NO_MEMORY(ctdb, arp->iface);
4143         arp->count = 0;
4144         
4145         event_add_timed(arp->ctdb->ev, arp, 
4146                         timeval_zero(), send_gratious_arp, arp);
4147
4148         return 0;
4149 }
4150
4151 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4152 {
4153         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4154         int ret;
4155
4156         /* verify the size of indata */
4157         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4158                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4159                 return -1;
4160         }
4161         if (indata.dsize != 
4162                 ( offsetof(struct ctdb_control_ip_iface, iface)
4163                 + pub->len ) ){
4164
4165                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4166                         "but should be %u bytes\n", 
4167                          (unsigned)indata.dsize, 
4168                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4169                 return -1;
4170         }
4171
4172         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4173
4174         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4175
4176         if (ret != 0) {
4177                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4178                 return -1;
4179         }
4180
4181         return 0;
4182 }
4183
4184 /*
4185   called when releaseip event finishes for del_public_address
4186  */
4187 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
4188                                 void *private_data)
4189 {
4190         talloc_free(private_data);
4191 }
4192
4193 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4194 {
4195         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4196         struct ctdb_vnn *vnn;
4197         int ret;
4198
4199         /* verify the size of indata */
4200         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4201                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4202                 return -1;
4203         }
4204         if (indata.dsize != 
4205                 ( offsetof(struct ctdb_control_ip_iface, iface)
4206                 + pub->len ) ){
4207
4208                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4209                         "but should be %u bytes\n", 
4210                          (unsigned)indata.dsize, 
4211                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4212                 return -1;
4213         }
4214
4215         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4216
4217         /* walk over all public addresses until we find a match */
4218         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4219                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4220                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4221
4222                         DLIST_REMOVE(ctdb->vnn, vnn);
4223                         talloc_steal(mem_ctx, vnn);
4224                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
4225                         if (vnn->pnn != ctdb->pnn) {
4226                                 if (vnn->iface != NULL) {
4227                                         ctdb_vnn_unassign_iface(ctdb, vnn);
4228                                 }
4229                                 talloc_free(mem_ctx);
4230                                 return 0;
4231                         }
4232                         vnn->pnn = -1;
4233
4234                         ret = ctdb_event_script_callback(ctdb, 
4235                                          mem_ctx, delete_ip_callback, mem_ctx,
4236                                          false,
4237                                          CTDB_EVENT_RELEASE_IP,
4238                                          "%s %s %u",
4239                                          ctdb_vnn_iface_string(vnn),
4240                                          ctdb_addr_to_str(&vnn->public_address),
4241                                          vnn->public_netmask_bits);
4242                         if (vnn->iface != NULL) {
4243                                 ctdb_vnn_unassign_iface(ctdb, vnn);
4244                         }
4245                         if (ret != 0) {
4246                                 return -1;
4247                         }
4248                         return 0;
4249                 }
4250         }
4251
4252         return -1;
4253 }
4254
4255
/* carried from ctdb_control_ipreallocated() to its completion callback */
struct ipreallocated_callback_state {
	/* the control request that is answered when the event finishes */
	struct ctdb_req_control *c;
};
4259
4260 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4261                                         int status, void *p)
4262 {
4263         struct ipreallocated_callback_state *state =
4264                 talloc_get_type(p, struct ipreallocated_callback_state);
4265
4266         if (status != 0) {
4267                 DEBUG(DEBUG_ERR,
4268                       (" \"ipreallocated\" event script failed (status %d)\n",
4269                        status));
4270                 if (status == -ETIME) {
4271                         ctdb_ban_self(ctdb);
4272                 }
4273         }
4274
4275         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4276         talloc_free(state);
4277 }
4278
4279 /* A control to run the ipreallocated event */
4280 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4281                                    struct ctdb_req_control *c,
4282                                    bool *async_reply)
4283 {
4284         int ret;
4285         struct ipreallocated_callback_state *state;
4286
4287         state = talloc(ctdb, struct ipreallocated_callback_state);
4288         CTDB_NO_MEMORY(ctdb, state);
4289
4290         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4291
4292         ret = ctdb_event_script_callback(ctdb, state,
4293                                          ctdb_ipreallocated_callback, state,
4294                                          false, CTDB_EVENT_IPREALLOCATED,
4295                                          "%s", "");
4296
4297         if (ret != 0) {
4298                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4299                 talloc_free(state);
4300                 return -1;
4301         }
4302
4303         /* tell the control that we will be reply asynchronously */
4304         state->c    = talloc_steal(state, c);
4305         *async_reply = true;
4306
4307         return 0;
4308 }
4309
4310
4311 /* This function is called from the recovery daemon to verify that a remote
4312    node has the expected ip allocation.
4313    This is verified against ctdb->ip_tree
4314 */
4315 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4316                                 struct ctdb_all_public_ips *ips,
4317                                 uint32_t pnn)
4318 {
4319         struct ctdb_public_ip_list *tmp_ip; 
4320         int i;
4321
4322         if (ctdb->ip_tree == NULL) {
4323                 /* dont know the expected allocation yet, assume remote node
4324                    is correct. */
4325                 return 0;
4326         }
4327
4328         if (ips == NULL) {
4329                 return 0;
4330         }
4331
4332         for (i=0; i<ips->num; i++) {
4333                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4334                 if (tmp_ip == NULL) {
4335                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4336                         return -1;
4337                 }
4338
4339                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4340                         continue;
4341                 }
4342
4343                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4344                         DEBUG(DEBUG_ERR,
4345                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4346                                pnn,
4347                                ctdb_addr_to_str(&ips->ips[i].addr),
4348                                ips->ips[i].pnn, tmp_ip->pnn));
4349                         return -1;
4350                 }
4351         }
4352
4353         return 0;
4354 }
4355
4356 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4357 {
4358         struct ctdb_public_ip_list *tmp_ip; 
4359
4360         if (ctdb->ip_tree == NULL) {
4361                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4362                 return -1;
4363         }
4364
4365         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4366         if (tmp_ip == NULL) {
4367                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4368                 return -1;
4369         }
4370
4371         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4372         tmp_ip->pnn = ip->pnn;
4373
4374         return 0;
4375 }
4376
4377
/* bookkeeping for one running "reload public ips" request */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* control request answered by the destructor, NULL if none */
	struct ctdb_req_control *c;
	/* status sent in that reply */
	int status;
	/* pipe from the child: fd[0] is read by the daemon, fd[1] is
	   written by the child */
	int fd[2];
	/* pid of the forked child, killed by the destructor */
	pid_t child;
	/* fd event watching fd[0] */
	struct fd_event *fde;
};
4386
4387 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4388 {
4389         if (h == h->ctdb->reload_ips) {
4390                 h->ctdb->reload_ips = NULL;
4391         }
4392         if (h->c != NULL) {
4393                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4394                 h->c = NULL;
4395         }
4396         ctdb_kill(h->ctdb, h->child, SIGKILL);
4397         return 0;
4398 }
4399
4400 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4401                                 struct timed_event *te,
4402                                 struct timeval t, void *private_data)
4403 {
4404         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4405
4406         talloc_free(h);
4407 }       
4408
4409 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4410                              uint16_t flags, void *private_data)
4411 {
4412         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4413
4414         char res;
4415         int ret;
4416
4417         ret = read(h->fd[0], &res, 1);
4418         if (ret < 1 || res != 0) {
4419                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4420                 res = 1;
4421         }
4422         h->status = res;
4423
4424         talloc_free(h);
4425 }
4426
4427 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4428 {
4429         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4430         struct ctdb_all_public_ips *ips;
4431         struct ctdb_vnn *vnn;
4432         int i, ret;
4433
4434         CTDB_NO_MEMORY(ctdb, mem_ctx);
4435
4436         /* read the ip allocation from the local node */
4437         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4438         if (ret != 0) {
4439                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4440                 talloc_free(mem_ctx);
4441                 return -1;
4442         }
4443
4444         /* re-read the public ips file */
4445         ctdb->vnn = NULL;
4446         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4447                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4448                 talloc_free(mem_ctx);
4449                 return -1;
4450         }
4451
4452
4453         /* check the previous list of ips and scan for ips that have been
4454            dropped.
4455          */
4456         for (i = 0; i < ips->num; i++) {
4457                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4458                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4459                                 break;
4460                         }
4461                 }
4462
4463                 /* we need to delete this ip, no longer available on this node */
4464                 if (vnn == NULL) {
4465                         struct ctdb_control_ip_iface pub;
4466
4467                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4468                         pub.addr  = ips->ips[i].addr;
4469                         pub.mask  = 0;
4470                         pub.len   = 0;
4471
4472                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4473                         if (ret != 0) {
4474                                 talloc_free(mem_ctx);
4475                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4476                                 return -1;
4477                         }
4478                 }
4479         }
4480
4481
4482         /* loop over all new ones and check the ones we need to add */
4483         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4484                 for (i = 0; i < ips->num; i++) {
4485                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4486                                 break;
4487                         }
4488                 }
4489                 if (i == ips->num) {
4490                         struct ctdb_control_ip_iface *pub;
4491                         const char *ifaces = NULL;
4492                         int iface = 0;
4493
4494                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4495
4496                         pub = talloc_zero(mem_ctx, struct ctdb_control_ip_iface);
4497                         pub->addr  = vnn->public_address;
4498                         pub->mask  = vnn->public_netmask_bits;
4499
4500                         ifaces = vnn->ifaces[0];
4501                         iface = 1;
4502                         while (vnn->ifaces[iface] != NULL) {
4503                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4504                                 iface++;
4505                         }
4506                         pub->len   = strlen(ifaces)+1;
4507                         pub = talloc_realloc_size(mem_ctx, pub,
4508                                 offsetof(struct ctdb_control_ip_iface, iface) + pub->len);
4509                         if (pub == NULL) {
4510                                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory\n"));
4511                                 talloc_free(mem_ctx);
4512                                 return -1;
4513                         }
4514                         memcpy(&pub->iface[0], ifaces, pub->len);
4515
4516                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(),
4517                                                       CTDB_CURRENT_NODE, pub);
4518                         if (ret != 0) {
4519                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4520                                 talloc_free(mem_ctx);
4521                                 return -1;
4522                         }
4523                 }
4524         }
4525
4526         talloc_free(mem_ctx);
4527         return 0;
4528 }
4529
4530 /* This control is sent to force the node to re-read the public addresses file
4531    and drop any addresses we should nnot longer host, and add new addresses
4532    that we are now able to host
4533 */
4534 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4535 {
4536         struct ctdb_reloadips_handle *h;
4537         pid_t parent = getpid();
4538
4539         if (ctdb->reload_ips != NULL) {
4540                 talloc_free(ctdb->reload_ips);
4541                 ctdb->reload_ips = NULL;
4542         }
4543
4544         h = talloc(ctdb, struct ctdb_reloadips_handle);
4545         CTDB_NO_MEMORY(ctdb, h);
4546         h->ctdb     = ctdb;
4547         h->c        = NULL;
4548         h->status   = -1;
4549         
4550         if (pipe(h->fd) == -1) {
4551                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4552                 talloc_free(h);
4553                 return -1;
4554         }
4555
4556         h->child = ctdb_fork(ctdb);
4557         if (h->child == (pid_t)-1) {
4558                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4559                 close(h->fd[0]);
4560                 close(h->fd[1]);
4561                 talloc_free(h);
4562                 return -1;
4563         }
4564
4565         /* child process */
4566         if (h->child == 0) {
4567                 signed char res = 0;
4568
4569                 close(h->fd[0]);
4570                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4571
4572                 ctdb_set_process_name("ctdb_reloadips");
4573                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4574                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4575                         res = -1;
4576                 } else {
4577                         res = ctdb_reloadips_child(ctdb);
4578                         if (res != 0) {
4579                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4580                         }
4581                 }
4582
4583                 write(h->fd[1], &res, 1);
4584                 /* make sure we die when our parent dies */
4585                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4586                         sleep(5);
4587                 }
4588                 _exit(0);
4589         }
4590
4591         h->c             = talloc_steal(h, c);
4592
4593         close(h->fd[1]);
4594         set_close_on_exec(h->fd[0]);
4595
4596         talloc_set_destructor(h, ctdb_reloadips_destructor);
4597
4598
4599         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4600                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4601                         (void *)h);
4602         tevent_fd_set_auto_close(h->fde);
4603
4604         event_add_timed(ctdb->ev, h,
4605                         timeval_current_ofs(120, 0),
4606                         ctdb_reloadips_timeout_event, h);
4607
4608         /* we reply later */
4609         *async_reply = true;
4610         return 0;
4611 }