ctdb:daemon avoid goto ctdb_remove_orphaned_ifaces()
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout for takeover operations: the current time offset by the
 * takeover_timeout tunable.  The expansion refers to a variable named
 * "ctdb", so it can only be used where such a variable is in scope.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/*
 * CTDB_ARP_INTERVAL is the first argument handed to timeval_current_ofs()
 * when ctdb_control_send_arp() re-arms itself (presumably seconds - confirm).
 */
#define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp() stops re-arming once it has run this many times. */
#define CTDB_ARP_REPEAT   3
35
/*
 * Flags used in IP allocation algorithms.
 *
 * NOTE(review): neither field is referenced in this part of the file;
 * judging by the names they presumably mark a node that must not take
 * over IPs and a node that must not host IPs - confirm against the
 * allocation code.
 */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
};
41
/*
 * One local network interface known to the daemon.  Entries are kept
 * in the doubly linked list rooted at ctdb->ifaces.
 */
struct ctdb_iface {
        /* list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name; used as the lookup key */
        const char *name;
        /* only interfaces with link_up set are chosen for an address */
        bool link_up;
        /* number of public addresses currently assigned to this interface */
        uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn)
122 {
123         struct ctdb_iface *i, *next;
124
125         /* For each interface, check if there's an IP using it. */
126         for (i = ctdb->ifaces; i != NULL; i = next) {
127                 struct ctdb_vnn *tv;
128                 bool found;
129                 next = i->next;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         talloc_free(i);
156                 }
157         }
158 }
159
160
161 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
162                                           const char *iface)
163 {
164         struct ctdb_iface *i;
165
166         for (i=ctdb->ifaces;i;i=i->next) {
167                 if (strcmp(i->name, iface) == 0) {
168                         return i;
169                 }
170         }
171
172         return NULL;
173 }
174
175 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
176                                               struct ctdb_vnn *vnn)
177 {
178         int i;
179         struct ctdb_iface *cur = NULL;
180         struct ctdb_iface *best = NULL;
181
182         for (i=0; vnn->ifaces[i]; i++) {
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (!cur->link_up) {
190                         continue;
191                 }
192
193                 if (best == NULL) {
194                         best = cur;
195                         continue;
196                 }
197
198                 if (cur->references < best->references) {
199                         best = cur;
200                         continue;
201                 }
202         }
203
204         return best;
205 }
206
207 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
208                                      struct ctdb_vnn *vnn)
209 {
210         struct ctdb_iface *best = NULL;
211
212         if (vnn->iface) {
213                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
214                                    "still assigned to iface '%s'\n",
215                                    ctdb_addr_to_str(&vnn->public_address),
216                                    ctdb_vnn_iface_string(vnn)));
217                 return 0;
218         }
219
220         best = ctdb_vnn_best_iface(ctdb, vnn);
221         if (best == NULL) {
222                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
223                                   "cannot assign to iface any iface\n",
224                                   ctdb_addr_to_str(&vnn->public_address)));
225                 return -1;
226         }
227
228         vnn->iface = best;
229         best->references++;
230         vnn->pnn = ctdb->pnn;
231
232         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                            "now assigned to iface '%s' refs[%d]\n",
234                            ctdb_addr_to_str(&vnn->public_address),
235                            ctdb_vnn_iface_string(vnn),
236                            best->references));
237         return 0;
238 }
239
240 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
241                                     struct ctdb_vnn *vnn)
242 {
243         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
244                            "now unassigned (old iface '%s' refs[%d])\n",
245                            ctdb_addr_to_str(&vnn->public_address),
246                            ctdb_vnn_iface_string(vnn),
247                            vnn->iface?vnn->iface->references:0));
248         if (vnn->iface) {
249                 vnn->iface->references--;
250         }
251         vnn->iface = NULL;
252         if (vnn->pnn == ctdb->pnn) {
253                 vnn->pnn = -1;
254         }
255 }
256
257 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
258                                struct ctdb_vnn *vnn)
259 {
260         int i;
261
262         if (vnn->delete_pending) {
263                 return false;
264         }
265
266         if (vnn->iface && vnn->iface->link_up) {
267                 return true;
268         }
269
270         for (i=0; vnn->ifaces[i]; i++) {
271                 struct ctdb_iface *cur;
272
273                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
274                 if (cur == NULL) {
275                         continue;
276                 }
277
278                 if (cur->link_up) {
279                         return true;
280                 }
281         }
282
283         return false;
284 }
285
/*
 * State carried between runs of ctdb_control_send_arp() for one
 * public address.
 */
struct ctdb_takeover_arp {
        /* daemon context; supplies the event context used to re-arm the timer */
        struct ctdb_context *ctdb;
        /* number of times the handler has run so far */
        uint32_t count;
        /* address passed to ctdb_sys_send_arp() */
        ctdb_sock_addr addr;
        /* connections that get a tcp tickle ack on every run; may be NULL */
        struct ctdb_tcp_array *tcparray;
        /* the VNN being announced; supplies the interface name and takeover_ctx */
        struct ctdb_vnn *vnn;
};
293
294
295 /*
296   lists of tcp endpoints
297  */
298 struct ctdb_tcp_list {
299         struct ctdb_tcp_list *prev, *next;
300         struct ctdb_tcp_connection connection;
301 };
302
303 /*
304   list of clients to kill on IP release
305  */
306 struct ctdb_client_ip {
307         struct ctdb_client_ip *prev, *next;
308         struct ctdb_context *ctdb;
309         ctdb_sock_addr addr;
310         uint32_t client_id;
311 };
312
313
314 /*
315   send a gratuitous arp
316  */
317 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
318                                   struct timeval t, void *private_data)
319 {
320         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
321                                                         struct ctdb_takeover_arp);
322         int i, ret;
323         struct ctdb_tcp_array *tcparray;
324         const char *iface = ctdb_vnn_iface_string(arp->vnn);
325
326         ret = ctdb_sys_send_arp(&arp->addr, iface);
327         if (ret != 0) {
328                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
329                                   iface, strerror(errno)));
330         }
331
332         tcparray = arp->tcparray;
333         if (tcparray) {
334                 for (i=0;i<tcparray->num;i++) {
335                         struct ctdb_tcp_connection *tcon;
336
337                         tcon = &tcparray->connections[i];
338                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
339                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
340                                 ctdb_addr_to_str(&tcon->src_addr),
341                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
342                         ret = ctdb_sys_send_tcp(
343                                 &tcon->src_addr, 
344                                 &tcon->dst_addr,
345                                 0, 0, 0);
346                         if (ret != 0) {
347                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
348                                         ctdb_addr_to_str(&tcon->src_addr)));
349                         }
350                 }
351         }
352
353         arp->count++;
354
355         if (arp->count == CTDB_ARP_REPEAT) {
356                 talloc_free(arp);
357                 return;
358         }
359
360         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
361                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
362                         ctdb_control_send_arp, arp);
363 }
364
365 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
366                                        struct ctdb_vnn *vnn)
367 {
368         struct ctdb_takeover_arp *arp;
369         struct ctdb_tcp_array *tcparray;
370
371         if (!vnn->takeover_ctx) {
372                 vnn->takeover_ctx = talloc_new(vnn);
373                 if (!vnn->takeover_ctx) {
374                         return -1;
375                 }
376         }
377
378         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
379         if (!arp) {
380                 return -1;
381         }
382
383         arp->ctdb = ctdb;
384         arp->addr = vnn->public_address;
385         arp->vnn  = vnn;
386
387         tcparray = vnn->tcp_array;
388         if (tcparray) {
389                 /* add all of the known tcp connections for this IP to the
390                    list of tcp connections to send tickle acks for */
391                 arp->tcparray = talloc_steal(arp, tcparray);
392
393                 vnn->tcp_array = NULL;
394                 vnn->tcp_update_needed = true;
395         }
396
397         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
398                         timeval_zero(), ctdb_control_send_arp, arp);
399
400         return 0;
401 }
402
/*
 * Callback state bundling a control request, an address and a VNN.
 * NOTE(review): not used in this part of the file - presumably the
 * state for the release-IP callback; confirm at its point of use.
 */
struct takeover_callback_state {
        struct ctdb_req_control *c;
        ctdb_sock_addr *addr;
        struct ctdb_vnn *vnn;
};
408
/*
 * State handed from ctdb_do_takeip() to ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
        /* control request that is answered once the takeip event finishes */
        struct ctdb_req_control *c;
        /* the public address being taken over */
        struct ctdb_vnn *vnn;
};
413
414 /*
415   called when takeip event finishes
416  */
417 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
418                                     void *private_data)
419 {
420         struct ctdb_do_takeip_state *state =
421                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
422         int32_t ret;
423         TDB_DATA data;
424
425         if (status != 0) {
426                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
427         
428                 if (status == -ETIME) {
429                         ctdb_ban_self(ctdb);
430                 }
431                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
432                                  ctdb_addr_to_str(&state->vnn->public_address),
433                                  ctdb_vnn_iface_string(state->vnn)));
434                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
435
436                 node->flags |= NODE_FLAGS_UNHEALTHY;
437                 talloc_free(state);
438                 return;
439         }
440
441         if (ctdb->do_checkpublicip) {
442
443         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
444         if (ret != 0) {
445                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
446                 talloc_free(state);
447                 return;
448         }
449
450         }
451
452         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
453         data.dsize = strlen((char *)data.dptr) + 1;
454         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
455
456         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
457
458
459         /* the control succeeded */
460         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
461         talloc_free(state);
462         return;
463 }
464
465 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
466 {
467         state->vnn->update_in_flight = false;
468         return 0;
469 }
470
471 /*
472   take over an ip address
473  */
474 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
475                               struct ctdb_req_control *c,
476                               struct ctdb_vnn *vnn)
477 {
478         int ret;
479         struct ctdb_do_takeip_state *state;
480
481         if (vnn->update_in_flight) {
482                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
483                                     "update for this IP already in flight\n",
484                                     ctdb_addr_to_str(&vnn->public_address),
485                                     vnn->public_netmask_bits));
486                 return -1;
487         }
488
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
492                                  "assign a usable interface\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits));
495                 return -1;
496         }
497
498         state = talloc(vnn, struct ctdb_do_takeip_state);
499         CTDB_NO_MEMORY(ctdb, state);
500
501         state->c = talloc_steal(ctdb, c);
502         state->vnn   = vnn;
503
504         vnn->update_in_flight = true;
505         talloc_set_destructor(state, ctdb_takeip_destructor);
506
507         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
508                             ctdb_addr_to_str(&vnn->public_address),
509                             vnn->public_netmask_bits,
510                             ctdb_vnn_iface_string(vnn)));
511
512         ret = ctdb_event_script_callback(ctdb,
513                                          state,
514                                          ctdb_do_takeip_callback,
515                                          state,
516                                          CTDB_EVENT_TAKE_IP,
517                                          "%s %s %u",
518                                          ctdb_vnn_iface_string(vnn),
519                                          ctdb_addr_to_str(&vnn->public_address),
520                                          vnn->public_netmask_bits);
521
522         if (ret != 0) {
523                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
524                         ctdb_addr_to_str(&vnn->public_address),
525                         ctdb_vnn_iface_string(vnn)));
526                 talloc_free(state);
527                 return -1;
528         }
529
530         return 0;
531 }
532
/*
 * State handed from ctdb_do_updateip() to ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
        /* control request that is answered once the updateip event finishes */
        struct ctdb_req_control *c;
        /* interface the address was on before the move; restored on failure */
        struct ctdb_iface *old;
        /* the public address being moved */
        struct ctdb_vnn *vnn;
};
538
539 /*
540   called when updateip event finishes
541  */
542 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
543                                       void *private_data)
544 {
545         struct ctdb_do_updateip_state *state =
546                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
547         int32_t ret;
548
549         if (status != 0) {
550                 if (status == -ETIME) {
551                         ctdb_ban_self(ctdb);
552                 }
553                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
554                         ctdb_addr_to_str(&state->vnn->public_address),
555                         state->old->name,
556                         ctdb_vnn_iface_string(state->vnn)));
557
558                 /*
559                  * All we can do is reset the old interface
560                  * and let the next run fix it
561                  */
562                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
563                 state->vnn->iface = state->old;
564                 state->vnn->iface->references++;
565
566                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
567                 talloc_free(state);
568                 return;
569         }
570
571         if (ctdb->do_checkpublicip) {
572
573         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
574         if (ret != 0) {
575                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
576                 talloc_free(state);
577                 return;
578         }
579
580         }
581
582         /* the control succeeded */
583         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
584         talloc_free(state);
585         return;
586 }
587
588 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
589 {
590         state->vnn->update_in_flight = false;
591         return 0;
592 }
593
594 /*
595   update (move) an ip address
596  */
597 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
598                                 struct ctdb_req_control *c,
599                                 struct ctdb_vnn *vnn)
600 {
601         int ret;
602         struct ctdb_do_updateip_state *state;
603         struct ctdb_iface *old = vnn->iface;
604         const char *new_name;
605
606         if (vnn->update_in_flight) {
607                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
608                                     "update for this IP already in flight\n",
609                                     ctdb_addr_to_str(&vnn->public_address),
610                                     vnn->public_netmask_bits));
611                 return -1;
612         }
613
614         ctdb_vnn_unassign_iface(ctdb, vnn);
615         ret = ctdb_vnn_assign_iface(ctdb, vnn);
616         if (ret != 0) {
617                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
618                                  "assin a usable interface (old iface '%s')\n",
619                                  ctdb_addr_to_str(&vnn->public_address),
620                                  vnn->public_netmask_bits,
621                                  old->name));
622                 return -1;
623         }
624
625         new_name = ctdb_vnn_iface_string(vnn);
626         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
627                 /* A benign update from one interface onto itself.
628                  * no need to run the eventscripts in this case, just return
629                  * success.
630                  */
631                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
632                 return 0;
633         }
634
635         state = talloc(vnn, struct ctdb_do_updateip_state);
636         CTDB_NO_MEMORY(ctdb, state);
637
638         state->c = talloc_steal(ctdb, c);
639         state->old = old;
640         state->vnn = vnn;
641
642         vnn->update_in_flight = true;
643         talloc_set_destructor(state, ctdb_updateip_destructor);
644
645         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
646                             "interface %s to %s\n",
647                             ctdb_addr_to_str(&vnn->public_address),
648                             vnn->public_netmask_bits,
649                             old->name,
650                             new_name));
651
652         ret = ctdb_event_script_callback(ctdb,
653                                          state,
654                                          ctdb_do_updateip_callback,
655                                          state,
656                                          CTDB_EVENT_UPDATE_IP,
657                                          "%s %s %s %u",
658                                          state->old->name,
659                                          new_name,
660                                          ctdb_addr_to_str(&vnn->public_address),
661                                          vnn->public_netmask_bits);
662         if (ret != 0) {
663                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
664                                  ctdb_addr_to_str(&vnn->public_address),
665                                  old->name, new_name));
666                 talloc_free(state);
667                 return -1;
668         }
669
670         return 0;
671 }
672
673 /*
674   Find the vnn of the node that has a public ip address
675   returns -1 if the address is not known as a public address
676  */
677 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
678 {
679         struct ctdb_vnn *vnn;
680
681         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
682                 if (ctdb_same_ip(&vnn->public_address, addr)) {
683                         return vnn;
684                 }
685         }
686
687         return NULL;
688 }
689
690 /*
691   take over an ip address
692  */
693 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
694                                  struct ctdb_req_control *c,
695                                  TDB_DATA indata,
696                                  bool *async_reply)
697 {
698         int ret;
699         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
700         struct ctdb_vnn *vnn;
701         bool have_ip = false;
702         bool do_updateip = false;
703         bool do_takeip = false;
704         struct ctdb_iface *best_iface = NULL;
705
706         if (pip->pnn != ctdb->pnn) {
707                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
708                                  "with pnn %d, but we're node %d\n",
709                                  ctdb_addr_to_str(&pip->addr),
710                                  pip->pnn, ctdb->pnn));
711                 return -1;
712         }
713
714         /* update out vnn list */
715         vnn = find_public_ip_vnn(ctdb, &pip->addr);
716         if (vnn == NULL) {
717                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
718                         ctdb_addr_to_str(&pip->addr)));
719                 return 0;
720         }
721
722         if (ctdb->do_checkpublicip) {
723                 have_ip = ctdb_sys_have_ip(&pip->addr);
724         }
725         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
726         if (best_iface == NULL) {
727                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
728                                  "a usable interface (old %s, have_ip %d)\n",
729                                  ctdb_addr_to_str(&vnn->public_address),
730                                  vnn->public_netmask_bits,
731                                  ctdb_vnn_iface_string(vnn),
732                                  have_ip));
733                 return -1;
734         }
735
736         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
737                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
738                 have_ip = false;
739         }
740
741
742         if (vnn->iface == NULL && have_ip) {
743                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
744                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
745                                  ctdb_addr_to_str(&vnn->public_address)));
746                 return 0;
747         }
748
749         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
750                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
751                                   "and we have it on iface[%s], but it was assigned to node %d"
752                                   "and we are node %d, banning ourself\n",
753                                  ctdb_addr_to_str(&vnn->public_address),
754                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
755                 ctdb_ban_self(ctdb);
756                 return -1;
757         }
758
759         if (vnn->pnn == -1 && have_ip) {
760                 vnn->pnn = ctdb->pnn;
761                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
762                                   "and we already have it on iface[%s], update local daemon\n",
763                                  ctdb_addr_to_str(&vnn->public_address),
764                                   ctdb_vnn_iface_string(vnn)));
765                 return 0;
766         }
767
768         if (vnn->iface) {
769                 if (vnn->iface != best_iface) {
770                         if (!vnn->iface->link_up) {
771                                 do_updateip = true;
772                         } else if (vnn->iface->references > (best_iface->references + 1)) {
773                                 /* only move when the rebalance gains something */
774                                         do_updateip = true;
775                         }
776                 }
777         }
778
779         if (!have_ip) {
780                 if (do_updateip) {
781                         ctdb_vnn_unassign_iface(ctdb, vnn);
782                         do_updateip = false;
783                 }
784                 do_takeip = true;
785         }
786
787         if (do_takeip) {
788                 ret = ctdb_do_takeip(ctdb, c, vnn);
789                 if (ret != 0) {
790                         return -1;
791                 }
792         } else if (do_updateip) {
793                 ret = ctdb_do_updateip(ctdb, c, vnn);
794                 if (ret != 0) {
795                         return -1;
796                 }
797         } else {
798                 /*
799                  * The interface is up and the kernel known the ip
800                  * => do nothing
801                  */
802                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits,
805                         ctdb_vnn_iface_string(vnn)));
806                 return 0;
807         }
808
809         /* tell ctdb_control.c that we will be replying asynchronously */
810         *async_reply = true;
811
812         return 0;
813 }
814
815 /*
816   takeover an ip address old v4 style
817  */
818 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
819                                 struct ctdb_req_control *c,
820                                 TDB_DATA indata, 
821                                 bool *async_reply)
822 {
823         TDB_DATA data;
824         
825         data.dsize = sizeof(struct ctdb_public_ip);
826         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
827         CTDB_NO_MEMORY(ctdb, data.dptr);
828         
829         memcpy(data.dptr, indata.dptr, indata.dsize);
830         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
831 }
832
833 /*
834   kill any clients that are registered with a IP that is being released
835  */
836 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
837 {
838         struct ctdb_client_ip *ip;
839
840         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
841                 ctdb_addr_to_str(addr)));
842
843         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
844                 ctdb_sock_addr tmp_addr;
845
846                 tmp_addr = ip->addr;
847                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
848                         ip->client_id,
849                         ctdb_addr_to_str(&ip->addr)));
850
851                 if (ctdb_same_ip(&tmp_addr, addr)) {
852                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
853                                                                      ip->client_id, 
854                                                                      struct ctdb_client);
855                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
856                                 ip->client_id,
857                                 ctdb_addr_to_str(&ip->addr),
858                                 client->pid));
859
860                         if (client->pid != 0) {
861                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
862                                         (unsigned)client->pid,
863                                         ctdb_addr_to_str(addr),
864                                         ip->client_id));
865                                 kill(client->pid, SIGKILL);
866                         }
867                 }
868         }
869 }
870
871 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
872 {
873         DLIST_REMOVE(ctdb->vnn, vnn);
874         ctdb_vnn_unassign_iface(ctdb, vnn);
875         ctdb_remove_orphaned_ifaces(ctdb, vnn);
876         talloc_free(vnn);
877 }
878
879 /*
880   called when releaseip event finishes
881  */
882 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
883                                 void *private_data)
884 {
885         struct takeover_callback_state *state = 
886                 talloc_get_type(private_data, struct takeover_callback_state);
887         TDB_DATA data;
888
889         if (status == -ETIME) {
890                 ctdb_ban_self(ctdb);
891         }
892
893         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
894                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
895                                   ctdb_addr_to_str(state->addr)));
896                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
897                 talloc_free(state);
898                 return;
899         }
900
901         /* send a message to all clients of this node telling them
902            that the cluster has been reconfigured and they should
903            release any sockets on this IP */
904         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
905         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
906         data.dsize = strlen((char *)data.dptr)+1;
907
908         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
909
910         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
911
912         /* kill clients that have registered with this IP */
913         release_kill_clients(ctdb, state->addr);
914
915         ctdb_vnn_unassign_iface(ctdb, state->vnn);
916
917         /* Process the IP if it has been marked for deletion */
918         if (state->vnn->delete_pending) {
919                 do_delete_ip(ctdb, state->vnn);
920                 state->vnn = NULL;
921         }
922
923         /* the control succeeded */
924         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
925         talloc_free(state);
926 }
927
928 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
929 {
930         if (state->vnn != NULL) {
931                 state->vnn->update_in_flight = false;
932         }
933         return 0;
934 }
935
936 /*
937   release an ip address
938  */
939 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
940                                 struct ctdb_req_control *c,
941                                 TDB_DATA indata, 
942                                 bool *async_reply)
943 {
944         int ret;
945         struct takeover_callback_state *state;
946         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
947         struct ctdb_vnn *vnn;
948         char *iface;
949
950         /* update our vnn list */
951         vnn = find_public_ip_vnn(ctdb, &pip->addr);
952         if (vnn == NULL) {
953                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
954                         ctdb_addr_to_str(&pip->addr)));
955                 return 0;
956         }
957         vnn->pnn = pip->pnn;
958
959         /* stop any previous arps */
960         talloc_free(vnn->takeover_ctx);
961         vnn->takeover_ctx = NULL;
962
963         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
964          * lazy multicast to drop an IP from any node that isn't the
965          * intended new node.  The following causes makes ctdbd ignore
966          * a release for any address it doesn't host.
967          */
968         if (ctdb->do_checkpublicip) {
969                 if (!ctdb_sys_have_ip(&pip->addr)) {
970                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
971                                 ctdb_addr_to_str(&pip->addr),
972                                 vnn->public_netmask_bits,
973                                 ctdb_vnn_iface_string(vnn)));
974                         ctdb_vnn_unassign_iface(ctdb, vnn);
975                         return 0;
976                 }
977         } else {
978                 if (vnn->iface == NULL) {
979                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
980                                            ctdb_addr_to_str(&pip->addr),
981                                            vnn->public_netmask_bits));
982                         return 0;
983                 }
984         }
985
986         /* There is a potential race between take_ip and us because we
987          * update the VNN via a callback that run when the
988          * eventscripts have been run.  Avoid the race by allowing one
989          * update to be in flight at a time.
990          */
991         if (vnn->update_in_flight) {
992                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
993                                     "update for this IP already in flight\n",
994                                     ctdb_addr_to_str(&vnn->public_address),
995                                     vnn->public_netmask_bits));
996                 return -1;
997         }
998
999         if (ctdb->do_checkpublicip) {
1000                 iface = ctdb_sys_find_ifname(&pip->addr);
1001                 if (iface == NULL) {
1002                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
1003                         return 0;
1004                 }
1005                 if (vnn->iface == NULL) {
1006                         DEBUG(DEBUG_WARNING,
1007                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
1008                                ctdb_addr_to_str(&pip->addr),
1009                                iface));
1010                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
1011                         DEBUG(DEBUG_WARNING,
1012                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
1013                                ctdb_addr_to_str(&pip->addr),
1014                                iface,
1015                                ctdb_vnn_iface_string(vnn)));
1016                         /* Should we fix vnn->iface?  If we do, what
1017                          * happens to reference counts?
1018                          */
1019                 }
1020         } else {
1021                 iface = strdup(ctdb_vnn_iface_string(vnn));
1022         }
1023
1024         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1025                 ctdb_addr_to_str(&pip->addr),
1026                 vnn->public_netmask_bits,
1027                 iface,
1028                 pip->pnn));
1029
1030         state = talloc(ctdb, struct takeover_callback_state);
1031         CTDB_NO_MEMORY(ctdb, state);
1032
1033         state->c = talloc_steal(state, c);
1034         state->addr = talloc(state, ctdb_sock_addr);       
1035         CTDB_NO_MEMORY(ctdb, state->addr);
1036         *state->addr = pip->addr;
1037         state->vnn   = vnn;
1038
1039         vnn->update_in_flight = true;
1040         talloc_set_destructor(state, ctdb_releaseip_destructor);
1041
1042         ret = ctdb_event_script_callback(ctdb, 
1043                                          state, release_ip_callback, state,
1044                                          CTDB_EVENT_RELEASE_IP,
1045                                          "%s %s %u",
1046                                          iface,
1047                                          ctdb_addr_to_str(&pip->addr),
1048                                          vnn->public_netmask_bits);
1049         free(iface);
1050         if (ret != 0) {
1051                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1052                         ctdb_addr_to_str(&pip->addr),
1053                         ctdb_vnn_iface_string(vnn)));
1054                 talloc_free(state);
1055                 return -1;
1056         }
1057
1058         /* tell the control that we will be reply asynchronously */
1059         *async_reply = true;
1060         return 0;
1061 }
1062
1063 /*
1064   release an ip address old v4 style
1065  */
1066 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1067                                 struct ctdb_req_control *c,
1068                                 TDB_DATA indata, 
1069                                 bool *async_reply)
1070 {
1071         TDB_DATA data;
1072         
1073         data.dsize = sizeof(struct ctdb_public_ip);
1074         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1075         CTDB_NO_MEMORY(ctdb, data.dptr);
1076         
1077         memcpy(data.dptr, indata.dptr, indata.dsize);
1078         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1079 }
1080
1081
1082 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1083                                    ctdb_sock_addr *addr,
1084                                    unsigned mask, const char *ifaces,
1085                                    bool check_address)
1086 {
1087         struct ctdb_vnn      *vnn;
1088         uint32_t num = 0;
1089         char *tmp;
1090         const char *iface;
1091         int i;
1092         int ret;
1093
1094         tmp = strdup(ifaces);
1095         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1096                 if (!ctdb_sys_check_iface_exists(iface)) {
1097                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1098                         free(tmp);
1099                         return -1;
1100                 }
1101         }
1102         free(tmp);
1103
1104         /* Verify that we dont have an entry for this ip yet */
1105         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1106                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1107                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1108                                 ctdb_addr_to_str(addr)));
1109                         return -1;
1110                 }               
1111         }
1112
1113         /* create a new vnn structure for this ip address */
1114         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1115         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1116         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1117         tmp = talloc_strdup(vnn, ifaces);
1118         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1119         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1120                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1121                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1122                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1123                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1124                 num++;
1125         }
1126         talloc_free(tmp);
1127         vnn->ifaces[num] = NULL;
1128         vnn->public_address      = *addr;
1129         vnn->public_netmask_bits = mask;
1130         vnn->pnn                 = -1;
1131         if (check_address) {
1132                 if (ctdb_sys_have_ip(addr)) {
1133                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1134                         vnn->pnn = ctdb->pnn;
1135                 }
1136         }
1137
1138         for (i=0; vnn->ifaces[i]; i++) {
1139                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1140                 if (ret != 0) {
1141                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1142                                            "for public_address[%s]\n",
1143                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1144                         talloc_free(vnn);
1145                         return -1;
1146                 }
1147         }
1148
1149         DLIST_ADD(ctdb->vnn, vnn);
1150
1151         return 0;
1152 }
1153
1154 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1155                                   struct timeval t, void *private_data)
1156 {
1157         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1158                                                         struct ctdb_context);
1159         struct ctdb_vnn *vnn;
1160
1161         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1162                 int i;
1163
1164                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1165                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1166                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1167                                         vnn->ifaces[i],
1168                                         ctdb_addr_to_str(&vnn->public_address)));
1169                         }
1170                 }
1171         }
1172
1173         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1174                 timeval_current_ofs(30, 0), 
1175                 ctdb_check_interfaces_event, ctdb);
1176 }
1177
1178
1179 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1180 {
1181         if (ctdb->check_public_ifaces_ctx != NULL) {
1182                 talloc_free(ctdb->check_public_ifaces_ctx);
1183                 ctdb->check_public_ifaces_ctx = NULL;
1184         }
1185
1186         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1187         if (ctdb->check_public_ifaces_ctx == NULL) {
1188                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1189         }
1190
1191         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1192                 timeval_current_ofs(30, 0), 
1193                 ctdb_check_interfaces_event, ctdb);
1194
1195         return 0;
1196 }
1197
1198
1199 /*
1200   setup the public address lists from a file
1201 */
1202 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1203 {
1204         char **lines;
1205         int nlines;
1206         int i;
1207
1208         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1209         if (lines == NULL) {
1210                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1211                 return -1;
1212         }
1213         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1214                 nlines--;
1215         }
1216
1217         for (i=0;i<nlines;i++) {
1218                 unsigned mask;
1219                 ctdb_sock_addr addr;
1220                 const char *addrstr;
1221                 const char *ifaces;
1222                 char *tok, *line;
1223
1224                 line = lines[i];
1225                 while ((*line == ' ') || (*line == '\t')) {
1226                         line++;
1227                 }
1228                 if (*line == '#') {
1229                         continue;
1230                 }
1231                 if (strcmp(line, "") == 0) {
1232                         continue;
1233                 }
1234                 tok = strtok(line, " \t");
1235                 addrstr = tok;
1236                 tok = strtok(NULL, " \t");
1237                 if (tok == NULL) {
1238                         if (NULL == ctdb->default_public_interface) {
1239                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1240                                          i+1));
1241                                 talloc_free(lines);
1242                                 return -1;
1243                         }
1244                         ifaces = ctdb->default_public_interface;
1245                 } else {
1246                         ifaces = tok;
1247                 }
1248
1249                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1250                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1251                         talloc_free(lines);
1252                         return -1;
1253                 }
1254                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1255                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1256                         talloc_free(lines);
1257                         return -1;
1258                 }
1259         }
1260
1261
1262         talloc_free(lines);
1263         return 0;
1264 }
1265
1266 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1267                               const char *iface,
1268                               const char *ip)
1269 {
1270         struct ctdb_vnn *svnn;
1271         struct ctdb_iface *cur = NULL;
1272         bool ok;
1273         int ret;
1274
1275         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1276         CTDB_NO_MEMORY(ctdb, svnn);
1277
1278         svnn->ifaces = talloc_array(svnn, const char *, 2);
1279         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1280         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1281         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1282         svnn->ifaces[1] = NULL;
1283
1284         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1285         if (!ok) {
1286                 talloc_free(svnn);
1287                 return -1;
1288         }
1289
1290         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1291         if (ret != 0) {
1292                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1293                                    "for single_ip[%s]\n",
1294                                    svnn->ifaces[0],
1295                                    ctdb_addr_to_str(&svnn->public_address)));
1296                 talloc_free(svnn);
1297                 return -1;
1298         }
1299
1300         /* assume the single public ip interface is initially "good" */
1301         cur = ctdb_find_iface(ctdb, iface);
1302         if (cur == NULL) {
1303                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1304                 return -1;
1305         }
1306         cur->link_up = true;
1307
1308         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1309         if (ret != 0) {
1310                 talloc_free(svnn);
1311                 return -1;
1312         }
1313
1314         ctdb->single_ip_vnn = svnn;
1315         return 0;
1316 }
1317
1318 struct ctdb_public_ip_list {
1319         struct ctdb_public_ip_list *next;
1320         uint32_t pnn;
1321         ctdb_sock_addr addr;
1322 };
1323
1324 /* Given a physical node, return the number of
1325    public addresses that is currently assigned to this node.
1326 */
1327 static int node_ip_coverage(struct ctdb_context *ctdb, 
1328         int32_t pnn,
1329         struct ctdb_public_ip_list *ips)
1330 {
1331         int num=0;
1332
1333         for (;ips;ips=ips->next) {
1334                 if (ips->pnn == pnn) {
1335                         num++;
1336                 }
1337         }
1338         return num;
1339 }
1340
1341
1342 /* Can the given node host the given IP: is the public IP known to the
1343  * node and is NOIPHOST unset?
1344 */
1345 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1346                              struct ctdb_ipflags ipflags,
1347                              struct ctdb_public_ip_list *ip)
1348 {
1349         struct ctdb_all_public_ips *public_ips;
1350         int i;
1351
1352         if (ipflags.noiphost) {
1353                 return false;
1354         }
1355
1356         public_ips = ctdb->nodes[pnn]->available_public_ips;
1357
1358         if (public_ips == NULL) {
1359                 return false;
1360         }
1361
1362         for (i=0; i<public_ips->num; i++) {
1363                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1364                         /* yes, this node can serve this public ip */
1365                         return true;
1366                 }
1367         }
1368
1369         return false;
1370 }
1371
1372 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1373                                  struct ctdb_ipflags ipflags,
1374                                  struct ctdb_public_ip_list *ip)
1375 {
1376         if (ipflags.noiptakeover) {
1377                 return false;
1378         }
1379
1380         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1381 }
1382
1383 /* search the node lists list for a node to takeover this ip.
1384    pick the node that currently are serving the least number of ips
1385    so that the ips get spread out evenly.
1386 */
1387 static int find_takeover_node(struct ctdb_context *ctdb, 
1388                 struct ctdb_ipflags *ipflags,
1389                 struct ctdb_public_ip_list *ip,
1390                 struct ctdb_public_ip_list *all_ips)
1391 {
1392         int pnn, min=0, num;
1393         int i, numnodes;
1394
1395         numnodes = talloc_array_length(ipflags);
1396         pnn    = -1;
1397         for (i=0; i<numnodes; i++) {
1398                 /* verify that this node can serve this ip */
1399                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1400                         /* no it couldnt   so skip to the next node */
1401                         continue;
1402                 }
1403
1404                 num = node_ip_coverage(ctdb, i, all_ips);
1405                 /* was this the first node we checked ? */
1406                 if (pnn == -1) {
1407                         pnn = i;
1408                         min  = num;
1409                 } else {
1410                         if (num < min) {
1411                                 pnn = i;
1412                                 min  = num;
1413                         }
1414                 }
1415         }       
1416         if (pnn == -1) {
1417                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1418                         ctdb_addr_to_str(&ip->addr)));
1419
1420                 return -1;
1421         }
1422
1423         ip->pnn = pnn;
1424         return 0;
1425 }
1426
1427 #define IP_KEYLEN       4
1428 static uint32_t *ip_key(ctdb_sock_addr *ip)
1429 {
1430         static uint32_t key[IP_KEYLEN];
1431
1432         bzero(key, sizeof(key));
1433
1434         switch (ip->sa.sa_family) {
1435         case AF_INET:
1436                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1437                 break;
1438         case AF_INET6: {
1439                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1440                 key[0]  = htonl(s6_a32[0]);
1441                 key[1]  = htonl(s6_a32[1]);
1442                 key[2]  = htonl(s6_a32[2]);
1443                 key[3]  = htonl(s6_a32[3]);
1444                 break;
1445         }
1446         default:
1447                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1448                 return key;
1449         }
1450
1451         return key;
1452 }
1453
1454 static void *add_ip_callback(void *parm, void *data)
1455 {
1456         struct ctdb_public_ip_list *this_ip = parm; 
1457         struct ctdb_public_ip_list *prev_ip = data; 
1458
1459         if (prev_ip == NULL) {
1460                 return parm;
1461         }
1462         if (this_ip->pnn == -1) {
1463                 this_ip->pnn = prev_ip->pnn;
1464         }
1465
1466         return parm;
1467 }
1468
1469 static int getips_count_callback(void *param, void *data)
1470 {
1471         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1472         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1473
1474         new_ip->next = *ip_list;
1475         *ip_list     = new_ip;
1476         return 0;
1477 }
1478
1479 static struct ctdb_public_ip_list *
1480 create_merged_ip_list(struct ctdb_context *ctdb)
1481 {
1482         int i, j;
1483         struct ctdb_public_ip_list *ip_list;
1484         struct ctdb_all_public_ips *public_ips;
1485
1486         if (ctdb->ip_tree != NULL) {
1487                 talloc_free(ctdb->ip_tree);
1488                 ctdb->ip_tree = NULL;
1489         }
1490         ctdb->ip_tree = trbt_create(ctdb, 0);
1491
1492         for (i=0;i<ctdb->num_nodes;i++) {
1493                 public_ips = ctdb->nodes[i]->known_public_ips;
1494
1495                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1496                         continue;
1497                 }
1498
1499                 /* there were no public ips for this node */
1500                 if (public_ips == NULL) {
1501                         continue;
1502                 }               
1503
1504                 for (j=0;j<public_ips->num;j++) {
1505                         struct ctdb_public_ip_list *tmp_ip; 
1506
1507                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1508                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1509                         /* Do not use information about IP addresses hosted
1510                          * on other nodes, it may not be accurate */
1511                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1512                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1513                         } else {
1514                                 tmp_ip->pnn = -1;
1515                         }
1516                         tmp_ip->addr = public_ips->ips[j].addr;
1517                         tmp_ip->next = NULL;
1518
1519                         trbt_insertarray32_callback(ctdb->ip_tree,
1520                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1521                                 add_ip_callback,
1522                                 tmp_ip);
1523                 }
1524         }
1525
1526         ip_list = NULL;
1527         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1528
1529         return ip_list;
1530 }
1531
1532 /* 
1533  * This is the length of the longtest common prefix between the IPs.
1534  * It is calculated by XOR-ing the 2 IPs together and counting the
1535  * number of leading zeroes.  The implementation means that all
1536  * addresses end up being 128 bits long.
1537  *
1538  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1539  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1540  * lots of nodes and IP addresses?
1541  */
1542 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1543 {
1544         uint32_t ip1_k[IP_KEYLEN];
1545         uint32_t *t;
1546         int i;
1547         uint32_t x;
1548
1549         uint32_t distance = 0;
1550
1551         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1552         t = ip_key(ip2);
1553         for (i=0; i<IP_KEYLEN; i++) {
1554                 x = ip1_k[i] ^ t[i];
1555                 if (x == 0) {
1556                         distance += 32;
1557                 } else {
1558                         /* Count number of leading zeroes. 
1559                          * FIXME? This could be optimised...
1560                          */
1561                         while ((x & (1 << 31)) == 0) {
1562                                 x <<= 1;
1563                                 distance += 1;
1564                         }
1565                 }
1566         }
1567
1568         return distance;
1569 }
1570
1571 /* Calculate the IP distance for the given IP relative to IPs on the
1572    given node.  The ips argument is generally the all_ips variable
1573    used in the main part of the algorithm.
1574  */
1575 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1576                                   struct ctdb_public_ip_list *ips,
1577                                   int pnn)
1578 {
1579         struct ctdb_public_ip_list *t;
1580         uint32_t d;
1581
1582         uint32_t sum = 0;
1583
1584         for (t=ips; t != NULL; t=t->next) {
1585                 if (t->pnn != pnn) {
1586                         continue;
1587                 }
1588
1589                 /* Optimisation: We never calculate the distance
1590                  * between an address and itself.  This allows us to
1591                  * calculate the effect of removing an address from a
1592                  * node by simply calculating the distance between
1593                  * that address and all of the exitsing addresses.
1594                  * Moreover, we assume that we're only ever dealing
1595                  * with addresses from all_ips so we can identify an
1596                  * address via a pointer rather than doing a more
1597                  * expensive address comparison. */
1598                 if (&(t->addr) == ip) {
1599                         continue;
1600                 }
1601
1602                 d = ip_distance(ip, &(t->addr));
1603                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1604         }
1605
1606         return sum;
1607 }
1608
1609 /* Return the LCP2 imbalance metric for addresses currently assigned
1610    to the given node.
1611  */
1612 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1613 {
1614         struct ctdb_public_ip_list *t;
1615
1616         uint32_t imbalance = 0;
1617
1618         for (t=all_ips; t!=NULL; t=t->next) {
1619                 if (t->pnn != pnn) {
1620                         continue;
1621                 }
1622                 /* Pass the rest of the IPs rather than the whole
1623                    all_ips input list.
1624                 */
1625                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1626         }
1627
1628         return imbalance;
1629 }
1630
1631 /* Allocate any unassigned IPs just by looping through the IPs and
1632  * finding the best node for each.
1633  */
1634 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1635                                       struct ctdb_ipflags *ipflags,
1636                                       struct ctdb_public_ip_list *all_ips)
1637 {
1638         struct ctdb_public_ip_list *tmp_ip;
1639
1640         /* loop over all ip's and find a physical node to cover for 
1641            each unassigned ip.
1642         */
1643         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1644                 if (tmp_ip->pnn == -1) {
1645                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1646                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1647                                         ctdb_addr_to_str(&tmp_ip->addr)));
1648                         }
1649                 }
1650         }
1651 }
1652
1653 /* Basic non-deterministic rebalancing algorithm.
1654  */
1655 static void basic_failback(struct ctdb_context *ctdb,
1656                            struct ctdb_ipflags *ipflags,
1657                            struct ctdb_public_ip_list *all_ips,
1658                            int num_ips)
1659 {
1660         int i, numnodes;
1661         int maxnode, maxnum, minnode, minnum, num, retries;
1662         struct ctdb_public_ip_list *tmp_ip;
1663
1664         numnodes = talloc_array_length(ipflags);
1665         retries = 0;
1666
1667 try_again:
1668         maxnum=0;
1669         minnum=0;
1670
1671         /* for each ip address, loop over all nodes that can serve
1672            this ip and make sure that the difference between the node
1673            serving the most and the node serving the least ip's are
1674            not greater than 1.
1675         */
1676         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1677                 if (tmp_ip->pnn == -1) {
1678                         continue;
1679                 }
1680
1681                 /* Get the highest and lowest number of ips's served by any 
1682                    valid node which can serve this ip.
1683                 */
1684                 maxnode = -1;
1685                 minnode = -1;
1686                 for (i=0; i<numnodes; i++) {
1687                         /* only check nodes that can actually serve this ip */
1688                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1689                                 /* no it couldnt   so skip to the next node */
1690                                 continue;
1691                         }
1692
1693                         num = node_ip_coverage(ctdb, i, all_ips);
1694                         if (maxnode == -1) {
1695                                 maxnode = i;
1696                                 maxnum  = num;
1697                         } else {
1698                                 if (num > maxnum) {
1699                                         maxnode = i;
1700                                         maxnum  = num;
1701                                 }
1702                         }
1703                         if (minnode == -1) {
1704                                 minnode = i;
1705                                 minnum  = num;
1706                         } else {
1707                                 if (num < minnum) {
1708                                         minnode = i;
1709                                         minnum  = num;
1710                                 }
1711                         }
1712                 }
1713                 if (maxnode == -1) {
1714                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1715                                 ctdb_addr_to_str(&tmp_ip->addr)));
1716
1717                         continue;
1718                 }
1719
1720                 /* if the spread between the smallest and largest coverage by
1721                    a node is >=2 we steal one of the ips from the node with
1722                    most coverage to even things out a bit.
1723                    try to do this a limited number of times since we dont
1724                    want to spend too much time balancing the ip coverage.
1725                 */
1726                 if ( (maxnum > minnum+1)
1727                      && (retries < (num_ips + 5)) ){
1728                         struct ctdb_public_ip_list *tmp;
1729
1730                         /* Reassign one of maxnode's VNNs */
1731                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1732                                 if (tmp->pnn == maxnode) {
1733                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1734                                         retries++;
1735                                         goto try_again;;
1736                                 }
1737                         }
1738                 }
1739         }
1740 }
1741
1742 static void lcp2_init(struct ctdb_context *tmp_ctx,
1743                       struct ctdb_ipflags *ipflags,
1744                       struct ctdb_public_ip_list *all_ips,
1745                       uint32_t *force_rebalance_nodes,
1746                       uint32_t **lcp2_imbalances,
1747                       bool **rebalance_candidates)
1748 {
1749         int i, numnodes;
1750         struct ctdb_public_ip_list *tmp_ip;
1751
1752         numnodes = talloc_array_length(ipflags);
1753
1754         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1755         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1756         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1757         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1758
1759         for (i=0; i<numnodes; i++) {
1760                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1761                 /* First step: assume all nodes are candidates */
1762                 (*rebalance_candidates)[i] = true;
1763         }
1764
1765         /* 2nd step: if a node has IPs assigned then it must have been
1766          * healthy before, so we remove it from consideration.  This
1767          * is overkill but is all we have because we don't maintain
1768          * state between takeover runs.  An alternative would be to
1769          * keep state and invalidate it every time the recovery master
1770          * changes.
1771          */
1772         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1773                 if (tmp_ip->pnn != -1) {
1774                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1775                 }
1776         }
1777
1778         /* 3rd step: if a node is forced to re-balance then
1779            we allow failback onto the node */
1780         if (force_rebalance_nodes == NULL) {
1781                 return;
1782         }
1783         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1784                 uint32_t pnn = force_rebalance_nodes[i];
1785                 if (pnn >= numnodes) {
1786                         DEBUG(DEBUG_ERR,
1787                               (__location__ "unknown node %u\n", pnn));
1788                         continue;
1789                 }
1790
1791                 DEBUG(DEBUG_NOTICE,
1792                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1793                 (*rebalance_candidates)[pnn] = true;
1794         }
1795 }
1796
1797 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1798  * the IP/node combination that will cost the least.
1799  */
1800 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1801                                      struct ctdb_ipflags *ipflags,
1802                                      struct ctdb_public_ip_list *all_ips,
1803                                      uint32_t *lcp2_imbalances)
1804 {
1805         struct ctdb_public_ip_list *tmp_ip;
1806         int dstnode, numnodes;
1807
1808         int minnode;
1809         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1810         struct ctdb_public_ip_list *minip;
1811
1812         bool should_loop = true;
1813         bool have_unassigned = true;
1814
1815         numnodes = talloc_array_length(ipflags);
1816
1817         while (have_unassigned && should_loop) {
1818                 should_loop = false;
1819
1820                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1821                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1822
1823                 minnode = -1;
1824                 mindsum = 0;
1825                 minip = NULL;
1826
1827                 /* loop over each unassigned ip. */
1828                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1829                         if (tmp_ip->pnn != -1) {
1830                                 continue;
1831                         }
1832
1833                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1834                                 /* only check nodes that can actually takeover this ip */
1835                                 if (!can_node_takeover_ip(ctdb, dstnode,
1836                                                           ipflags[dstnode],
1837                                                           tmp_ip)) {
1838                                         /* no it couldnt   so skip to the next node */
1839                                         continue;
1840                                 }
1841
1842                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1843                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1844                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1845                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1846                                                    dstnode,
1847                                                    dstimbl - lcp2_imbalances[dstnode]));
1848
1849
1850                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1851                                         minnode = dstnode;
1852                                         minimbl = dstimbl;
1853                                         mindsum = dstdsum;
1854                                         minip = tmp_ip;
1855                                         should_loop = true;
1856                                 }
1857                         }
1858                 }
1859
1860                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1861
1862                 /* If we found one then assign it to the given node. */
1863                 if (minnode != -1) {
1864                         minip->pnn = minnode;
1865                         lcp2_imbalances[minnode] = minimbl;
1866                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1867                                           ctdb_addr_to_str(&(minip->addr)),
1868                                           minnode,
1869                                           mindsum));
1870                 }
1871
1872                 /* There might be a better way but at least this is clear. */
1873                 have_unassigned = false;
1874                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1875                         if (tmp_ip->pnn == -1) {
1876                                 have_unassigned = true;
1877                         }
1878                 }
1879         }
1880
1881         /* We know if we have an unassigned addresses so we might as
1882          * well optimise.
1883          */
1884         if (have_unassigned) {
1885                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1886                         if (tmp_ip->pnn == -1) {
1887                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1888                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1889                         }
1890                 }
1891         }
1892 }
1893
1894 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1895  * to move IPs from, determines the best IP/destination node
1896  * combination to move from the source node.
1897  */
1898 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1899                                     struct ctdb_ipflags *ipflags,
1900                                     struct ctdb_public_ip_list *all_ips,
1901                                     int srcnode,
1902                                     uint32_t *lcp2_imbalances,
1903                                     bool *rebalance_candidates)
1904 {
1905         int dstnode, mindstnode, numnodes;
1906         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1907         uint32_t minsrcimbl, mindstimbl;
1908         struct ctdb_public_ip_list *minip;
1909         struct ctdb_public_ip_list *tmp_ip;
1910
1911         /* Find an IP and destination node that best reduces imbalance. */
1912         srcimbl = 0;
1913         minip = NULL;
1914         minsrcimbl = 0;
1915         mindstnode = -1;
1916         mindstimbl = 0;
1917
1918         numnodes = talloc_array_length(ipflags);
1919
1920         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1921         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1922                            srcnode, lcp2_imbalances[srcnode]));
1923
1924         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1925                 /* Only consider addresses on srcnode. */
1926                 if (tmp_ip->pnn != srcnode) {
1927                         continue;
1928                 }
1929
1930                 /* What is this IP address costing the source node? */
1931                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1932                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1933
1934                 /* Consider this IP address would cost each potential
1935                  * destination node.  Destination nodes are limited to
1936                  * those that are newly healthy, since we don't want
1937                  * to do gratuitous failover of IPs just to make minor
1938                  * balance improvements.
1939                  */
1940                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1941                         if (!rebalance_candidates[dstnode]) {
1942                                 continue;
1943                         }
1944
1945                         /* only check nodes that can actually takeover this ip */
1946                         if (!can_node_takeover_ip(ctdb, dstnode,
1947                                                   ipflags[dstnode], tmp_ip)) {
1948                                 /* no it couldnt   so skip to the next node */
1949                                 continue;
1950                         }
1951
1952                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1953                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1954                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1955                                            srcnode, -srcdsum,
1956                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1957                                            dstnode, dstdsum));
1958
1959                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1960                             (dstdsum < srcdsum) &&                      \
1961                             ((mindstnode == -1) ||                              \
1962                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1963
1964                                 minip = tmp_ip;
1965                                 minsrcimbl = srcimbl;
1966                                 mindstnode = dstnode;
1967                                 mindstimbl = dstimbl;
1968                         }
1969                 }
1970         }
1971         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1972
1973         if (mindstnode != -1) {
1974                 /* We found a move that makes things better... */
1975                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1976                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1977                                   ctdb_addr_to_str(&(minip->addr)),
1978                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1979
1980
1981                 lcp2_imbalances[srcnode] = minsrcimbl;
1982                 lcp2_imbalances[mindstnode] = mindstimbl;
1983                 minip->pnn = mindstnode;
1984
1985                 return true;
1986         }
1987
1988         return false;
1989         
1990 }
1991
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparison function for struct lcp2_imbalance_pnn.
 *
 * Orders elements by descending imbalance: returns -1 when a has the
 * larger imbalance, 1 when b has, and 0 when they are equal.  The pnn
 * field does not take part in the comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2010
2011 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2012  * node with the highest LCP2 imbalance, and then determines the best
2013  * IP/destination node combination to move from the source node.
2014  */
2015 static void lcp2_failback(struct ctdb_context *ctdb,
2016                           struct ctdb_ipflags *ipflags,
2017                           struct ctdb_public_ip_list *all_ips,
2018                           uint32_t *lcp2_imbalances,
2019                           bool *rebalance_candidates)
2020 {
2021         int i, numnodes;
2022         struct lcp2_imbalance_pnn * lips;
2023         bool again;
2024
2025         numnodes = talloc_array_length(ipflags);
2026
2027 try_again:
2028         /* Put the imbalances and nodes into an array, sort them and
2029          * iterate through candidates.  Usually the 1st one will be
2030          * used, so this doesn't cost much...
2031          */
2032         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2033         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2034         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2035         for (i=0; i<numnodes; i++) {
2036                 lips[i].imbalance = lcp2_imbalances[i];
2037                 lips[i].pnn = i;
2038                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2039         }
2040         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2041               lcp2_cmp_imbalance_pnn);
2042
2043         again = false;
2044         for (i=0; i<numnodes; i++) {
2045                 /* This means that all nodes had 0 or 1 addresses, so
2046                  * can't be imbalanced.
2047                  */
2048                 if (lips[i].imbalance == 0) {
2049                         break;
2050                 }
2051
2052                 if (lcp2_failback_candidate(ctdb,
2053                                             ipflags,
2054                                             all_ips,
2055                                             lips[i].pnn,
2056                                             lcp2_imbalances,
2057                                             rebalance_candidates)) {
2058                         again = true;
2059                         break;
2060                 }
2061         }
2062
2063         talloc_free(lips);
2064         if (again) {
2065                 goto try_again;
2066         }
2067 }
2068
2069 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2070                                     struct ctdb_ipflags *ipflags,
2071                                     struct ctdb_public_ip_list *all_ips)
2072 {
2073         struct ctdb_public_ip_list *tmp_ip;
2074
2075         /* verify that the assigned nodes can serve that public ip
2076            and set it to -1 if not
2077         */
2078         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2079                 if (tmp_ip->pnn == -1) {
2080                         continue;
2081                 }
2082                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2083                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2084                         /* this node can not serve this ip. */
2085                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2086                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2087                                            tmp_ip->pnn));
2088                         tmp_ip->pnn = -1;
2089                 }
2090         }
2091 }
2092
2093 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2094                                        struct ctdb_ipflags *ipflags,
2095                                        struct ctdb_public_ip_list *all_ips)
2096 {
2097         struct ctdb_public_ip_list *tmp_ip;
2098         int i, numnodes;
2099
2100         numnodes = talloc_array_length(ipflags);
2101
2102         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2103        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2104         *  always be allocated the same way for a specific set of
2105         *  available/unavailable nodes.
2106         */
2107
2108         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2109                 tmp_ip->pnn = i % numnodes;
2110         }
2111
2112         /* IP failback doesn't make sense with deterministic
2113          * IPs, since the modulo step above implicitly fails
2114          * back IPs to their "home" node.
2115          */
2116         if (1 == ctdb->tunable.no_ip_failback) {
2117                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2118         }
2119
2120         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2121
2122         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2123
2124         /* No failback here! */
2125 }
2126
2127 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2128                                           struct ctdb_ipflags *ipflags,
2129                                           struct ctdb_public_ip_list *all_ips)
2130 {
2131         /* This should be pushed down into basic_failback. */
2132         struct ctdb_public_ip_list *tmp_ip;
2133         int num_ips = 0;
2134         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2135                 num_ips++;
2136         }
2137
2138         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2139
2140         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2141
2142         /* If we don't want IPs to fail back then don't rebalance IPs. */
2143         if (1 == ctdb->tunable.no_ip_failback) {
2144                 return;
2145         }
2146
2147         /* Now, try to make sure the ip adresses are evenly distributed
2148            across the nodes.
2149         */
2150         basic_failback(ctdb, ipflags, all_ips, num_ips);
2151 }
2152
2153 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2154                           struct ctdb_ipflags *ipflags,
2155                           struct ctdb_public_ip_list *all_ips,
2156                           uint32_t *force_rebalance_nodes)
2157 {
2158         uint32_t *lcp2_imbalances;
2159         bool *rebalance_candidates;
2160         int numnodes, num_rebalance_candidates, i;
2161
2162         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2163
2164         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2165
2166         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2167                   &lcp2_imbalances, &rebalance_candidates);
2168
2169         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2170
2171         /* If we don't want IPs to fail back then don't rebalance IPs. */
2172         if (1 == ctdb->tunable.no_ip_failback) {
2173                 goto finished;
2174         }
2175
2176         /* It is only worth continuing if we have suitable target
2177          * nodes to transfer IPs to.  This check is much cheaper than
2178          * continuing on...
2179          */
2180         numnodes = talloc_array_length(ipflags);
2181         num_rebalance_candidates = 0;
2182         for (i=0; i<numnodes; i++) {
2183                 if (rebalance_candidates[i]) {
2184                         num_rebalance_candidates++;
2185                 }
2186         }
2187         if (num_rebalance_candidates == 0) {
2188                 goto finished;
2189         }
2190
2191         /* Now, try to make sure the ip adresses are evenly distributed
2192            across the nodes.
2193         */
2194         lcp2_failback(ctdb, ipflags, all_ips,
2195                       lcp2_imbalances, rebalance_candidates);
2196
2197 finished:
2198         talloc_free(tmp_ctx);
2199 }
2200
2201 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2202 {
2203         int i;
2204
2205         for (i=0;i<nodemap->num;i++) {
2206                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2207                         /* Found one completely healthy node */
2208                         return false;
2209                 }
2210         }
2211
2212         return true;
2213 }
2214
2215 /* The calculation part of the IP allocation algorithm. */
2216 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2217                                    struct ctdb_ipflags *ipflags,
2218                                    struct ctdb_public_ip_list **all_ips_p,
2219                                    uint32_t *force_rebalance_nodes)
2220 {
2221         /* since nodes only know about those public addresses that
2222            can be served by that particular node, no single node has
2223            a full list of all public addresses that exist in the cluster.
2224            Walk over all node structures and create a merged list of
2225            all public addresses that exist in the cluster.
2226
2227            keep the tree of ips around as ctdb->ip_tree
2228         */
2229         *all_ips_p = create_merged_ip_list(ctdb);
2230
2231         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2232                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2233         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2234                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2235         } else {
2236                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2237         }
2238
2239         /* at this point ->pnn is the node which will own each IP
2240            or -1 if there is no node that can cover this ip
2241         */
2242
2243         return;
2244 }
2245
/* State shared between get_tunable_from_nodes() and the callbacks it
 * registers for the GET_TUNABLE control.
 */
struct get_tunable_callback_data {
	/* Name of the tunable being queried; used in log messages. */
	const char *tunable;
	/* talloc array indexed by PNN that receives each node's value. */
	uint32_t *out;
	/* Set by the callbacks when the collected values must not be
	 * trusted (wrong reply size, timeout, unexpected error). */
	bool fatal;
};
2251
2252 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2253                                  int32_t res, TDB_DATA outdata,
2254                                  void *callback)
2255 {
2256         struct get_tunable_callback_data *cd =
2257                 (struct get_tunable_callback_data *)callback;
2258         int size;
2259
2260         if (res != 0) {
2261                 /* Already handled in fail callback */
2262                 return;
2263         }
2264
2265         if (outdata.dsize != sizeof(uint32_t)) {
2266                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2267                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2268                                  (int)outdata.dsize));
2269                 cd->fatal = true;
2270                 return;
2271         }
2272
2273         size = talloc_array_length(cd->out);
2274         if (pnn >= size) {
2275                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2276                                  cd->tunable, pnn, size));
2277                 return;
2278         }
2279
2280                 
2281         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2282 }
2283
2284 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2285                                        int32_t res, TDB_DATA outdata,
2286                                        void *callback)
2287 {
2288         struct get_tunable_callback_data *cd =
2289                 (struct get_tunable_callback_data *)callback;
2290
2291         switch (res) {
2292         case -ETIME:
2293                 DEBUG(DEBUG_ERR,
2294                       ("Timed out getting tunable \"%s\" from node %d\n",
2295                        cd->tunable, pnn));
2296                 cd->fatal = true;
2297                 break;
2298         case -EINVAL:
2299         case -1:
2300                 DEBUG(DEBUG_WARNING,
2301                       ("Tunable \"%s\" not implemented on node %d\n",
2302                        cd->tunable, pnn));
2303                 break;
2304         default:
2305                 DEBUG(DEBUG_ERR,
2306                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2307                        cd->tunable, pnn));
2308                 cd->fatal = true;
2309         }
2310 }
2311
2312 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2313                                         TALLOC_CTX *tmp_ctx,
2314                                         struct ctdb_node_map *nodemap,
2315                                         const char *tunable,
2316                                         uint32_t default_value)
2317 {
2318         TDB_DATA data;
2319         struct ctdb_control_get_tunable *t;
2320         uint32_t *nodes;
2321         uint32_t *tvals;
2322         struct get_tunable_callback_data callback_data;
2323         int i;
2324
2325         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2326         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2327         for (i=0; i<nodemap->num; i++) {
2328                 tvals[i] = default_value;
2329         }
2330                 
2331         callback_data.out = tvals;
2332         callback_data.tunable = tunable;
2333         callback_data.fatal = false;
2334
2335         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2336         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2337         t = (struct ctdb_control_get_tunable *)data.dptr;
2338         t->length = strlen(tunable)+1;
2339         memcpy(t->name, tunable, t->length);
2340         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2341         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2342                                       nodes, 0, TAKEOVER_TIMEOUT(),
2343                                       false, data,
2344                                       get_tunable_callback,
2345                                       get_tunable_fail_callback,
2346                                       &callback_data) != 0) {
2347                 if (callback_data.fatal) {
2348                         talloc_free(tvals);
2349                         tvals = NULL;
2350                 }
2351         }
2352         talloc_free(nodes);
2353         talloc_free(data.dptr);
2354
2355         return tvals;
2356 }
2357
/* State handed to get_runstate_callback() and
 * get_runstate_fail_callback() through their callback pointer.
 */
struct get_runstate_callback_data {
	/* talloc array indexed by PNN that receives each node's runstate. */
	enum ctdb_runstate *out;
	/* Set by the callbacks on a wrong-sized reply or a timeout. */
	bool fatal;
};
2362
2363 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2364                                   int32_t res, TDB_DATA outdata,
2365                                   void *callback_data)
2366 {
2367         struct get_runstate_callback_data *cd =
2368                 (struct get_runstate_callback_data *)callback_data;
2369         int size;
2370
2371         if (res != 0) {
2372                 /* Already handled in fail callback */
2373                 return;
2374         }
2375
2376         if (outdata.dsize != sizeof(uint32_t)) {
2377                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2378                                  pnn, (int)sizeof(uint32_t),
2379                                  (int)outdata.dsize));
2380                 cd->fatal = true;
2381                 return;
2382         }
2383
2384         size = talloc_array_length(cd->out);
2385         if (pnn >= size) {
2386                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2387                                  pnn, size));
2388                 return;
2389         }
2390
2391         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2392 }
2393
2394 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2395                                        int32_t res, TDB_DATA outdata,
2396                                        void *callback)
2397 {
2398         struct get_runstate_callback_data *cd =
2399                 (struct get_runstate_callback_data *)callback;
2400
2401         switch (res) {
2402         case -ETIME:
2403                 DEBUG(DEBUG_ERR,
2404                       ("Timed out getting runstate from node %d\n", pnn));
2405                 cd->fatal = true;
2406                 break;
2407         default:
2408                 DEBUG(DEBUG_WARNING,
2409                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2410                        pnn));
2411         }
2412 }
2413
2414 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2415                                                     TALLOC_CTX *tmp_ctx,
2416                                                     struct ctdb_node_map *nodemap,
2417                                                     enum ctdb_runstate default_value)
2418 {
2419         uint32_t *nodes;
2420         enum ctdb_runstate *rs;
2421         struct get_runstate_callback_data callback_data;
2422         int i;
2423
2424         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2425         CTDB_NO_MEMORY_NULL(ctdb, rs);
2426         for (i=0; i<nodemap->num; i++) {
2427                 rs[i] = default_value;
2428         }
2429
2430         callback_data.out = rs;
2431         callback_data.fatal = false;
2432
2433         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2434         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2435                                       nodes, 0, TAKEOVER_TIMEOUT(),
2436                                       true, tdb_null,
2437                                       get_runstate_callback,
2438                                       get_runstate_fail_callback,
2439                                       &callback_data) != 0) {
2440                 if (callback_data.fatal) {
2441                         free(rs);
2442                         rs = NULL;
2443                 }
2444         }
2445         talloc_free(nodes);
2446
2447         return rs;
2448 }
2449
2450 /* Set internal flags for IP allocation:
2451  *   Clear ip flags
2452  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2453  *   Set NOIPHOST ip flag for each INACTIVE node
2454  *   if all nodes are disabled:
2455  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2456  *   else
2457  *     Set NOIPHOST ip flags for disabled nodes
2458  */
2459 static struct ctdb_ipflags *
2460 set_ipflags_internal(struct ctdb_context *ctdb,
2461                      TALLOC_CTX *tmp_ctx,
2462                      struct ctdb_node_map *nodemap,
2463                      uint32_t *tval_noiptakeover,
2464                      uint32_t *tval_noiphostonalldisabled,
2465                      enum ctdb_runstate *runstate)
2466 {
2467         int i;
2468         struct ctdb_ipflags *ipflags;
2469
2470         /* Clear IP flags - implicit due to talloc_zero */
2471         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2472         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2473
2474         for (i=0;i<nodemap->num;i++) {
2475                 /* Can not take IPs on node with NoIPTakeover set */
2476                 if (tval_noiptakeover[i] != 0) {
2477                         ipflags[i].noiptakeover = true;
2478                 }
2479
2480                 /* Can not host IPs on node not in RUNNING state */
2481                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2482                         ipflags[i].noiphost = true;
2483                         continue;
2484                 }
2485                 /* Can not host IPs on INACTIVE node */
2486                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2487                         ipflags[i].noiphost = true;
2488                 }
2489         }
2490
2491         if (all_nodes_are_disabled(nodemap)) {
2492                 /* If all nodes are disabled, can not host IPs on node
2493                  * with NoIPHostOnAllDisabled set
2494                  */
2495                 for (i=0;i<nodemap->num;i++) {
2496                         if (tval_noiphostonalldisabled[i] != 0) {
2497                                 ipflags[i].noiphost = true;
2498                         }
2499                 }
2500         } else {
2501                 /* If some nodes are not disabled, then can not host
2502                  * IPs on DISABLED node
2503                  */
2504                 for (i=0;i<nodemap->num;i++) {
2505                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2506                                 ipflags[i].noiphost = true;
2507                         }
2508                 }
2509         }
2510
2511         return ipflags;
2512 }
2513
2514 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2515                                         TALLOC_CTX *tmp_ctx,
2516                                         struct ctdb_node_map *nodemap)
2517 {
2518         uint32_t *tval_noiptakeover;
2519         uint32_t *tval_noiphostonalldisabled;
2520         struct ctdb_ipflags *ipflags;
2521         enum ctdb_runstate *runstate;
2522
2523
2524         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2525                                                    "NoIPTakeover", 0);
2526         if (tval_noiptakeover == NULL) {
2527                 return NULL;
2528         }
2529
2530         tval_noiphostonalldisabled =
2531                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2532                                        "NoIPHostOnAllDisabled", 0);
2533         if (tval_noiphostonalldisabled == NULL) {
2534                 /* Caller frees tmp_ctx */
2535                 return NULL;
2536         }
2537
2538         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2539          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2540          * reasonable behaviour on a mixed cluster during upgrade.
2541          */
2542         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2543                                            CTDB_RUNSTATE_RUNNING);
2544         if (runstate == NULL) {
2545                 /* Caller frees tmp_ctx */
2546                 return NULL;
2547         }
2548
2549         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2550                                        tval_noiptakeover,
2551                                        tval_noiphostonalldisabled,
2552                                        runstate);
2553
2554         talloc_free(tval_noiptakeover);
2555         talloc_free(tval_noiphostonalldisabled);
2556         talloc_free(runstate);
2557
2558         return ipflags;
2559 }
2560
2561 struct iprealloc_callback_data {
2562         bool *retry_nodes;
2563         int retry_count;
2564         client_async_callback fail_callback;
2565         void *fail_callback_data;
2566         struct ctdb_node_map *nodemap;
2567 };
2568
2569 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2570                                         int32_t res, TDB_DATA outdata,
2571                                         void *callback)
2572 {
2573         int numnodes;
2574         struct iprealloc_callback_data *cd =
2575                 (struct iprealloc_callback_data *)callback;
2576
2577         numnodes = talloc_array_length(cd->retry_nodes);
2578         if (pnn > numnodes) {
2579                 DEBUG(DEBUG_ERR,
2580                       ("ipreallocated failure from node %d, "
2581                        "but only %d nodes in nodemap\n",
2582                        pnn, numnodes));
2583                 return;
2584         }
2585
2586         /* Can't run the "ipreallocated" event on a INACTIVE node */
2587         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2588                 DEBUG(DEBUG_WARNING,
2589                       ("ipreallocated failed on inactive node %d, ignoring\n",
2590                        pnn));
2591                 return;
2592         }
2593
2594         switch (res) {
2595         case -ETIME:
2596                 /* If the control timed out then that's a real error,
2597                  * so call the real fail callback
2598                  */
2599                 if (cd->fail_callback) {
2600                         cd->fail_callback(ctdb, pnn, res, outdata,
2601                                           cd->fail_callback_data);
2602                 } else {
2603                         DEBUG(DEBUG_WARNING,
2604                               ("iprealloc timed out but no callback registered\n"));
2605                 }
2606                 break;
2607         default:
2608                 /* If not a timeout then either the ipreallocated
2609                  * eventscript (or some setup) failed.  This might
2610                  * have failed because the IPREALLOCATED control isn't
2611                  * implemented - right now there is no way of knowing
2612                  * because the error codes are all folded down to -1.
2613                  * Consider retrying using EVENTSCRIPT control...
2614                  */
2615                 DEBUG(DEBUG_WARNING,
2616                       ("ipreallocated failure from node %d, flagging retry\n",
2617                        pnn));
2618                 cd->retry_nodes[pnn] = true;
2619                 cd->retry_count++;
2620         }
2621 }
2622
2623 struct takeover_callback_data {
2624         bool *node_failed;
2625         client_async_callback fail_callback;
2626         void *fail_callback_data;
2627         struct ctdb_node_map *nodemap;
2628 };
2629
2630 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2631                                        uint32_t node_pnn, int32_t res,
2632                                        TDB_DATA outdata, void *callback_data)
2633 {
2634         struct takeover_callback_data *cd =
2635                 talloc_get_type_abort(callback_data,
2636                                       struct takeover_callback_data);
2637         int i;
2638
2639         for (i = 0; i < cd->nodemap->num; i++) {
2640                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2641                         break;
2642                 }
2643         }
2644
2645         if (i == cd->nodemap->num) {
2646                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2647                 return;
2648         }
2649
2650         if (!cd->node_failed[i]) {
2651                 cd->node_failed[i] = true;
2652                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2653                                   cd->fail_callback_data);
2654         }
2655 }
2656
2657 /*
2658   make any IP alias changes for public addresses that are necessary 
2659  */
2660 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2661                       uint32_t *force_rebalance_nodes,
2662                       client_async_callback fail_callback, void *callback_data)
2663 {
2664         int i, j, ret;
2665         struct ctdb_public_ip ip;
2666         struct ctdb_public_ipv4 ipv4;
2667         uint32_t *nodes;
2668         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2669         TDB_DATA data;
2670         struct timeval timeout;
2671         struct client_async_data *async_data;
2672         struct ctdb_client_control_state *state;
2673         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2674         struct ctdb_ipflags *ipflags;
2675         struct takeover_callback_data *takeover_data;
2676         struct iprealloc_callback_data iprealloc_data;
2677         bool *retry_data;
2678
2679         /*
2680          * ip failover is completely disabled, just send out the 
2681          * ipreallocated event.
2682          */
2683         if (ctdb->tunable.disable_ip_failover != 0) {
2684                 goto ipreallocated;
2685         }
2686
2687         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2688         if (ipflags == NULL) {
2689                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2690                 talloc_free(tmp_ctx);
2691                 return -1;
2692         }
2693
2694         ZERO_STRUCT(ip);
2695
2696         /* Do the IP reassignment calculations */
2697         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2698
2699         /* Now tell all nodes to release any public IPs should not
2700          * host.  This will be a NOOP on nodes that don't currently
2701          * hold the given IP.
2702          */
2703         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2704         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2705
2706         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2707                                                        bool, nodemap->num);
2708         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2709         takeover_data->fail_callback = fail_callback;
2710         takeover_data->fail_callback_data = callback_data;
2711         takeover_data->nodemap = nodemap;
2712
2713         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2714         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2715
2716         async_data->fail_callback = takeover_run_fail_callback;
2717         async_data->callback_data = takeover_data;
2718
2719         for (i=0;i<nodemap->num;i++) {
2720                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2721                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2722                         continue;
2723                 }
2724
2725                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2726                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2727                                 /* This node should be serving this
2728                                    vnn so dont tell it to release the ip
2729                                 */
2730                                 continue;
2731                         }
2732                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2733                                 ipv4.pnn = tmp_ip->pnn;
2734                                 ipv4.sin = tmp_ip->addr.ip;
2735
2736                                 timeout = TAKEOVER_TIMEOUT();
2737                                 data.dsize = sizeof(ipv4);
2738                                 data.dptr  = (uint8_t *)&ipv4;
2739                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2740                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2741                                                 data, async_data,
2742                                                 &timeout, NULL);
2743                         } else {
2744                                 ip.pnn  = tmp_ip->pnn;
2745                                 ip.addr = tmp_ip->addr;
2746
2747                                 timeout = TAKEOVER_TIMEOUT();
2748                                 data.dsize = sizeof(ip);
2749                                 data.dptr  = (uint8_t *)&ip;
2750                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2751                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2752                                                 data, async_data,
2753                                                 &timeout, NULL);
2754                         }
2755
2756                         if (state == NULL) {
2757                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2758                                 talloc_free(tmp_ctx);
2759                                 return -1;
2760                         }
2761                 
2762                         ctdb_client_async_add(async_data, state);
2763                 }
2764         }
2765         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2766                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2767                 talloc_free(tmp_ctx);
2768                 return -1;
2769         }
2770         talloc_free(async_data);
2771
2772
2773         /* tell all nodes to get their own IPs */
2774         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2775         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2776
2777         async_data->fail_callback = fail_callback;
2778         async_data->callback_data = callback_data;
2779
2780         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2781                 if (tmp_ip->pnn == -1) {
2782                         /* this IP won't be taken over */
2783                         continue;
2784                 }
2785
2786                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2787                         ipv4.pnn = tmp_ip->pnn;
2788                         ipv4.sin = tmp_ip->addr.ip;
2789
2790                         timeout = TAKEOVER_TIMEOUT();
2791                         data.dsize = sizeof(ipv4);
2792                         data.dptr  = (uint8_t *)&ipv4;
2793                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2794                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2795                                         data, async_data,
2796                                         &timeout, NULL);
2797                 } else {
2798                         ip.pnn  = tmp_ip->pnn;
2799                         ip.addr = tmp_ip->addr;
2800
2801                         timeout = TAKEOVER_TIMEOUT();
2802                         data.dsize = sizeof(ip);
2803                         data.dptr  = (uint8_t *)&ip;
2804                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2805                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2806                                         data, async_data,
2807                                         &timeout, NULL);
2808                 }
2809                 if (state == NULL) {
2810                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2811                         talloc_free(tmp_ctx);
2812                         return -1;
2813                 }
2814                 
2815                 ctdb_client_async_add(async_data, state);
2816         }
2817         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2818                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2819                 talloc_free(tmp_ctx);
2820                 return -1;
2821         }
2822
2823 ipreallocated:
2824         /* 
2825          * Tell all nodes to run eventscripts to process the
2826          * "ipreallocated" event.  This can do a lot of things,
2827          * including restarting services to reconfigure them if public
2828          * IPs have moved.  Once upon a time this event only used to
2829          * update natwg.
2830          */
2831         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2832         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2833         iprealloc_data.retry_nodes = retry_data;
2834         iprealloc_data.retry_count = 0;
2835         iprealloc_data.fail_callback = fail_callback;
2836         iprealloc_data.fail_callback_data = callback_data;
2837         iprealloc_data.nodemap = nodemap;
2838
2839         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2840         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2841                                         nodes, 0, TAKEOVER_TIMEOUT(),
2842                                         false, tdb_null,
2843                                         NULL, iprealloc_fail_callback,
2844                                         &iprealloc_data);
2845         if (ret != 0) {
2846                 /* If the control failed then we should retry to any
2847                  * nodes flagged by iprealloc_fail_callback using the
2848                  * EVENTSCRIPT control.  This is a best-effort at
2849                  * backward compatiblity when running a mixed cluster
2850                  * where some nodes have not yet been upgraded to
2851                  * support the IPREALLOCATED control.
2852                  */
2853                 DEBUG(DEBUG_WARNING,
2854                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2855
2856                 nodes = talloc_array(tmp_ctx, uint32_t,
2857                                      iprealloc_data.retry_count);
2858                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2859
2860                 j = 0;
2861                 for (i=0; i<nodemap->num; i++) {
2862                         if (iprealloc_data.retry_nodes[i]) {
2863                                 nodes[j] = i;
2864                                 j++;
2865                         }
2866                 }
2867
2868                 data.dptr  = discard_const("ipreallocated");
2869                 data.dsize = strlen((char *)data.dptr) + 1; 
2870                 ret = ctdb_client_async_control(ctdb,
2871                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2872                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2873                                                 false, data,
2874                                                 NULL, fail_callback,
2875                                                 callback_data);
2876                 if (ret != 0) {
2877                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2878                 }
2879         }
2880
2881         talloc_free(tmp_ctx);
2882         return ret;
2883 }
2884
2885
2886 /*
2887   destroy a ctdb_client_ip structure
2888  */
2889 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2890 {
2891         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2892                 ctdb_addr_to_str(&ip->addr),
2893                 ntohs(ip->addr.ip.sin_port),
2894                 ip->client_id));
2895
2896         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2897         return 0;
2898 }
2899
2900 /*
2901   called by a client to inform us of a TCP connection that it is managing
2902   that should tickled with an ACK when IP takeover is done
2903   we handle both the old ipv4 style of packets as well as the new ipv4/6
2904   pdus.
2905  */
2906 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2907                                 TDB_DATA indata)
2908 {
2909         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2910         struct ctdb_control_tcp *old_addr = NULL;
2911         struct ctdb_control_tcp_addr new_addr;
2912         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2913         struct ctdb_tcp_list *tcp;
2914         struct ctdb_tcp_connection t;
2915         int ret;
2916         TDB_DATA data;
2917         struct ctdb_client_ip *ip;
2918         struct ctdb_vnn *vnn;
2919         ctdb_sock_addr addr;
2920
2921         /* If we don't have public IPs, tickles are useless */
2922         if (ctdb->vnn == NULL) {
2923                 return 0;
2924         }
2925
2926         switch (indata.dsize) {
2927         case sizeof(struct ctdb_control_tcp):
2928                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2929                 ZERO_STRUCT(new_addr);
2930                 tcp_sock = &new_addr;
2931                 tcp_sock->src.ip  = old_addr->src;
2932                 tcp_sock->dest.ip = old_addr->dest;
2933                 break;
2934         case sizeof(struct ctdb_control_tcp_addr):
2935                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2936                 break;
2937         default:
2938                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2939                                  "to ctdb_control_tcp_client. size was %d but "
2940                                  "only allowed sizes are %lu and %lu\n",
2941                                  (int)indata.dsize,
2942                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2943                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2944                 return -1;
2945         }
2946
2947         addr = tcp_sock->src;
2948         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2949         addr = tcp_sock->dest;
2950         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2951
2952         ZERO_STRUCT(addr);
2953         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2954         vnn = find_public_ip_vnn(ctdb, &addr);
2955         if (vnn == NULL) {
2956                 switch (addr.sa.sa_family) {
2957                 case AF_INET:
2958                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2959                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2960                                         ctdb_addr_to_str(&addr)));
2961                         }
2962                         break;
2963                 case AF_INET6:
2964                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2965                                 ctdb_addr_to_str(&addr)));
2966                         break;
2967                 default:
2968                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2969                 }
2970
2971                 return 0;
2972         }
2973
2974         if (vnn->pnn != ctdb->pnn) {
2975                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2976                         ctdb_addr_to_str(&addr),
2977                         client_id, client->pid));
2978                 /* failing this call will tell smbd to die */
2979                 return -1;
2980         }
2981
2982         ip = talloc(client, struct ctdb_client_ip);
2983         CTDB_NO_MEMORY(ctdb, ip);
2984
2985         ip->ctdb      = ctdb;
2986         ip->addr      = addr;
2987         ip->client_id = client_id;
2988         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2989         DLIST_ADD(ctdb->client_ip_list, ip);
2990
2991         tcp = talloc(client, struct ctdb_tcp_list);
2992         CTDB_NO_MEMORY(ctdb, tcp);
2993
2994         tcp->connection.src_addr = tcp_sock->src;
2995         tcp->connection.dst_addr = tcp_sock->dest;
2996
2997         DLIST_ADD(client->tcp_list, tcp);
2998
2999         t.src_addr = tcp_sock->src;
3000         t.dst_addr = tcp_sock->dest;
3001
3002         data.dptr = (uint8_t *)&t;
3003         data.dsize = sizeof(t);
3004
3005         switch (addr.sa.sa_family) {
3006         case AF_INET:
3007                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3008                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
3009                         ctdb_addr_to_str(&tcp_sock->src),
3010                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
3011                 break;
3012         case AF_INET6:
3013                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3014                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
3015                         ctdb_addr_to_str(&tcp_sock->src),
3016                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
3017                 break;
3018         default:
3019                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3020         }
3021
3022
3023         /* tell all nodes about this tcp connection */
3024         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3025                                        CTDB_CONTROL_TCP_ADD,
3026                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3027         if (ret != 0) {
3028                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3029                 return -1;
3030         }
3031
3032         return 0;
3033 }
3034
3035 /*
3036   find a tcp address on a list
3037  */
3038 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3039                                            struct ctdb_tcp_connection *tcp)
3040 {
3041         int i;
3042
3043         if (array == NULL) {
3044                 return NULL;
3045         }
3046
3047         for (i=0;i<array->num;i++) {
3048                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3049                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3050                         return &array->connections[i];
3051                 }
3052         }
3053         return NULL;
3054 }
3055
3056
3057
3058 /*
3059   called by a daemon to inform us of a TCP connection that one of its
3060   clients managing that should tickled with an ACK when IP takeover is
3061   done
3062  */
3063 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3064 {
3065         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3066         struct ctdb_tcp_array *tcparray;
3067         struct ctdb_tcp_connection tcp;
3068         struct ctdb_vnn *vnn;
3069
3070         /* If we don't have public IPs, tickles are useless */
3071         if (ctdb->vnn == NULL) {
3072                 return 0;
3073         }
3074
3075         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3076         if (vnn == NULL) {
3077                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3078                         ctdb_addr_to_str(&p->dst_addr)));
3079
3080                 return -1;
3081         }
3082
3083
3084         tcparray = vnn->tcp_array;
3085
3086         /* If this is the first tickle */
3087         if (tcparray == NULL) {
3088                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3089                 CTDB_NO_MEMORY(ctdb, tcparray);
3090                 vnn->tcp_array = tcparray;
3091
3092                 tcparray->num = 0;
3093                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3094                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3095
3096                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3097                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3098                 tcparray->num++;
3099
3100                 if (tcp_update_needed) {
3101                         vnn->tcp_update_needed = true;
3102                 }
3103                 return 0;
3104         }
3105
3106
3107         /* Do we already have this tickle ?*/
3108         tcp.src_addr = p->src_addr;
3109         tcp.dst_addr = p->dst_addr;
3110         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3111                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3112                         ctdb_addr_to_str(&tcp.dst_addr),
3113                         ntohs(tcp.dst_addr.ip.sin_port),
3114                         vnn->pnn));
3115                 return 0;
3116         }
3117
3118         /* A new tickle, we must add it to the array */
3119         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3120                                         struct ctdb_tcp_connection,
3121                                         tcparray->num+1);
3122         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3123
3124         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3125         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3126         tcparray->num++;
3127
3128         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3129                 ctdb_addr_to_str(&tcp.dst_addr),
3130                 ntohs(tcp.dst_addr.ip.sin_port),
3131                 vnn->pnn));
3132
3133         if (tcp_update_needed) {
3134                 vnn->tcp_update_needed = true;
3135         }
3136
3137         return 0;
3138 }
3139
3140
3141 /*
3142   called by a daemon to inform us of a TCP connection that one of its
3143   clients managing that should tickled with an ACK when IP takeover is
3144   done
3145  */
3146 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3147 {
3148         struct ctdb_tcp_connection *tcpp;
3149         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3150
3151         if (vnn == NULL) {
3152                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3153                         ctdb_addr_to_str(&conn->dst_addr)));
3154                 return;
3155         }
3156
3157         /* if the array is empty we cant remove it
3158            and we dont need to do anything
3159          */
3160         if (vnn->tcp_array == NULL) {
3161                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3162                         ctdb_addr_to_str(&conn->dst_addr),
3163                         ntohs(conn->dst_addr.ip.sin_port)));
3164                 return;
3165         }
3166
3167
3168         /* See if we know this connection
3169            if we dont know this connection  then we dont need to do anything
3170          */
3171         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3172         if (tcpp == NULL) {
3173                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3174                         ctdb_addr_to_str(&conn->dst_addr),
3175                         ntohs(conn->dst_addr.ip.sin_port)));
3176                 return;
3177         }
3178
3179
3180         /* We need to remove this entry from the array.
3181            Instead of allocating a new array and copying data to it
3182            we cheat and just copy the last entry in the existing array
3183            to the entry that is to be removed and just shring the 
3184            ->num field
3185          */
3186         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3187         vnn->tcp_array->num--;
3188
3189         /* If we deleted the last entry we also need to remove the entire array
3190          */
3191         if (vnn->tcp_array->num == 0) {
3192                 talloc_free(vnn->tcp_array);
3193                 vnn->tcp_array = NULL;
3194         }               
3195
3196         vnn->tcp_update_needed = true;
3197
3198         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3199                 ctdb_addr_to_str(&conn->src_addr),
3200                 ntohs(conn->src_addr.ip.sin_port)));
3201 }
3202
3203
3204 /*
3205   called by a daemon to inform us of a TCP connection that one of its
3206   clients used are no longer needed in the tickle database
3207  */
3208 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3209 {
3210         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3211
3212         /* If we don't have public IPs, tickles are useless */
3213         if (ctdb->vnn == NULL) {
3214                 return 0;
3215         }
3216
3217         ctdb_remove_tcp_connection(ctdb, conn);
3218
3219         return 0;
3220 }
3221
3222
3223 /*
3224   Called when another daemon starts - caises all tickles for all
3225   public addresses we are serving to be sent to the new node on the
3226   next check.  This actually causes the next scheduled call to
3227   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3228   doesn't require careful error handling.
3229  */
3230 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3231 {
3232         struct ctdb_vnn *vnn;
3233
3234         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3235                 vnn->tcp_update_needed = true;
3236         }
3237
3238         return 0;
3239 }
3240
3241
3242 /*
3243   called when a client structure goes away - hook to remove
3244   elements from the tcp_list in all daemons
3245  */
3246 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3247 {
3248         while (client->tcp_list) {
3249                 struct ctdb_tcp_list *tcp = client->tcp_list;
3250                 DLIST_REMOVE(client->tcp_list, tcp);
3251                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3252         }
3253 }
3254
3255
3256 /*
3257   release all IPs on shutdown
3258  */
3259 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3260 {
3261         struct ctdb_vnn *vnn;
3262         int count = 0;
3263
3264         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3265                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3266                         ctdb_vnn_unassign_iface(ctdb, vnn);
3267                         continue;
3268                 }
3269                 if (!vnn->iface) {
3270                         continue;
3271                 }
3272
3273                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3274                                     ctdb_addr_to_str(&vnn->public_address),
3275                                     vnn->public_netmask_bits,
3276                                     ctdb_vnn_iface_string(vnn)));
3277
3278                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3279                                   ctdb_vnn_iface_string(vnn),
3280                                   ctdb_addr_to_str(&vnn->public_address),
3281                                   vnn->public_netmask_bits);
3282                 release_kill_clients(ctdb, &vnn->public_address);
3283                 ctdb_vnn_unassign_iface(ctdb, vnn);
3284                 count++;
3285         }
3286
3287         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3288 }
3289
3290
3291 /*
3292   get list of public IPs
3293  */
3294 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3295                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3296 {
3297         int i, num, len;
3298         struct ctdb_all_public_ips *ips;
3299         struct ctdb_vnn *vnn;
3300         bool only_available = false;
3301
3302         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3303                 only_available = true;
3304         }
3305
3306         /* count how many public ip structures we have */
3307         num = 0;
3308         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3309                 num++;
3310         }
3311
3312         len = offsetof(struct ctdb_all_public_ips, ips) + 
3313                 num*sizeof(struct ctdb_public_ip);
3314         ips = talloc_zero_size(outdata, len);
3315         CTDB_NO_MEMORY(ctdb, ips);
3316
3317         i = 0;
3318         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3319                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3320                         continue;
3321                 }
3322                 ips->ips[i].pnn  = vnn->pnn;
3323                 ips->ips[i].addr = vnn->public_address;
3324                 i++;
3325         }
3326         ips->num = i;
3327         len = offsetof(struct ctdb_all_public_ips, ips) +
3328                 i*sizeof(struct ctdb_public_ip);
3329
3330         outdata->dsize = len;
3331         outdata->dptr  = (uint8_t *)ips;
3332
3333         return 0;
3334 }
3335
3336
3337 /*
3338   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3339  */
3340 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3341                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3342 {
3343         int i, num, len;
3344         struct ctdb_all_public_ipsv4 *ips;
3345         struct ctdb_vnn *vnn;
3346
3347         /* count how many public ip structures we have */
3348         num = 0;
3349         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3350                 if (vnn->public_address.sa.sa_family != AF_INET) {
3351                         continue;
3352                 }
3353                 num++;
3354         }
3355
3356         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3357                 num*sizeof(struct ctdb_public_ipv4);
3358         ips = talloc_zero_size(outdata, len);
3359         CTDB_NO_MEMORY(ctdb, ips);
3360
3361         outdata->dsize = len;
3362         outdata->dptr  = (uint8_t *)ips;
3363
3364         ips->num = num;
3365         i = 0;
3366         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3367                 if (vnn->public_address.sa.sa_family != AF_INET) {
3368                         continue;
3369                 }
3370                 ips->ips[i].pnn = vnn->pnn;
3371                 ips->ips[i].sin = vnn->public_address.ip;
3372                 i++;
3373         }
3374
3375         return 0;
3376 }
3377
3378 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3379                                         struct ctdb_req_control *c,
3380                                         TDB_DATA indata,
3381                                         TDB_DATA *outdata)
3382 {
3383         int i, num, len;
3384         ctdb_sock_addr *addr;
3385         struct ctdb_control_public_ip_info *info;
3386         struct ctdb_vnn *vnn;
3387
3388         addr = (ctdb_sock_addr *)indata.dptr;
3389
3390         vnn = find_public_ip_vnn(ctdb, addr);
3391         if (vnn == NULL) {
3392                 /* if it is not a public ip   it could be our 'single ip' */
3393                 if (ctdb->single_ip_vnn) {
3394                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3395                                 vnn = ctdb->single_ip_vnn;
3396                         }
3397                 }
3398         }
3399         if (vnn == NULL) {
3400                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3401                                  "'%s'not a public address\n",
3402                                  ctdb_addr_to_str(addr)));
3403                 return -1;
3404         }
3405
3406         /* count how many public ip structures we have */
3407         num = 0;
3408         for (;vnn->ifaces[num];) {
3409                 num++;
3410         }
3411
3412         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3413                 num*sizeof(struct ctdb_control_iface_info);
3414         info = talloc_zero_size(outdata, len);
3415         CTDB_NO_MEMORY(ctdb, info);
3416
3417         info->ip.addr = vnn->public_address;
3418         info->ip.pnn = vnn->pnn;
3419         info->active_idx = 0xFFFFFFFF;
3420
3421         for (i=0; vnn->ifaces[i]; i++) {
3422                 struct ctdb_iface *cur;
3423
3424                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3425                 if (cur == NULL) {
3426                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3427                                            vnn->ifaces[i]));
3428                         return -1;
3429                 }
3430                 if (vnn->iface == cur) {
3431                         info->active_idx = i;
3432                 }
3433                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3434                 info->ifaces[i].link_state = cur->link_up;
3435                 info->ifaces[i].references = cur->references;
3436         }
3437         info->num = i;
3438         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3439                 i*sizeof(struct ctdb_control_iface_info);
3440
3441         outdata->dsize = len;
3442         outdata->dptr  = (uint8_t *)info;
3443
3444         return 0;
3445 }
3446
3447 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3448                                 struct ctdb_req_control *c,
3449                                 TDB_DATA *outdata)
3450 {
3451         int i, num, len;
3452         struct ctdb_control_get_ifaces *ifaces;
3453         struct ctdb_iface *cur;
3454
3455         /* count how many public ip structures we have */
3456         num = 0;
3457         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3458                 num++;
3459         }
3460
3461         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3462                 num*sizeof(struct ctdb_control_iface_info);
3463         ifaces = talloc_zero_size(outdata, len);
3464         CTDB_NO_MEMORY(ctdb, ifaces);
3465
3466         i = 0;
3467         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3468                 strcpy(ifaces->ifaces[i].name, cur->name);
3469                 ifaces->ifaces[i].link_state = cur->link_up;
3470                 ifaces->ifaces[i].references = cur->references;
3471                 i++;
3472         }
3473         ifaces->num = i;
3474         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3475                 i*sizeof(struct ctdb_control_iface_info);
3476
3477         outdata->dsize = len;
3478         outdata->dptr  = (uint8_t *)ifaces;
3479
3480         return 0;
3481 }
3482
3483 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3484                                     struct ctdb_req_control *c,
3485                                     TDB_DATA indata)
3486 {
3487         struct ctdb_control_iface_info *info;
3488         struct ctdb_iface *iface;
3489         bool link_up = false;
3490
3491         info = (struct ctdb_control_iface_info *)indata.dptr;
3492
3493         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3494                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3495                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3496                                   len, len, info->name));
3497                 return -1;
3498         }
3499
3500         switch (info->link_state) {
3501         case 0:
3502                 link_up = false;
3503                 break;
3504         case 1:
3505                 link_up = true;
3506                 break;
3507         default:
3508                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3509                                   (unsigned int)info->link_state));
3510                 return -1;
3511         }
3512
3513         if (info->references != 0) {
3514                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3515                                   (unsigned int)info->references));
3516                 return -1;
3517         }
3518
3519         iface = ctdb_find_iface(ctdb, info->name);
3520         if (iface == NULL) {
3521                 return -1;
3522         }
3523
3524         if (link_up == iface->link_up) {
3525                 return 0;
3526         }
3527
3528         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3529               ("iface[%s] has changed it's link status %s => %s\n",
3530                iface->name,
3531                iface->link_up?"up":"down",
3532                link_up?"up":"down"));
3533
3534         iface->link_up = link_up;
3535         return 0;
3536 }
3537
3538
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill.

   One such structure is allocated per vnn on demand (see
   ctdb_killtcp_add_connection()) and referenced from vnn->killtcp.
*/
struct ctdb_kill_tcp {
        /* the public address (vnn) this structure belongs to */
        struct ctdb_vnn *vnn;
        /* owning ctdb context */
        struct ctdb_context *ctdb;
        /* capture socket used to see the replies to our tickles;
           -1 until the socket has been opened */
        int capture_fd;
        /* read event registered for capture_fd */
        struct fd_event *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque value filled in by ctdb_sys_open_capture_socket() and
           handed back to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3551
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        /* the two endpoints of the connection */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of tickles sent by the periodic retry; the connection
           is given up once this reaches 5 */
        int count;
        /* the structure this connection is registered in */
        struct ctdb_kill_tcp *killtcp;
};
3561
3562 /* this function is used to create a key to represent this socketpair
3563    in the killtcp tree.
3564    this key is used to insert and lookup matching socketpairs that are
3565    to be tickled and RST
3566 */
3567 #define KILLTCP_KEYLEN  10
3568 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3569 {
3570         static uint32_t key[KILLTCP_KEYLEN];
3571
3572         bzero(key, sizeof(key));
3573
3574         if (src->sa.sa_family != dst->sa.sa_family) {
3575                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3576                 return key;
3577         }
3578         
3579         switch (src->sa.sa_family) {
3580         case AF_INET:
3581                 key[0]  = dst->ip.sin_addr.s_addr;
3582                 key[1]  = src->ip.sin_addr.s_addr;
3583                 key[2]  = dst->ip.sin_port;
3584                 key[3]  = src->ip.sin_port;
3585                 break;
3586         case AF_INET6: {
3587                 uint32_t *dst6_addr32 =
3588                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3589                 uint32_t *src6_addr32 =
3590                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3591                 key[0]  = dst6_addr32[3];
3592                 key[1]  = src6_addr32[3];
3593                 key[2]  = dst6_addr32[2];
3594                 key[3]  = src6_addr32[2];
3595                 key[4]  = dst6_addr32[1];
3596                 key[5]  = src6_addr32[1];
3597                 key[6]  = dst6_addr32[0];
3598                 key[7]  = src6_addr32[0];
3599                 key[8]  = dst->ip6.sin6_port;
3600                 key[9]  = src->ip6.sin6_port;
3601                 break;
3602         }
3603         default:
3604                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3605                 return key;
3606         }
3607
3608         return key;
3609 }
3610
3611 /*
3612   called when we get a read event on the raw socket
3613  */
3614 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3615                                 uint16_t flags, void *private_data)
3616 {
3617         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3618         struct ctdb_killtcp_con *con;
3619         ctdb_sock_addr src, dst;
3620         uint32_t ack_seq, seq;
3621
3622         if (!(flags & EVENT_FD_READ)) {
3623                 return;
3624         }
3625
3626         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3627                                 killtcp->private_data,
3628                                 &src, &dst,
3629                                 &ack_seq, &seq) != 0) {
3630                 /* probably a non-tcp ACK packet */
3631                 return;
3632         }
3633
3634         /* check if we have this guy in our list of connections
3635            to kill
3636         */
3637         con = trbt_lookuparray32(killtcp->connections, 
3638                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3639         if (con == NULL) {
3640                 /* no this was some other packet we can just ignore */
3641                 return;
3642         }
3643
3644         /* This one has been tickled !
3645            now reset him and remove him from the list.
3646          */
3647         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3648                 ntohs(con->dst_addr.ip.sin_port),
3649                 ctdb_addr_to_str(&con->src_addr),
3650                 ntohs(con->src_addr.ip.sin_port)));
3651
3652         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3653         talloc_free(con);
3654 }
3655
3656
3657 /* when traversing the list of all tcp connections to send tickle acks to
3658    (so that we can capture the ack coming back and kill the connection
3659     by a RST)
3660    this callback is called for each connection we are currently trying to kill
3661 */
3662 static int tickle_connection_traverse(void *param, void *data)
3663 {
3664         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3665
3666         /* have tried too many times, just give up */
3667         if (con->count >= 5) {
3668                 /* can't delete in traverse: reparent to delete_cons */
3669                 talloc_steal(param, con);
3670                 return 0;
3671         }
3672
3673         /* othervise, try tickling it again */
3674         con->count++;
3675         ctdb_sys_send_tcp(
3676                 (ctdb_sock_addr *)&con->dst_addr,
3677                 (ctdb_sock_addr *)&con->src_addr,
3678                 0, 0, 0);
3679         return 0;
3680 }
3681
3682
3683 /* 
3684    called every second until all sentenced connections have been reset
3685  */
3686 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3687                                               struct timeval t, void *private_data)
3688 {
3689         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3690         void *delete_cons = talloc_new(NULL);
3691
3692         /* loop over all connections sending tickle ACKs */
3693         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3694
3695         /* now we've finished traverse, it's safe to do deletion. */
3696         talloc_free(delete_cons);
3697
3698         /* If there are no more connections to kill we can remove the
3699            entire killtcp structure
3700          */
3701         if ( (killtcp->connections == NULL) || 
3702              (killtcp->connections->root == NULL) ) {
3703                 talloc_free(killtcp);
3704                 return;
3705         }
3706
3707         /* try tickling them again in a seconds time
3708          */
3709         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3710                         ctdb_tickle_sentenced_connections, killtcp);
3711 }
3712
3713 /*
3714   destroy the killtcp structure
3715  */
3716 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3717 {
3718         struct ctdb_vnn *tmpvnn;
3719
3720         /* verify that this vnn is still active */
3721         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3722                 if (tmpvnn == killtcp->vnn) {
3723                         break;
3724                 }
3725         }
3726
3727         if (tmpvnn == NULL) {
3728                 return 0;
3729         }
3730
3731         if (killtcp->vnn->killtcp != killtcp) {
3732                 return 0;
3733         }
3734
3735         killtcp->vnn->killtcp = NULL;
3736
3737         return 0;
3738 }
3739
3740
/* insert callback for the killtcp tree: the new connection structure
   ('parm') unconditionally replaces whatever was stored before ('data').

   the old structure is deliberately not freed here; as noted by the
   original author it is talloc_stolen by the same node in the tree
   anyway and will be deleted when the new data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;

        return replacement;
}
3752
3753 /*
3754   add a tcp socket to the list of connections we want to RST
3755  */
3756 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3757                                        ctdb_sock_addr *s,
3758                                        ctdb_sock_addr *d)
3759 {
3760         ctdb_sock_addr src, dst;
3761         struct ctdb_kill_tcp *killtcp;
3762         struct ctdb_killtcp_con *con;
3763         struct ctdb_vnn *vnn;
3764
3765         ctdb_canonicalize_ip(s, &src);
3766         ctdb_canonicalize_ip(d, &dst);
3767
3768         vnn = find_public_ip_vnn(ctdb, &dst);
3769         if (vnn == NULL) {
3770                 vnn = find_public_ip_vnn(ctdb, &src);
3771         }
3772         if (vnn == NULL) {
3773                 /* if it is not a public ip   it could be our 'single ip' */
3774                 if (ctdb->single_ip_vnn) {
3775                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3776                                 vnn = ctdb->single_ip_vnn;
3777                         }
3778                 }
3779         }
3780         if (vnn == NULL) {
3781                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3782                 return -1;
3783         }
3784
3785         killtcp = vnn->killtcp;
3786         
3787         /* If this is the first connection to kill we must allocate
3788            a new structure
3789          */
3790         if (killtcp == NULL) {
3791                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3792                 CTDB_NO_MEMORY(ctdb, killtcp);
3793
3794                 killtcp->vnn         = vnn;
3795                 killtcp->ctdb        = ctdb;
3796                 killtcp->capture_fd  = -1;
3797                 killtcp->connections = trbt_create(killtcp, 0);
3798
3799                 vnn->killtcp         = killtcp;
3800                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3801         }
3802
3803
3804
3805         /* create a structure that describes this connection we want to
3806            RST and store it in killtcp->connections
3807         */
3808         con = talloc(killtcp, struct ctdb_killtcp_con);
3809         CTDB_NO_MEMORY(ctdb, con);
3810         con->src_addr = src;
3811         con->dst_addr = dst;
3812         con->count    = 0;
3813         con->killtcp  = killtcp;
3814
3815
3816         trbt_insertarray32_callback(killtcp->connections,
3817                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3818                         add_killtcp_callback, con);
3819
3820         /* 
3821            If we dont have a socket to listen on yet we must create it
3822          */
3823         if (killtcp->capture_fd == -1) {
3824                 const char *iface = ctdb_vnn_iface_string(vnn);
3825                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3826                 if (killtcp->capture_fd == -1) {
3827                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3828                                           "socket on iface '%s' for killtcp (%s)\n",
3829                                           iface, strerror(errno)));
3830                         goto failed;
3831                 }
3832         }
3833
3834
3835         if (killtcp->fde == NULL) {
3836                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3837                                             EVENT_FD_READ,
3838                                             capture_tcp_handler, killtcp);
3839                 tevent_fd_set_auto_close(killtcp->fde);
3840
3841                 /* We also need to set up some events to tickle all these connections
3842                    until they are all reset
3843                 */
3844                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3845                                 ctdb_tickle_sentenced_connections, killtcp);
3846         }
3847
3848         /* tickle him once now */
3849         ctdb_sys_send_tcp(
3850                 &con->dst_addr,
3851                 &con->src_addr,
3852                 0, 0, 0);
3853
3854         return 0;
3855
3856 failed:
3857         talloc_free(vnn->killtcp);
3858         vnn->killtcp = NULL;
3859         return -1;
3860 }
3861
3862 /*
3863   kill a TCP connection.
3864  */
3865 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3866 {
3867         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3868
3869         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3870 }
3871
3872 /*
3873   called by a daemon to inform us of the entire list of TCP tickles for
3874   a particular public address.
3875   this control should only be sent by the node that is currently serving
3876   that public address.
3877  */
3878 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3879 {
3880         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3881         struct ctdb_tcp_array *tcparray;
3882         struct ctdb_vnn *vnn;
3883
3884         /* We must at least have tickles.num or else we cant verify the size
3885            of the received data blob
3886          */
3887         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3888                                         tickles.connections)) {
3889                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3890                 return -1;
3891         }
3892
3893         /* verify that the size of data matches what we expect */
3894         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3895                                 tickles.connections)
3896                          + sizeof(struct ctdb_tcp_connection)
3897                                  * list->tickles.num) {
3898                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3899                 return -1;
3900         }
3901
3902         vnn = find_public_ip_vnn(ctdb, &list->addr);
3903         if (vnn == NULL) {
3904                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3905                         ctdb_addr_to_str(&list->addr)));
3906
3907                 return 1;
3908         }
3909
3910         /* remove any old ticklelist we might have */
3911         talloc_free(vnn->tcp_array);
3912         vnn->tcp_array = NULL;
3913
3914         tcparray = talloc(vnn, struct ctdb_tcp_array);
3915         CTDB_NO_MEMORY(ctdb, tcparray);
3916
3917         tcparray->num = list->tickles.num;
3918
3919         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3920         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3921
3922         memcpy(tcparray->connections, &list->tickles.connections[0],
3923                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3924
3925         /* We now have a new fresh tickle list array for this vnn */
3926         vnn->tcp_array = tcparray;
3927
3928         return 0;
3929 }
3930
3931 /*
3932   called to return the full list of tickles for the puclic address associated 
3933   with the provided vnn
3934  */
3935 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3936 {
3937         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3938         struct ctdb_control_tcp_tickle_list *list;
3939         struct ctdb_tcp_array *tcparray;
3940         int num;
3941         struct ctdb_vnn *vnn;
3942
3943         vnn = find_public_ip_vnn(ctdb, addr);
3944         if (vnn == NULL) {
3945                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3946                         ctdb_addr_to_str(addr)));
3947
3948                 return 1;
3949         }
3950
3951         tcparray = vnn->tcp_array;
3952         if (tcparray) {
3953                 num = tcparray->num;
3954         } else {
3955                 num = 0;
3956         }
3957
3958         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3959                                 tickles.connections)
3960                         + sizeof(struct ctdb_tcp_connection) * num;
3961
3962         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3963         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3964         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3965
3966         list->addr = *addr;
3967         list->tickles.num = num;
3968         if (num) {
3969                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3970                         sizeof(struct ctdb_tcp_connection) * num);
3971         }
3972
3973         return 0;
3974 }
3975
3976
3977 /*
3978   set the list of all tcp tickles for a public address
3979  */
3980 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3981                                             ctdb_sock_addr *addr,
3982                                             struct ctdb_tcp_array *tcparray)
3983 {
3984         int ret, num;
3985         TDB_DATA data;
3986         struct ctdb_control_tcp_tickle_list *list;
3987
3988         if (tcparray) {
3989                 num = tcparray->num;
3990         } else {
3991                 num = 0;
3992         }
3993
3994         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3995                                 tickles.connections) +
3996                         sizeof(struct ctdb_tcp_connection) * num;
3997         data.dptr = talloc_size(ctdb, data.dsize);
3998         CTDB_NO_MEMORY(ctdb, data.dptr);
3999
4000         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
4001         list->addr = *addr;
4002         list->tickles.num = num;
4003         if (tcparray) {
4004                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
4005         }
4006
4007         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
4008                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4009                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4010         if (ret != 0) {
4011                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4012                 return -1;
4013         }
4014
4015         talloc_free(data.dptr);
4016
4017         return ret;
4018 }
4019
4020
4021 /*
4022   perform tickle updates if required
4023  */
4024 static void ctdb_update_tcp_tickles(struct event_context *ev, 
4025                                 struct timed_event *te, 
4026                                 struct timeval t, void *private_data)
4027 {
4028         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4029         int ret;
4030         struct ctdb_vnn *vnn;
4031
4032         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4033                 /* we only send out updates for public addresses that 
4034                    we have taken over
4035                  */
4036                 if (ctdb->pnn != vnn->pnn) {
4037                         continue;
4038                 }
4039                 /* We only send out the updates if we need to */
4040                 if (!vnn->tcp_update_needed) {
4041                         continue;
4042                 }
4043                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
4044                                                        &vnn->public_address,
4045                                                        vnn->tcp_array);
4046                 if (ret != 0) {
4047                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4048                                 ctdb_addr_to_str(&vnn->public_address)));
4049                 } else {
4050                         vnn->tcp_update_needed = false;
4051                 }
4052         }
4053
4054         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4055                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4056                              ctdb_update_tcp_tickles, ctdb);
4057 }               
4058         
4059
4060 /*
4061   start periodic update of tcp tickles
4062  */
4063 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4064 {
4065         ctdb->tickle_update_context = talloc_new(ctdb);
4066
4067         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4068                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4069                              ctdb_update_tcp_tickles, ctdb);
4070 }
4071
4072
4073
4074
4075 struct control_gratious_arp {
4076         struct ctdb_context *ctdb;
4077         ctdb_sock_addr addr;
4078         const char *iface;
4079         int count;
4080 };
4081
4082 /*
4083   send a control_gratuitous arp
4084  */
4085 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4086                                   struct timeval t, void *private_data)
4087 {
4088         int ret;
4089         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4090                                                         struct control_gratious_arp);
4091
4092         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4093         if (ret != 0) {
4094                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4095                                  arp->iface, strerror(errno)));
4096         }
4097
4098
4099         arp->count++;
4100         if (arp->count == CTDB_ARP_REPEAT) {
4101                 talloc_free(arp);
4102                 return;
4103         }
4104
4105         event_add_timed(arp->ctdb->ev, arp, 
4106                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4107                         send_gratious_arp, arp);
4108 }
4109
4110
4111 /*
4112   send a gratious arp 
4113  */
4114 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4115 {
4116         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4117         struct control_gratious_arp *arp;
4118
4119         /* verify the size of indata */
4120         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4121                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4122                                  (unsigned)indata.dsize, 
4123                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4124                 return -1;
4125         }
4126         if (indata.dsize != 
4127                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4128                 + gratious_arp->len ) ){
4129
4130                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4131                         "but should be %u bytes\n", 
4132                          (unsigned)indata.dsize, 
4133                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4134                 return -1;
4135         }
4136
4137
4138         arp = talloc(ctdb, struct control_gratious_arp);
4139         CTDB_NO_MEMORY(ctdb, arp);
4140
4141         arp->ctdb  = ctdb;
4142         arp->addr   = gratious_arp->addr;
4143         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4144         CTDB_NO_MEMORY(ctdb, arp->iface);
4145         arp->count = 0;
4146         
4147         event_add_timed(arp->ctdb->ev, arp, 
4148                         timeval_zero(), send_gratious_arp, arp);
4149
4150         return 0;
4151 }
4152
4153 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4154 {
4155         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4156         int ret;
4157
4158         /* verify the size of indata */
4159         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4160                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4161                 return -1;
4162         }
4163         if (indata.dsize != 
4164                 ( offsetof(struct ctdb_control_ip_iface, iface)
4165                 + pub->len ) ){
4166
4167                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4168                         "but should be %u bytes\n", 
4169                          (unsigned)indata.dsize, 
4170                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4171                 return -1;
4172         }
4173
4174         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4175
4176         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4177
4178         if (ret != 0) {
4179                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4180                 return -1;
4181         }
4182
4183         return 0;
4184 }
4185
/*
  State carried from ctdb_control_del_public_address to
  delete_ip_callback.
 */
struct delete_ip_callback_state {
        struct ctdb_req_control *c;     /* request to answer when done */
};
4189
4190 /*
4191   called when releaseip event finishes for del_public_address
4192  */
4193 static void delete_ip_callback(struct ctdb_context *ctdb,
4194                                int32_t status, TDB_DATA data,
4195                                const char *errormsg,
4196                                void *private_data)
4197 {
4198         struct delete_ip_callback_state *state =
4199                 talloc_get_type(private_data, struct delete_ip_callback_state);
4200
4201         /* If release failed then fail. */
4202         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4203         talloc_free(private_data);
4204 }
4205
4206 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4207                                         struct ctdb_req_control *c,
4208                                         TDB_DATA indata, bool *async_reply)
4209 {
4210         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4211         struct ctdb_vnn *vnn;
4212
4213         /* verify the size of indata */
4214         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4215                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4216                 return -1;
4217         }
4218         if (indata.dsize != 
4219                 ( offsetof(struct ctdb_control_ip_iface, iface)
4220                 + pub->len ) ){
4221
4222                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4223                         "but should be %u bytes\n", 
4224                          (unsigned)indata.dsize, 
4225                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4226                 return -1;
4227         }
4228
4229         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4230
4231         /* walk over all public addresses until we find a match */
4232         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4233                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4234                         if (vnn->pnn == ctdb->pnn) {
4235                                 struct delete_ip_callback_state *state;
4236                                 struct ctdb_public_ip *ip;
4237                                 TDB_DATA data;
4238                                 int ret;
4239
4240                                 vnn->delete_pending = true;
4241
4242                                 state = talloc(ctdb,
4243                                                struct delete_ip_callback_state);
4244                                 CTDB_NO_MEMORY(ctdb, state);
4245                                 state->c = c;
4246
4247                                 ip = talloc(state, struct ctdb_public_ip);
4248                                 if (ip == NULL) {
4249                                         DEBUG(DEBUG_ERR,
4250                                               (__location__ " Out of memory\n"));
4251                                         talloc_free(state);
4252                                         return -1;
4253                                 }
4254                                 ip->pnn = -1;
4255                                 ip->addr = pub->addr;
4256
4257                                 data.dsize = sizeof(struct ctdb_public_ip);
4258                                 data.dptr = (unsigned char *)ip;
4259
4260                                 ret = ctdb_daemon_send_control(ctdb,
4261                                                                ctdb_get_pnn(ctdb),
4262                                                                0,
4263                                                                CTDB_CONTROL_RELEASE_IP,
4264                                                                0, 0,
4265                                                                data,
4266                                                                delete_ip_callback,
4267                                                                state);
4268                                 if (ret == -1) {
4269                                         DEBUG(DEBUG_ERR,
4270                                               (__location__ "Unable to send "
4271                                                "CTDB_CONTROL_RELEASE_IP\n"));
4272                                         talloc_free(state);
4273                                         return -1;
4274                                 }
4275
4276                                 state->c = talloc_steal(state, c);
4277                                 *async_reply = true;
4278                         } else {
4279                                 /* This IP is not hosted on the
4280                                  * current node so just delete it
4281                                  * now. */
4282                                 do_delete_ip(ctdb, vnn);
4283                         }
4284
4285                         return 0;
4286                 }
4287         }
4288
4289         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4290                          ctdb_addr_to_str(&pub->addr)));
4291         return -1;
4292 }
4293
4294
/*
  State carried from ctdb_control_ipreallocated to
  ctdb_ipreallocated_callback.
 */
struct ipreallocated_callback_state {
        struct ctdb_req_control *c;     /* request to answer when done */
};
4298
4299 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4300                                         int status, void *p)
4301 {
4302         struct ipreallocated_callback_state *state =
4303                 talloc_get_type(p, struct ipreallocated_callback_state);
4304
4305         if (status != 0) {
4306                 DEBUG(DEBUG_ERR,
4307                       (" \"ipreallocated\" event script failed (status %d)\n",
4308                        status));
4309                 if (status == -ETIME) {
4310                         ctdb_ban_self(ctdb);
4311                 }
4312         }
4313
4314         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4315         talloc_free(state);
4316 }
4317
4318 /* A control to run the ipreallocated event */
4319 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4320                                    struct ctdb_req_control *c,
4321                                    bool *async_reply)
4322 {
4323         int ret;
4324         struct ipreallocated_callback_state *state;
4325
4326         state = talloc(ctdb, struct ipreallocated_callback_state);
4327         CTDB_NO_MEMORY(ctdb, state);
4328
4329         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4330
4331         ret = ctdb_event_script_callback(ctdb, state,
4332                                          ctdb_ipreallocated_callback, state,
4333                                          CTDB_EVENT_IPREALLOCATED,
4334                                          "%s", "");
4335
4336         if (ret != 0) {
4337                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4338                 talloc_free(state);
4339                 return -1;
4340         }
4341
4342         /* tell the control that we will be reply asynchronously */
4343         state->c    = talloc_steal(state, c);
4344         *async_reply = true;
4345
4346         return 0;
4347 }
4348
4349
4350 /* This function is called from the recovery daemon to verify that a remote
4351    node has the expected ip allocation.
4352    This is verified against ctdb->ip_tree
4353 */
4354 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4355                                 struct ctdb_all_public_ips *ips,
4356                                 uint32_t pnn)
4357 {
4358         struct ctdb_public_ip_list *tmp_ip; 
4359         int i;
4360
4361         if (ctdb->ip_tree == NULL) {
4362                 /* dont know the expected allocation yet, assume remote node
4363                    is correct. */
4364                 return 0;
4365         }
4366
4367         if (ips == NULL) {
4368                 return 0;
4369         }
4370
4371         for (i=0; i<ips->num; i++) {
4372                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4373                 if (tmp_ip == NULL) {
4374                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4375                         return -1;
4376                 }
4377
4378                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4379                         continue;
4380                 }
4381
4382                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4383                         DEBUG(DEBUG_ERR,
4384                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4385                                pnn,
4386                                ctdb_addr_to_str(&ips->ips[i].addr),
4387                                ips->ips[i].pnn, tmp_ip->pnn));
4388                         return -1;
4389                 }
4390         }
4391
4392         return 0;
4393 }
4394
4395 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4396 {
4397         struct ctdb_public_ip_list *tmp_ip; 
4398
4399         if (ctdb->ip_tree == NULL) {
4400                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4401                 return -1;
4402         }
4403
4404         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4405         if (tmp_ip == NULL) {
4406                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4407                 return -1;
4408         }
4409
4410         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4411         tmp_ip->pnn = ip->pnn;
4412
4413         return 0;
4414 }
4415
4416
/*
  Bookkeeping for one in-flight "reload public IPs" operation, which
  is carried out by a forked child process.
 */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;      /* daemon context */
        struct ctdb_req_control *c;     /* request to answer on completion */
        int status;                     /* result reported back to the requester */
        int fd[2];                      /* pipe: child writes fd[1], parent reads fd[0] */
        pid_t child;                    /* pid of the forked child */
        struct fd_event *fde;           /* read event on fd[0] */
};
4425
4426 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4427 {
4428         if (h == h->ctdb->reload_ips) {
4429                 h->ctdb->reload_ips = NULL;
4430         }
4431         if (h->c != NULL) {
4432                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4433                 h->c = NULL;
4434         }
4435         ctdb_kill(h->ctdb, h->child, SIGKILL);
4436         return 0;
4437 }
4438
4439 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4440                                 struct timed_event *te,
4441                                 struct timeval t, void *private_data)
4442 {
4443         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4444
4445         talloc_free(h);
4446 }       
4447
4448 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4449                              uint16_t flags, void *private_data)
4450 {
4451         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4452
4453         char res;
4454         int ret;
4455
4456         ret = read(h->fd[0], &res, 1);
4457         if (ret < 1 || res != 0) {
4458                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4459                 res = 1;
4460         }
4461         h->status = res;
4462
4463         talloc_free(h);
4464 }
4465
4466 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4467 {
4468         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4469         struct ctdb_all_public_ips *ips;
4470         struct ctdb_vnn *vnn;
4471         struct client_async_data *async_data;
4472         struct timeval timeout;
4473         TDB_DATA data;
4474         struct ctdb_client_control_state *state;
4475         bool first_add;
4476         int i, ret;
4477
4478         CTDB_NO_MEMORY(ctdb, mem_ctx);
4479
4480         /* Read IPs from local node */
4481         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4482                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4483         if (ret != 0) {
4484                 DEBUG(DEBUG_ERR,
4485                       ("Unable to fetch public IPs from local node\n"));
4486                 talloc_free(mem_ctx);
4487                 return -1;
4488         }
4489
4490         /* Read IPs file - this is safe since this is a child process */
4491         ctdb->vnn = NULL;
4492         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4493                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4494                 talloc_free(mem_ctx);
4495                 return -1;
4496         }
4497
4498         async_data = talloc_zero(mem_ctx, struct client_async_data);
4499         CTDB_NO_MEMORY(ctdb, async_data);
4500
4501         /* Compare IPs between node and file for IPs to be deleted */
4502         for (i = 0; i < ips->num; i++) {
4503                 /* */
4504                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4505                         if (ctdb_same_ip(&vnn->public_address,
4506                                          &ips->ips[i].addr)) {
4507                                 /* IP is still in file */
4508                                 break;
4509                         }
4510                 }
4511
4512                 if (vnn == NULL) {
4513                         /* Delete IP ips->ips[i] */
4514                         struct ctdb_control_ip_iface *pub;
4515
4516                         DEBUG(DEBUG_NOTICE,
4517                               ("IP %s no longer configured, deleting it\n",
4518                                ctdb_addr_to_str(&ips->ips[i].addr)));
4519
4520                         pub = talloc_zero(mem_ctx,
4521                                           struct ctdb_control_ip_iface);
4522                         CTDB_NO_MEMORY(ctdb, pub);
4523
4524                         pub->addr  = ips->ips[i].addr;
4525                         pub->mask  = 0;
4526                         pub->len   = 0;
4527
4528                         timeout = TAKEOVER_TIMEOUT();
4529
4530                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4531                                               iface) + pub->len;
4532                         data.dptr = (uint8_t *)pub;
4533
4534                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4535                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4536                                                   0, data, async_data,
4537                                                   &timeout, NULL);
4538                         if (state == NULL) {
4539                                 DEBUG(DEBUG_ERR,
4540                                       (__location__
4541                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4542                                 goto failed;
4543                         }
4544
4545                         ctdb_client_async_add(async_data, state);
4546                 }
4547         }
4548
4549         /* Compare IPs between node and file for IPs to be added */
4550         first_add = true;
4551         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4552                 for (i = 0; i < ips->num; i++) {
4553                         if (ctdb_same_ip(&vnn->public_address,
4554                                          &ips->ips[i].addr)) {
4555                                 /* IP already on node */
4556                                 break;
4557                         }
4558                 }
4559                 if (i == ips->num) {
4560                         /* Add IP ips->ips[i] */
4561                         struct ctdb_control_ip_iface *pub;
4562                         const char *ifaces = NULL;
4563                         uint32_t len;
4564                         int iface = 0;
4565
4566                         DEBUG(DEBUG_NOTICE,
4567                               ("New IP %s configured, adding it\n",
4568                                ctdb_addr_to_str(&vnn->public_address)));
4569                         if (first_add) {
4570                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4571
4572                                 data.dsize = sizeof(pnn);
4573                                 data.dptr  = (uint8_t *)&pnn;
4574
4575                                 ret = ctdb_client_send_message(
4576                                         ctdb,
4577                                         CTDB_BROADCAST_CONNECTED,
4578                                         CTDB_SRVID_REBALANCE_NODE,
4579                                         data);
4580                                 if (ret != 0) {
4581                                         DEBUG(DEBUG_WARNING,
4582                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4583                                 }
4584
4585                                 first_add = false;
4586                         }
4587
4588                         ifaces = vnn->ifaces[0];
4589                         iface = 1;
4590                         while (vnn->ifaces[iface] != NULL) {
4591                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4592                                                          vnn->ifaces[iface]);
4593                                 iface++;
4594                         }
4595
4596                         len   = strlen(ifaces) + 1;
4597                         pub = talloc_zero_size(mem_ctx,
4598                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4599                         CTDB_NO_MEMORY(ctdb, pub);
4600
4601                         pub->addr  = vnn->public_address;
4602                         pub->mask  = vnn->public_netmask_bits;
4603                         pub->len   = len;
4604                         memcpy(&pub->iface[0], ifaces, pub->len);
4605
4606                         timeout = TAKEOVER_TIMEOUT();
4607
4608                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4609                                               iface) + pub->len;
4610                         data.dptr = (uint8_t *)pub;
4611
4612                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4613                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4614                                                   0, data, async_data,
4615                                                   &timeout, NULL);
4616                         if (state == NULL) {
4617                                 DEBUG(DEBUG_ERR,
4618                                       (__location__
4619                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4620                                 goto failed;
4621                         }
4622
4623                         ctdb_client_async_add(async_data, state);
4624                 }
4625         }
4626
4627         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4628                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4629                 goto failed;
4630         }
4631
4632         talloc_free(mem_ctx);
4633         return 0;
4634
4635 failed:
4636         talloc_free(mem_ctx);
4637         return -1;
4638 }
4639
4640 /* This control is sent to force the node to re-read the public addresses file
4641    and drop any addresses we should nnot longer host, and add new addresses
4642    that we are now able to host
4643 */
4644 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4645 {
4646         struct ctdb_reloadips_handle *h;
4647         pid_t parent = getpid();
4648
4649         if (ctdb->reload_ips != NULL) {
4650                 talloc_free(ctdb->reload_ips);
4651                 ctdb->reload_ips = NULL;
4652         }
4653
4654         h = talloc(ctdb, struct ctdb_reloadips_handle);
4655         CTDB_NO_MEMORY(ctdb, h);
4656         h->ctdb     = ctdb;
4657         h->c        = NULL;
4658         h->status   = -1;
4659         
4660         if (pipe(h->fd) == -1) {
4661                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4662                 talloc_free(h);
4663                 return -1;
4664         }
4665
4666         h->child = ctdb_fork(ctdb);
4667         if (h->child == (pid_t)-1) {
4668                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4669                 close(h->fd[0]);
4670                 close(h->fd[1]);
4671                 talloc_free(h);
4672                 return -1;
4673         }
4674
4675         /* child process */
4676         if (h->child == 0) {
4677                 signed char res = 0;
4678
4679                 close(h->fd[0]);
4680                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4681
4682                 ctdb_set_process_name("ctdb_reloadips");
4683                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4684                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4685                         res = -1;
4686                 } else {
4687                         res = ctdb_reloadips_child(ctdb);
4688                         if (res != 0) {
4689                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4690                         }
4691                 }
4692
4693                 write(h->fd[1], &res, 1);
4694                 /* make sure we die when our parent dies */
4695                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4696                         sleep(5);
4697                 }
4698                 _exit(0);
4699         }
4700
4701         h->c             = talloc_steal(h, c);
4702
4703         close(h->fd[1]);
4704         set_close_on_exec(h->fd[0]);
4705
4706         talloc_set_destructor(h, ctdb_reloadips_destructor);
4707
4708
4709         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4710                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4711                         (void *)h);
4712         tevent_fd_set_auto_close(h->fde);
4713
4714         event_add_timed(ctdb->ev, h,
4715                         timeval_current_ofs(120, 0),
4716                         ctdb_reloadips_timeout_event, h);
4717
4718         /* we reply later */
4719         *async_reply = true;
4720         return 0;
4721 }