91f30302d777f2adebdd78ec5a14cab654e2a45f
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value built from the takeover_timeout tunable.  The expansion
 * refers to a variable named 'ctdb', which must be in scope wherever the
 * macro is used.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* First argument given to timeval_current_ofs() when the gratuitous
 * ARP timer is re-armed. */
#define CTDB_ARP_INTERVAL 1
/* Number of runs of the gratuitous ARP handler per announcement. */
#define CTDB_ARP_REPEAT   3
35
/*
 * Flags used in IP allocation algorithms.
 *
 * NOTE(review): the code that fills in and consumes these flags is not
 * in this part of the file; the names suggest "may not take over IPs"
 * and "may not host IPs" - confirm against the users.
 */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
};
41
/*
 * A local network interface known to the daemon.  Entries are kept on
 * the ctdb->ifaces list.
 */
struct ctdb_iface {
        /* list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name; lookups compare it with strcmp() */
        const char *name;
        /* only interfaces with link_up set are picked for an address */
        bool link_up;
        /* number of public addresses currently assigned to this interface */
        uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;
285         uint32_t count;
286         ctdb_sock_addr addr;
287         struct ctdb_tcp_array *tcparray;
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;
307         uint32_t client_id;
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
400 struct takeover_callback_state {
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/* State carried from ctdb_do_takeip() to its event script callback. */
struct ctdb_do_takeip_state {
        /* control request that gets the reply once the script is done */
        struct ctdb_req_control *c;
        /* public address being taken over */
        struct ctdb_vnn *vnn;
};
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          false,
514                                          CTDB_EVENT_TAKE_IP,
515                                          "%s %s %u",
516                                          ctdb_vnn_iface_string(vnn),
517                                          ctdb_addr_to_str(&vnn->public_address),
518                                          vnn->public_netmask_bits);
519
520         if (ret != 0) {
521                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
522                         ctdb_addr_to_str(&vnn->public_address),
523                         ctdb_vnn_iface_string(vnn)));
524                 talloc_free(state);
525                 return -1;
526         }
527
528         return 0;
529 }
530
/* State carried from ctdb_do_updateip() to its event script callback. */
struct ctdb_do_updateip_state {
        /* control request that gets the reply once the script is done */
        struct ctdb_req_control *c;
        /* interface the address was on before the move */
        struct ctdb_iface *old;
        /* public address being moved */
        struct ctdb_vnn *vnn;
};
536
537 /*
538   called when updateip event finishes
539  */
540 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541                                       void *private_data)
542 {
543         struct ctdb_do_updateip_state *state =
544                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545         int32_t ret;
546
547         if (status != 0) {
548                 if (status == -ETIME) {
549                         ctdb_ban_self(ctdb);
550                 }
551                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552                         ctdb_addr_to_str(&state->vnn->public_address),
553                         state->old->name,
554                         ctdb_vnn_iface_string(state->vnn)));
555
556                 /*
557                  * All we can do is reset the old interface
558                  * and let the next run fix it
559                  */
560                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561                 state->vnn->iface = state->old;
562                 state->vnn->iface->references++;
563
564                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565                 talloc_free(state);
566                 return;
567         }
568
569         if (ctdb->do_checkpublicip) {
570
571         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572         if (ret != 0) {
573                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574                 talloc_free(state);
575                 return;
576         }
577
578         }
579
580         /* the control succeeded */
581         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582         talloc_free(state);
583         return;
584 }
585
586 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
587 {
588         state->vnn->update_in_flight = false;
589         return 0;
590 }
591
592 /*
593   update (move) an ip address
594  */
595 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596                                 struct ctdb_req_control *c,
597                                 struct ctdb_vnn *vnn)
598 {
599         int ret;
600         struct ctdb_do_updateip_state *state;
601         struct ctdb_iface *old = vnn->iface;
602         const char *new_name;
603
604         if (vnn->update_in_flight) {
605                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606                                     "update for this IP already in flight\n",
607                                     ctdb_addr_to_str(&vnn->public_address),
608                                     vnn->public_netmask_bits));
609                 return -1;
610         }
611
612         ctdb_vnn_unassign_iface(ctdb, vnn);
613         ret = ctdb_vnn_assign_iface(ctdb, vnn);
614         if (ret != 0) {
615                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616                                  "assin a usable interface (old iface '%s')\n",
617                                  ctdb_addr_to_str(&vnn->public_address),
618                                  vnn->public_netmask_bits,
619                                  old->name));
620                 return -1;
621         }
622
623         new_name = ctdb_vnn_iface_string(vnn);
624         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625                 /* A benign update from one interface onto itself.
626                  * no need to run the eventscripts in this case, just return
627                  * success.
628                  */
629                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630                 return 0;
631         }
632
633         state = talloc(vnn, struct ctdb_do_updateip_state);
634         CTDB_NO_MEMORY(ctdb, state);
635
636         state->c = talloc_steal(ctdb, c);
637         state->old = old;
638         state->vnn = vnn;
639
640         vnn->update_in_flight = true;
641         talloc_set_destructor(state, ctdb_updateip_destructor);
642
643         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644                             "interface %s to %s\n",
645                             ctdb_addr_to_str(&vnn->public_address),
646                             vnn->public_netmask_bits,
647                             old->name,
648                             new_name));
649
650         ret = ctdb_event_script_callback(ctdb,
651                                          state,
652                                          ctdb_do_updateip_callback,
653                                          state,
654                                          false,
655                                          CTDB_EVENT_UPDATE_IP,
656                                          "%s %s %s %u",
657                                          state->old->name,
658                                          new_name,
659                                          ctdb_addr_to_str(&vnn->public_address),
660                                          vnn->public_netmask_bits);
661         if (ret != 0) {
662                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
663                                  ctdb_addr_to_str(&vnn->public_address),
664                                  old->name, new_name));
665                 talloc_free(state);
666                 return -1;
667         }
668
669         return 0;
670 }
671
672 /*
673   Find the vnn of the node that has a public ip address
674   returns -1 if the address is not known as a public address
675  */
676 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
677 {
678         struct ctdb_vnn *vnn;
679
680         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681                 if (ctdb_same_ip(&vnn->public_address, addr)) {
682                         return vnn;
683                 }
684         }
685
686         return NULL;
687 }
688
689 /*
690   take over an ip address
691  */
692 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693                                  struct ctdb_req_control *c,
694                                  TDB_DATA indata,
695                                  bool *async_reply)
696 {
697         int ret;
698         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699         struct ctdb_vnn *vnn;
700         bool have_ip = false;
701         bool do_updateip = false;
702         bool do_takeip = false;
703         struct ctdb_iface *best_iface = NULL;
704
705         if (pip->pnn != ctdb->pnn) {
706                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707                                  "with pnn %d, but we're node %d\n",
708                                  ctdb_addr_to_str(&pip->addr),
709                                  pip->pnn, ctdb->pnn));
710                 return -1;
711         }
712
713         /* update out vnn list */
714         vnn = find_public_ip_vnn(ctdb, &pip->addr);
715         if (vnn == NULL) {
716                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717                         ctdb_addr_to_str(&pip->addr)));
718                 return 0;
719         }
720
721         if (ctdb->do_checkpublicip) {
722                 have_ip = ctdb_sys_have_ip(&pip->addr);
723         }
724         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725         if (best_iface == NULL) {
726                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727                                  "a usable interface (old %s, have_ip %d)\n",
728                                  ctdb_addr_to_str(&vnn->public_address),
729                                  vnn->public_netmask_bits,
730                                  ctdb_vnn_iface_string(vnn),
731                                  have_ip));
732                 return -1;
733         }
734
735         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737                 have_ip = false;
738         }
739
740
741         if (vnn->iface == NULL && have_ip) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744                                  ctdb_addr_to_str(&vnn->public_address)));
745                 return 0;
746         }
747
748         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750                                   "and we have it on iface[%s], but it was assigned to node %d"
751                                   "and we are node %d, banning ourself\n",
752                                  ctdb_addr_to_str(&vnn->public_address),
753                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754                 ctdb_ban_self(ctdb);
755                 return -1;
756         }
757
758         if (vnn->pnn == -1 && have_ip) {
759                 vnn->pnn = ctdb->pnn;
760                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761                                   "and we already have it on iface[%s], update local daemon\n",
762                                  ctdb_addr_to_str(&vnn->public_address),
763                                   ctdb_vnn_iface_string(vnn)));
764                 return 0;
765         }
766
767         if (vnn->iface) {
768                 if (vnn->iface != best_iface) {
769                         if (!vnn->iface->link_up) {
770                                 do_updateip = true;
771                         } else if (vnn->iface->references > (best_iface->references + 1)) {
772                                 /* only move when the rebalance gains something */
773                                         do_updateip = true;
774                         }
775                 }
776         }
777
778         if (!have_ip) {
779                 if (do_updateip) {
780                         ctdb_vnn_unassign_iface(ctdb, vnn);
781                         do_updateip = false;
782                 }
783                 do_takeip = true;
784         }
785
786         if (do_takeip) {
787                 ret = ctdb_do_takeip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else if (do_updateip) {
792                 ret = ctdb_do_updateip(ctdb, c, vnn);
793                 if (ret != 0) {
794                         return -1;
795                 }
796         } else {
797                 /*
798                  * The interface is up and the kernel known the ip
799                  * => do nothing
800                  */
801                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802                         ctdb_addr_to_str(&pip->addr),
803                         vnn->public_netmask_bits,
804                         ctdb_vnn_iface_string(vnn)));
805                 return 0;
806         }
807
808         /* tell ctdb_control.c that we will be replying asynchronously */
809         *async_reply = true;
810
811         return 0;
812 }
813
814 /*
815   takeover an ip address old v4 style
816  */
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
818                                 struct ctdb_req_control *c,
819                                 TDB_DATA indata, 
820                                 bool *async_reply)
821 {
822         TDB_DATA data;
823         
824         data.dsize = sizeof(struct ctdb_public_ip);
825         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826         CTDB_NO_MEMORY(ctdb, data.dptr);
827         
828         memcpy(data.dptr, indata.dptr, indata.dsize);
829         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
830 }
831
832 /*
833   kill any clients that are registered with a IP that is being released
834  */
835 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
836 {
837         struct ctdb_client_ip *ip;
838
839         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
840                 ctdb_addr_to_str(addr)));
841
842         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
843                 ctdb_sock_addr tmp_addr;
844
845                 tmp_addr = ip->addr;
846                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
847                         ip->client_id,
848                         ctdb_addr_to_str(&ip->addr)));
849
850                 if (ctdb_same_ip(&tmp_addr, addr)) {
851                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
852                                                                      ip->client_id, 
853                                                                      struct ctdb_client);
854                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
855                                 ip->client_id,
856                                 ctdb_addr_to_str(&ip->addr),
857                                 client->pid));
858
859                         if (client->pid != 0) {
860                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
861                                         (unsigned)client->pid,
862                                         ctdb_addr_to_str(addr),
863                                         ip->client_id));
864                                 kill(client->pid, SIGKILL);
865                         }
866                 }
867         }
868 }
869
870 /*
871   called when releaseip event finishes
872  */
873 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
874                                 void *private_data)
875 {
876         struct takeover_callback_state *state = 
877                 talloc_get_type(private_data, struct takeover_callback_state);
878         TDB_DATA data;
879
880         if (status == -ETIME) {
881                 ctdb_ban_self(ctdb);
882         }
883
884         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
885                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
886                                   ctdb_addr_to_str(state->addr)));
887                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
888                 talloc_free(state);
889                 return;
890         }
891
892         /* send a message to all clients of this node telling them
893            that the cluster has been reconfigured and they should
894            release any sockets on this IP */
895         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
896         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
897         data.dsize = strlen((char *)data.dptr)+1;
898
899         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
900
901         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
902
903         /* kill clients that have registered with this IP */
904         release_kill_clients(ctdb, state->addr);
905
906         ctdb_vnn_unassign_iface(ctdb, state->vnn);
907
908         /* the control succeeded */
909         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
910         talloc_free(state);
911 }
912
913 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
914 {
915         state->vnn->update_in_flight = false;
916         return 0;
917 }
918
919 /*
920   release an ip address
921  */
922 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
923                                 struct ctdb_req_control *c,
924                                 TDB_DATA indata, 
925                                 bool *async_reply)
926 {
927         int ret;
928         struct takeover_callback_state *state;
929         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
930         struct ctdb_vnn *vnn;
931         char *iface;
932
933         /* update our vnn list */
934         vnn = find_public_ip_vnn(ctdb, &pip->addr);
935         if (vnn == NULL) {
936                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
937                         ctdb_addr_to_str(&pip->addr)));
938                 return 0;
939         }
940         vnn->pnn = pip->pnn;
941
942         /* stop any previous arps */
943         talloc_free(vnn->takeover_ctx);
944         vnn->takeover_ctx = NULL;
945
946         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
947          * lazy multicast to drop an IP from any node that isn't the
948          * intended new node.  The following causes makes ctdbd ignore
949          * a release for any address it doesn't host.
950          */
951         if (ctdb->do_checkpublicip) {
952                 if (!ctdb_sys_have_ip(&pip->addr)) {
953                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
954                                 ctdb_addr_to_str(&pip->addr),
955                                 vnn->public_netmask_bits,
956                                 ctdb_vnn_iface_string(vnn)));
957                         ctdb_vnn_unassign_iface(ctdb, vnn);
958                         return 0;
959                 }
960         } else {
961                 if (vnn->iface == NULL) {
962                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
963                                            ctdb_addr_to_str(&pip->addr),
964                                            vnn->public_netmask_bits));
965                         return 0;
966                 }
967         }
968
969         /* There is a potential race between take_ip and us because we
970          * update the VNN via a callback that run when the
971          * eventscripts have been run.  Avoid the race by allowing one
972          * update to be in flight at a time.
973          */
974         if (vnn->update_in_flight) {
975                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
976                                     "update for this IP already in flight\n",
977                                     ctdb_addr_to_str(&vnn->public_address),
978                                     vnn->public_netmask_bits));
979                 return -1;
980         }
981
982         if (ctdb->do_checkpublicip) {
983                 iface = ctdb_sys_find_ifname(&pip->addr);
984                 if (iface == NULL) {
985                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
986                         return 0;
987                 }
988                 if (vnn->iface == NULL) {
989                         DEBUG(DEBUG_WARNING,
990                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
991                                ctdb_addr_to_str(&pip->addr),
992                                iface));
993                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
994                         DEBUG(DEBUG_WARNING,
995                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
996                                ctdb_addr_to_str(&pip->addr),
997                                iface,
998                                ctdb_vnn_iface_string(vnn)));
999                         /* Should we fix vnn->iface?  If we do, what
1000                          * happens to reference counts?
1001                          */
1002                 }
1003         } else {
1004                 iface = strdup(ctdb_vnn_iface_string(vnn));
1005         }
1006
1007         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1008                 ctdb_addr_to_str(&pip->addr),
1009                 vnn->public_netmask_bits,
1010                 iface,
1011                 pip->pnn));
1012
1013         state = talloc(ctdb, struct takeover_callback_state);
1014         CTDB_NO_MEMORY(ctdb, state);
1015
1016         state->c = talloc_steal(state, c);
1017         state->addr = talloc(state, ctdb_sock_addr);       
1018         CTDB_NO_MEMORY(ctdb, state->addr);
1019         *state->addr = pip->addr;
1020         state->vnn   = vnn;
1021
1022         vnn->update_in_flight = true;
1023         talloc_set_destructor(state, ctdb_releaseip_destructor);
1024
1025         ret = ctdb_event_script_callback(ctdb, 
1026                                          state, release_ip_callback, state,
1027                                          false,
1028                                          CTDB_EVENT_RELEASE_IP,
1029                                          "%s %s %u",
1030                                          iface,
1031                                          ctdb_addr_to_str(&pip->addr),
1032                                          vnn->public_netmask_bits);
1033         free(iface);
1034         if (ret != 0) {
1035                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1036                         ctdb_addr_to_str(&pip->addr),
1037                         ctdb_vnn_iface_string(vnn)));
1038                 talloc_free(state);
1039                 return -1;
1040         }
1041
1042         /* tell the control that we will be reply asynchronously */
1043         *async_reply = true;
1044         return 0;
1045 }
1046
1047 /*
1048   release an ip address old v4 style
1049  */
1050 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1051                                 struct ctdb_req_control *c,
1052                                 TDB_DATA indata, 
1053                                 bool *async_reply)
1054 {
1055         TDB_DATA data;
1056         
1057         data.dsize = sizeof(struct ctdb_public_ip);
1058         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1059         CTDB_NO_MEMORY(ctdb, data.dptr);
1060         
1061         memcpy(data.dptr, indata.dptr, indata.dsize);
1062         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1063 }
1064
1065
1066 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1067                                    ctdb_sock_addr *addr,
1068                                    unsigned mask, const char *ifaces,
1069                                    bool check_address)
1070 {
1071         struct ctdb_vnn      *vnn;
1072         uint32_t num = 0;
1073         char *tmp;
1074         const char *iface;
1075         int i;
1076         int ret;
1077
1078         tmp = strdup(ifaces);
1079         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1080                 if (!ctdb_sys_check_iface_exists(iface)) {
1081                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1082                         free(tmp);
1083                         return -1;
1084                 }
1085         }
1086         free(tmp);
1087
1088         /* Verify that we dont have an entry for this ip yet */
1089         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1090                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1091                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1092                                 ctdb_addr_to_str(addr)));
1093                         return -1;
1094                 }               
1095         }
1096
1097         /* create a new vnn structure for this ip address */
1098         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1099         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1100         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1101         tmp = talloc_strdup(vnn, ifaces);
1102         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1103         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1104                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1105                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1106                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1107                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1108                 num++;
1109         }
1110         talloc_free(tmp);
1111         vnn->ifaces[num] = NULL;
1112         vnn->public_address      = *addr;
1113         vnn->public_netmask_bits = mask;
1114         vnn->pnn                 = -1;
1115         if (check_address) {
1116                 if (ctdb_sys_have_ip(addr)) {
1117                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1118                         vnn->pnn = ctdb->pnn;
1119                 }
1120         }
1121
1122         for (i=0; vnn->ifaces[i]; i++) {
1123                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1124                 if (ret != 0) {
1125                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1126                                            "for public_address[%s]\n",
1127                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1128                         talloc_free(vnn);
1129                         return -1;
1130                 }
1131         }
1132
1133         DLIST_ADD(ctdb->vnn, vnn);
1134
1135         return 0;
1136 }
1137
1138 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1139                                   struct timeval t, void *private_data)
1140 {
1141         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1142                                                         struct ctdb_context);
1143         struct ctdb_vnn *vnn;
1144
1145         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1146                 int i;
1147
1148                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1149                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1150                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1151                                         vnn->ifaces[i],
1152                                         ctdb_addr_to_str(&vnn->public_address)));
1153                         }
1154                 }
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160 }
1161
1162
1163 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1164 {
1165         if (ctdb->check_public_ifaces_ctx != NULL) {
1166                 talloc_free(ctdb->check_public_ifaces_ctx);
1167                 ctdb->check_public_ifaces_ctx = NULL;
1168         }
1169
1170         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1171         if (ctdb->check_public_ifaces_ctx == NULL) {
1172                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1173         }
1174
1175         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1176                 timeval_current_ofs(30, 0), 
1177                 ctdb_check_interfaces_event, ctdb);
1178
1179         return 0;
1180 }
1181
1182
1183 /*
1184   setup the public address lists from a file
1185 */
1186 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1187 {
1188         char **lines;
1189         int nlines;
1190         int i;
1191
1192         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1193         if (lines == NULL) {
1194                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1195                 return -1;
1196         }
1197         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1198                 nlines--;
1199         }
1200
1201         for (i=0;i<nlines;i++) {
1202                 unsigned mask;
1203                 ctdb_sock_addr addr;
1204                 const char *addrstr;
1205                 const char *ifaces;
1206                 char *tok, *line;
1207
1208                 line = lines[i];
1209                 while ((*line == ' ') || (*line == '\t')) {
1210                         line++;
1211                 }
1212                 if (*line == '#') {
1213                         continue;
1214                 }
1215                 if (strcmp(line, "") == 0) {
1216                         continue;
1217                 }
1218                 tok = strtok(line, " \t");
1219                 addrstr = tok;
1220                 tok = strtok(NULL, " \t");
1221                 if (tok == NULL) {
1222                         if (NULL == ctdb->default_public_interface) {
1223                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1224                                          i+1));
1225                                 talloc_free(lines);
1226                                 return -1;
1227                         }
1228                         ifaces = ctdb->default_public_interface;
1229                 } else {
1230                         ifaces = tok;
1231                 }
1232
1233                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1234                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1235                         talloc_free(lines);
1236                         return -1;
1237                 }
1238                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1239                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1240                         talloc_free(lines);
1241                         return -1;
1242                 }
1243         }
1244
1245
1246         talloc_free(lines);
1247         return 0;
1248 }
1249
1250 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1251                               const char *iface,
1252                               const char *ip)
1253 {
1254         struct ctdb_vnn *svnn;
1255         struct ctdb_iface *cur = NULL;
1256         bool ok;
1257         int ret;
1258
1259         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1260         CTDB_NO_MEMORY(ctdb, svnn);
1261
1262         svnn->ifaces = talloc_array(svnn, const char *, 2);
1263         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1264         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1265         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1266         svnn->ifaces[1] = NULL;
1267
1268         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1269         if (!ok) {
1270                 talloc_free(svnn);
1271                 return -1;
1272         }
1273
1274         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1275         if (ret != 0) {
1276                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1277                                    "for single_ip[%s]\n",
1278                                    svnn->ifaces[0],
1279                                    ctdb_addr_to_str(&svnn->public_address)));
1280                 talloc_free(svnn);
1281                 return -1;
1282         }
1283
1284         /* assume the single public ip interface is initially "good" */
1285         cur = ctdb_find_iface(ctdb, iface);
1286         if (cur == NULL) {
1287                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1288                 return -1;
1289         }
1290         cur->link_up = true;
1291
1292         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1293         if (ret != 0) {
1294                 talloc_free(svnn);
1295                 return -1;
1296         }
1297
1298         ctdb->single_ip_vnn = svnn;
1299         return 0;
1300 }
1301
/* One public IP address in the singly linked list used by the IP
 * allocation code in this file. */
struct ctdb_public_ip_list {
        /* next entry in the list, NULL at the end */
        struct ctdb_public_ip_list *next;
        /* node the address is assigned to; the allocation code
         * compares it against -1 to mean "unassigned" */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1307
1308 /* Given a physical node, return the number of
1309    public addresses that is currently assigned to this node.
1310 */
1311 static int node_ip_coverage(struct ctdb_context *ctdb, 
1312         int32_t pnn,
1313         struct ctdb_public_ip_list *ips)
1314 {
1315         int num=0;
1316
1317         for (;ips;ips=ips->next) {
1318                 if (ips->pnn == pnn) {
1319                         num++;
1320                 }
1321         }
1322         return num;
1323 }
1324
1325
1326 /* Can the given node host the given IP: is the public IP known to the
1327  * node and is NOIPHOST unset?
1328 */
1329 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1330                              struct ctdb_ipflags ipflags,
1331                              struct ctdb_public_ip_list *ip)
1332 {
1333         struct ctdb_all_public_ips *public_ips;
1334         int i;
1335
1336         if (ipflags.noiphost) {
1337                 return false;
1338         }
1339
1340         public_ips = ctdb->nodes[pnn]->available_public_ips;
1341
1342         if (public_ips == NULL) {
1343                 return false;
1344         }
1345
1346         for (i=0; i<public_ips->num; i++) {
1347                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1348                         /* yes, this node can serve this public ip */
1349                         return true;
1350                 }
1351         }
1352
1353         return false;
1354 }
1355
1356 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1357                                  struct ctdb_ipflags ipflags,
1358                                  struct ctdb_public_ip_list *ip)
1359 {
1360         if (ipflags.noiptakeover) {
1361                 return false;
1362         }
1363
1364         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1365 }
1366
1367 /* search the node lists list for a node to takeover this ip.
1368    pick the node that currently are serving the least number of ips
1369    so that the ips get spread out evenly.
1370 */
1371 static int find_takeover_node(struct ctdb_context *ctdb, 
1372                 struct ctdb_ipflags *ipflags,
1373                 struct ctdb_public_ip_list *ip,
1374                 struct ctdb_public_ip_list *all_ips)
1375 {
1376         int pnn, min=0, num;
1377         int i, numnodes;
1378
1379         numnodes = talloc_array_length(ipflags);
1380         pnn    = -1;
1381         for (i=0; i<numnodes; i++) {
1382                 /* verify that this node can serve this ip */
1383                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1384                         /* no it couldnt   so skip to the next node */
1385                         continue;
1386                 }
1387
1388                 num = node_ip_coverage(ctdb, i, all_ips);
1389                 /* was this the first node we checked ? */
1390                 if (pnn == -1) {
1391                         pnn = i;
1392                         min  = num;
1393                 } else {
1394                         if (num < min) {
1395                                 pnn = i;
1396                                 min  = num;
1397                         }
1398                 }
1399         }       
1400         if (pnn == -1) {
1401                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1402                         ctdb_addr_to_str(&ip->addr)));
1403
1404                 return -1;
1405         }
1406
1407         ip->pnn = pnn;
1408         return 0;
1409 }
1410
1411 #define IP_KEYLEN       4
1412 static uint32_t *ip_key(ctdb_sock_addr *ip)
1413 {
1414         static uint32_t key[IP_KEYLEN];
1415
1416         bzero(key, sizeof(key));
1417
1418         switch (ip->sa.sa_family) {
1419         case AF_INET:
1420                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1421                 break;
1422         case AF_INET6: {
1423                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1424                 key[0]  = htonl(s6_a32[0]);
1425                 key[1]  = htonl(s6_a32[1]);
1426                 key[2]  = htonl(s6_a32[2]);
1427                 key[3]  = htonl(s6_a32[3]);
1428                 break;
1429         }
1430         default:
1431                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1432                 return key;
1433         }
1434
1435         return key;
1436 }
1437
1438 static void *add_ip_callback(void *parm, void *data)
1439 {
1440         struct ctdb_public_ip_list *this_ip = parm; 
1441         struct ctdb_public_ip_list *prev_ip = data; 
1442
1443         if (prev_ip == NULL) {
1444                 return parm;
1445         }
1446         if (this_ip->pnn == -1) {
1447                 this_ip->pnn = prev_ip->pnn;
1448         }
1449
1450         return parm;
1451 }
1452
1453 static int getips_count_callback(void *param, void *data)
1454 {
1455         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1456         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1457
1458         new_ip->next = *ip_list;
1459         *ip_list     = new_ip;
1460         return 0;
1461 }
1462
1463 static struct ctdb_public_ip_list *
1464 create_merged_ip_list(struct ctdb_context *ctdb)
1465 {
1466         int i, j;
1467         struct ctdb_public_ip_list *ip_list;
1468         struct ctdb_all_public_ips *public_ips;
1469
1470         if (ctdb->ip_tree != NULL) {
1471                 talloc_free(ctdb->ip_tree);
1472                 ctdb->ip_tree = NULL;
1473         }
1474         ctdb->ip_tree = trbt_create(ctdb, 0);
1475
1476         for (i=0;i<ctdb->num_nodes;i++) {
1477                 public_ips = ctdb->nodes[i]->known_public_ips;
1478
1479                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1480                         continue;
1481                 }
1482
1483                 /* there were no public ips for this node */
1484                 if (public_ips == NULL) {
1485                         continue;
1486                 }               
1487
1488                 for (j=0;j<public_ips->num;j++) {
1489                         struct ctdb_public_ip_list *tmp_ip; 
1490
1491                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1492                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1493                         /* Do not use information about IP addresses hosted
1494                          * on other nodes, it may not be accurate */
1495                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1496                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1497                         } else {
1498                                 tmp_ip->pnn = -1;
1499                         }
1500                         tmp_ip->addr = public_ips->ips[j].addr;
1501                         tmp_ip->next = NULL;
1502
1503                         trbt_insertarray32_callback(ctdb->ip_tree,
1504                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1505                                 add_ip_callback,
1506                                 tmp_ip);
1507                 }
1508         }
1509
1510         ip_list = NULL;
1511         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1512
1513         return ip_list;
1514 }
1515
1516 /* 
1517  * This is the length of the longtest common prefix between the IPs.
1518  * It is calculated by XOR-ing the 2 IPs together and counting the
1519  * number of leading zeroes.  The implementation means that all
1520  * addresses end up being 128 bits long.
1521  *
1522  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1523  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1524  * lots of nodes and IP addresses?
1525  */
1526 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1527 {
1528         uint32_t ip1_k[IP_KEYLEN];
1529         uint32_t *t;
1530         int i;
1531         uint32_t x;
1532
1533         uint32_t distance = 0;
1534
1535         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1536         t = ip_key(ip2);
1537         for (i=0; i<IP_KEYLEN; i++) {
1538                 x = ip1_k[i] ^ t[i];
1539                 if (x == 0) {
1540                         distance += 32;
1541                 } else {
1542                         /* Count number of leading zeroes. 
1543                          * FIXME? This could be optimised...
1544                          */
1545                         while ((x & (1 << 31)) == 0) {
1546                                 x <<= 1;
1547                                 distance += 1;
1548                         }
1549                 }
1550         }
1551
1552         return distance;
1553 }
1554
1555 /* Calculate the IP distance for the given IP relative to IPs on the
1556    given node.  The ips argument is generally the all_ips variable
1557    used in the main part of the algorithm.
1558  */
1559 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1560                                   struct ctdb_public_ip_list *ips,
1561                                   int pnn)
1562 {
1563         struct ctdb_public_ip_list *t;
1564         uint32_t d;
1565
1566         uint32_t sum = 0;
1567
1568         for (t=ips; t != NULL; t=t->next) {
1569                 if (t->pnn != pnn) {
1570                         continue;
1571                 }
1572
1573                 /* Optimisation: We never calculate the distance
1574                  * between an address and itself.  This allows us to
1575                  * calculate the effect of removing an address from a
1576                  * node by simply calculating the distance between
1577                  * that address and all of the exitsing addresses.
1578                  * Moreover, we assume that we're only ever dealing
1579                  * with addresses from all_ips so we can identify an
1580                  * address via a pointer rather than doing a more
1581                  * expensive address comparison. */
1582                 if (&(t->addr) == ip) {
1583                         continue;
1584                 }
1585
1586                 d = ip_distance(ip, &(t->addr));
1587                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1588         }
1589
1590         return sum;
1591 }
1592
1593 /* Return the LCP2 imbalance metric for addresses currently assigned
1594    to the given node.
1595  */
1596 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1597 {
1598         struct ctdb_public_ip_list *t;
1599
1600         uint32_t imbalance = 0;
1601
1602         for (t=all_ips; t!=NULL; t=t->next) {
1603                 if (t->pnn != pnn) {
1604                         continue;
1605                 }
1606                 /* Pass the rest of the IPs rather than the whole
1607                    all_ips input list.
1608                 */
1609                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1610         }
1611
1612         return imbalance;
1613 }
1614
1615 /* Allocate any unassigned IPs just by looping through the IPs and
1616  * finding the best node for each.
1617  */
1618 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1619                                       struct ctdb_ipflags *ipflags,
1620                                       struct ctdb_public_ip_list *all_ips)
1621 {
1622         struct ctdb_public_ip_list *tmp_ip;
1623
1624         /* loop over all ip's and find a physical node to cover for 
1625            each unassigned ip.
1626         */
1627         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1628                 if (tmp_ip->pnn == -1) {
1629                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1630                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1631                                         ctdb_addr_to_str(&tmp_ip->addr)));
1632                         }
1633                 }
1634         }
1635 }
1636
1637 /* Basic non-deterministic rebalancing algorithm.
1638  */
1639 static void basic_failback(struct ctdb_context *ctdb,
1640                            struct ctdb_ipflags *ipflags,
1641                            struct ctdb_public_ip_list *all_ips,
1642                            int num_ips)
1643 {
1644         int i, numnodes;
1645         int maxnode, maxnum, minnode, minnum, num, retries;
1646         struct ctdb_public_ip_list *tmp_ip;
1647
1648         numnodes = talloc_array_length(ipflags);
1649         retries = 0;
1650
1651 try_again:
1652         maxnum=0;
1653         minnum=0;
1654
1655         /* for each ip address, loop over all nodes that can serve
1656            this ip and make sure that the difference between the node
1657            serving the most and the node serving the least ip's are
1658            not greater than 1.
1659         */
1660         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1661                 if (tmp_ip->pnn == -1) {
1662                         continue;
1663                 }
1664
1665                 /* Get the highest and lowest number of ips's served by any 
1666                    valid node which can serve this ip.
1667                 */
1668                 maxnode = -1;
1669                 minnode = -1;
1670                 for (i=0; i<numnodes; i++) {
1671                         /* only check nodes that can actually serve this ip */
1672                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1673                                 /* no it couldnt   so skip to the next node */
1674                                 continue;
1675                         }
1676
1677                         num = node_ip_coverage(ctdb, i, all_ips);
1678                         if (maxnode == -1) {
1679                                 maxnode = i;
1680                                 maxnum  = num;
1681                         } else {
1682                                 if (num > maxnum) {
1683                                         maxnode = i;
1684                                         maxnum  = num;
1685                                 }
1686                         }
1687                         if (minnode == -1) {
1688                                 minnode = i;
1689                                 minnum  = num;
1690                         } else {
1691                                 if (num < minnum) {
1692                                         minnode = i;
1693                                         minnum  = num;
1694                                 }
1695                         }
1696                 }
1697                 if (maxnode == -1) {
1698                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1699                                 ctdb_addr_to_str(&tmp_ip->addr)));
1700
1701                         continue;
1702                 }
1703
1704                 /* if the spread between the smallest and largest coverage by
1705                    a node is >=2 we steal one of the ips from the node with
1706                    most coverage to even things out a bit.
1707                    try to do this a limited number of times since we dont
1708                    want to spend too much time balancing the ip coverage.
1709                 */
1710                 if ( (maxnum > minnum+1)
1711                      && (retries < (num_ips + 5)) ){
1712                         struct ctdb_public_ip_list *tmp;
1713
1714                         /* Reassign one of maxnode's VNNs */
1715                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1716                                 if (tmp->pnn == maxnode) {
1717                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1718                                         retries++;
1719                                         goto try_again;;
1720                                 }
1721                         }
1722                 }
1723         }
1724 }
1725
1726 static void lcp2_init(struct ctdb_context *tmp_ctx,
1727                       struct ctdb_ipflags *ipflags,
1728                       struct ctdb_public_ip_list *all_ips,
1729                       uint32_t *force_rebalance_nodes,
1730                       uint32_t **lcp2_imbalances,
1731                       bool **rebalance_candidates)
1732 {
1733         int i, numnodes;
1734         struct ctdb_public_ip_list *tmp_ip;
1735
1736         numnodes = talloc_array_length(ipflags);
1737
1738         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1739         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1740         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1741         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1742
1743         for (i=0; i<numnodes; i++) {
1744                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1745                 /* First step: assume all nodes are candidates */
1746                 (*rebalance_candidates)[i] = true;
1747         }
1748
1749         /* 2nd step: if a node has IPs assigned then it must have been
1750          * healthy before, so we remove it from consideration.  This
1751          * is overkill but is all we have because we don't maintain
1752          * state between takeover runs.  An alternative would be to
1753          * keep state and invalidate it every time the recovery master
1754          * changes.
1755          */
1756         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1757                 if (tmp_ip->pnn != -1) {
1758                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1759                 }
1760         }
1761
1762         /* 3rd step: if a node is forced to re-balance then
1763            we allow failback onto the node */
1764         if (force_rebalance_nodes == NULL) {
1765                 return;
1766         }
1767         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1768                 uint32_t pnn = force_rebalance_nodes[i];
1769                 if (pnn >= numnodes) {
1770                         DEBUG(DEBUG_ERR,
1771                               (__location__ "unknown node %u\n", pnn));
1772                         continue;
1773                 }
1774
1775                 DEBUG(DEBUG_NOTICE,
1776                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1777                 (*rebalance_candidates)[pnn] = true;
1778         }
1779 }
1780
1781 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1782  * the IP/node combination that will cost the least.
1783  */
1784 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1785                                      struct ctdb_ipflags *ipflags,
1786                                      struct ctdb_public_ip_list *all_ips,
1787                                      uint32_t *lcp2_imbalances)
1788 {
1789         struct ctdb_public_ip_list *tmp_ip;
1790         int dstnode, numnodes;
1791
1792         int minnode;
1793         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1794         struct ctdb_public_ip_list *minip;
1795
1796         bool should_loop = true;
1797         bool have_unassigned = true;
1798
1799         numnodes = talloc_array_length(ipflags);
1800
1801         while (have_unassigned && should_loop) {
1802                 should_loop = false;
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1806
1807                 minnode = -1;
1808                 mindsum = 0;
1809                 minip = NULL;
1810
1811                 /* loop over each unassigned ip. */
1812                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1813                         if (tmp_ip->pnn != -1) {
1814                                 continue;
1815                         }
1816
1817                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1818                                 /* only check nodes that can actually takeover this ip */
1819                                 if (!can_node_takeover_ip(ctdb, dstnode,
1820                                                           ipflags[dstnode],
1821                                                           tmp_ip)) {
1822                                         /* no it couldnt   so skip to the next node */
1823                                         continue;
1824                                 }
1825
1826                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1827                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1828                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1829                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1830                                                    dstnode,
1831                                                    dstimbl - lcp2_imbalances[dstnode]));
1832
1833
1834                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1835                                         minnode = dstnode;
1836                                         minimbl = dstimbl;
1837                                         mindsum = dstdsum;
1838                                         minip = tmp_ip;
1839                                         should_loop = true;
1840                                 }
1841                         }
1842                 }
1843
1844                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1845
1846                 /* If we found one then assign it to the given node. */
1847                 if (minnode != -1) {
1848                         minip->pnn = minnode;
1849                         lcp2_imbalances[minnode] = minimbl;
1850                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1851                                           ctdb_addr_to_str(&(minip->addr)),
1852                                           minnode,
1853                                           mindsum));
1854                 }
1855
1856                 /* There might be a better way but at least this is clear. */
1857                 have_unassigned = false;
1858                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1859                         if (tmp_ip->pnn == -1) {
1860                                 have_unassigned = true;
1861                         }
1862                 }
1863         }
1864
1865         /* We know if we have an unassigned addresses so we might as
1866          * well optimise.
1867          */
1868         if (have_unassigned) {
1869                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1870                         if (tmp_ip->pnn == -1) {
1871                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1872                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1873                         }
1874                 }
1875         }
1876 }
1877
1878 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1879  * to move IPs from, determines the best IP/destination node
1880  * combination to move from the source node.
1881  */
1882 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1883                                     struct ctdb_ipflags *ipflags,
1884                                     struct ctdb_public_ip_list *all_ips,
1885                                     int srcnode,
1886                                     uint32_t candimbl,
1887                                     uint32_t *lcp2_imbalances,
1888                                     bool *rebalance_candidates)
1889 {
1890         int dstnode, mindstnode, numnodes;
1891         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1892         uint32_t minsrcimbl, mindstimbl;
1893         struct ctdb_public_ip_list *minip;
1894         struct ctdb_public_ip_list *tmp_ip;
1895
1896         /* Find an IP and destination node that best reduces imbalance. */
1897         srcimbl = 0;
1898         minip = NULL;
1899         minsrcimbl = 0;
1900         mindstnode = -1;
1901         mindstimbl = 0;
1902
1903         numnodes = talloc_array_length(ipflags);
1904
1905         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1906         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1907
1908         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1909                 /* Only consider addresses on srcnode. */
1910                 if (tmp_ip->pnn != srcnode) {
1911                         continue;
1912                 }
1913
1914                 /* What is this IP address costing the source node? */
1915                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1916                 srcimbl = candimbl - srcdsum;
1917
1918                 /* Consider this IP address would cost each potential
1919                  * destination node.  Destination nodes are limited to
1920                  * those that are newly healthy, since we don't want
1921                  * to do gratuitous failover of IPs just to make minor
1922                  * balance improvements.
1923                  */
1924                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1925                         if (!rebalance_candidates[dstnode]) {
1926                                 continue;
1927                         }
1928
1929                         /* only check nodes that can actually takeover this ip */
1930                         if (!can_node_takeover_ip(ctdb, dstnode,
1931                                                   ipflags[dstnode], tmp_ip)) {
1932                                 /* no it couldnt   so skip to the next node */
1933                                 continue;
1934                         }
1935
1936                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1937                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1938                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1939                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1940                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1941                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1942
1943                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1944                             ((mindstnode == -1) ||                              \
1945                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1946
1947                                 minip = tmp_ip;
1948                                 minsrcimbl = srcimbl;
1949                                 mindstnode = dstnode;
1950                                 mindstimbl = dstimbl;
1951                         }
1952                 }
1953         }
1954         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1955
1956         if (mindstnode != -1) {
1957                 /* We found a move that makes things better... */
1958                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1959                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1960                                   ctdb_addr_to_str(&(minip->addr)),
1961                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1962
1963
1964                 lcp2_imbalances[srcnode] = srcimbl;
1965                 lcp2_imbalances[mindstnode] = mindstimbl;
1966                 minip->pnn = mindstnode;
1967
1968                 return true;
1969         }
1970
1971         return false;
1972         
1973 }
1974
/* Pairs a node number with its LCP2 imbalance so the two can be
 * sorted together.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders by
 * descending imbalance: returns -1 when a has the larger imbalance,
 * 1 when b has, and 0 when they are equal.  The pnn field is ignored.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1993
1994 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1995  * node with the highest LCP2 imbalance, and then determines the best
1996  * IP/destination node combination to move from the source node.
1997  */
1998 static void lcp2_failback(struct ctdb_context *ctdb,
1999                           struct ctdb_ipflags *ipflags,
2000                           struct ctdb_public_ip_list *all_ips,
2001                           uint32_t *lcp2_imbalances,
2002                           bool *rebalance_candidates)
2003 {
2004         int i, num_rebalance_candidates, numnodes;
2005         struct lcp2_imbalance_pnn * lips;
2006         bool again;
2007
2008         numnodes = talloc_array_length(ipflags);
2009
2010 try_again:
2011
2012         /* It is only worth continuing if we have suitable target
2013          * nodes to transfer IPs to.  This check is much cheaper than
2014          * continuing on...
2015          */
2016         num_rebalance_candidates = 0;
2017         for (i=0; i<numnodes; i++) {
2018                 if (rebalance_candidates[i]) {
2019                         num_rebalance_candidates++;
2020                 }
2021         }
2022         if (num_rebalance_candidates == 0) {
2023                 return;
2024         }
2025
2026         /* Put the imbalances and nodes into an array, sort them and
2027          * iterate through candidates.  Usually the 1st one will be
2028          * used, so this doesn't cost much...
2029          */
2030         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2031         for (i=0; i<numnodes; i++) {
2032                 lips[i].imbalance = lcp2_imbalances[i];
2033                 lips[i].pnn = i;
2034         }
2035         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2036               lcp2_cmp_imbalance_pnn);
2037
2038         again = false;
2039         for (i=0; i<numnodes; i++) {
2040                 /* This means that all nodes had 0 or 1 addresses, so
2041                  * can't be imbalanced.
2042                  */
2043                 if (lips[i].imbalance == 0) {
2044                         break;
2045                 }
2046
2047                 if (lcp2_failback_candidate(ctdb,
2048                                             ipflags,
2049                                             all_ips,
2050                                             lips[i].pnn,
2051                                             lips[i].imbalance,
2052                                             lcp2_imbalances,
2053                                             rebalance_candidates)) {
2054                         again = true;
2055                         break;
2056                 }
2057         }
2058
2059         talloc_free(lips);
2060         if (again) {
2061                 goto try_again;
2062         }
2063 }
2064
2065 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2066                                     struct ctdb_ipflags *ipflags,
2067                                     struct ctdb_public_ip_list *all_ips)
2068 {
2069         struct ctdb_public_ip_list *tmp_ip;
2070
2071         /* verify that the assigned nodes can serve that public ip
2072            and set it to -1 if not
2073         */
2074         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2075                 if (tmp_ip->pnn == -1) {
2076                         continue;
2077                 }
2078                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2079                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2080                         /* this node can not serve this ip. */
2081                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2082                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2083                                            tmp_ip->pnn));
2084                         tmp_ip->pnn = -1;
2085                 }
2086         }
2087 }
2088
2089 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2090                                        struct ctdb_ipflags *ipflags,
2091                                        struct ctdb_public_ip_list *all_ips)
2092 {
2093         struct ctdb_public_ip_list *tmp_ip;
2094         int i, numnodes;
2095
2096         numnodes = talloc_array_length(ipflags);
2097
2098         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2099        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2100         *  always be allocated the same way for a specific set of
2101         *  available/unavailable nodes.
2102         */
2103
2104         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2105                 tmp_ip->pnn = i % numnodes;
2106         }
2107
2108         /* IP failback doesn't make sense with deterministic
2109          * IPs, since the modulo step above implicitly fails
2110          * back IPs to their "home" node.
2111          */
2112         if (1 == ctdb->tunable.no_ip_failback) {
2113                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2114         }
2115
2116         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2117
2118         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2119
2120         /* No failback here! */
2121 }
2122
2123 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2124                                           struct ctdb_ipflags *ipflags,
2125                                           struct ctdb_public_ip_list *all_ips)
2126 {
2127         /* This should be pushed down into basic_failback. */
2128         struct ctdb_public_ip_list *tmp_ip;
2129         int num_ips = 0;
2130         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2131                 num_ips++;
2132         }
2133
2134         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2135
2136         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2137
2138         /* If we don't want IPs to fail back then don't rebalance IPs. */
2139         if (1 == ctdb->tunable.no_ip_failback) {
2140                 return;
2141         }
2142
2143         /* Now, try to make sure the ip adresses are evenly distributed
2144            across the nodes.
2145         */
2146         basic_failback(ctdb, ipflags, all_ips, num_ips);
2147 }
2148
2149 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2150                           struct ctdb_ipflags *ipflags,
2151                           struct ctdb_public_ip_list *all_ips,
2152                           uint32_t *force_rebalance_nodes)
2153 {
2154         uint32_t *lcp2_imbalances;
2155         bool *rebalance_candidates;
2156
2157         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2158
2159         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2160
2161         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2162                   &lcp2_imbalances, &rebalance_candidates);
2163
2164         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2165
2166         /* If we don't want IPs to fail back then don't rebalance IPs. */
2167         if (1 == ctdb->tunable.no_ip_failback) {
2168                 goto finished;
2169         }
2170
2171         /* Now, try to make sure the ip adresses are evenly distributed
2172            across the nodes.
2173         */
2174         lcp2_failback(ctdb, ipflags, all_ips,
2175                       lcp2_imbalances, rebalance_candidates);
2176
2177 finished:
2178         talloc_free(tmp_ctx);
2179 }
2180
2181 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2182 {
2183         int i, num_healthy;
2184
2185         /* Count how many completely healthy nodes we have */
2186         num_healthy = 0;
2187         for (i=0;i<nodemap->num;i++) {
2188                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2189                         num_healthy++;
2190                 }
2191         }
2192
2193         return num_healthy == 0;
2194 }
2195
2196 /* The calculation part of the IP allocation algorithm. */
2197 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2198                                    struct ctdb_ipflags *ipflags,
2199                                    struct ctdb_public_ip_list **all_ips_p,
2200                                    uint32_t *force_rebalance_nodes)
2201 {
2202         /* since nodes only know about those public addresses that
2203            can be served by that particular node, no single node has
2204            a full list of all public addresses that exist in the cluster.
2205            Walk over all node structures and create a merged list of
2206            all public addresses that exist in the cluster.
2207
2208            keep the tree of ips around as ctdb->ip_tree
2209         */
2210         *all_ips_p = create_merged_ip_list(ctdb);
2211
2212         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2213                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2214         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2215                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2216         } else {
2217                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2218         }
2219
2220         /* at this point ->pnn is the node which will own each IP
2221            or -1 if there is no node that can cover this ip
2222         */
2223
2224         return;
2225 }
2226
/* State shared between get_tunable_from_nodes() and its reply/failure
 * callbacks.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being queried; used in log messages. */
        const char *tunable;
        /* talloc array indexed by pnn where reply values are stored. */
        uint32_t *out;
        /* Set by the callbacks on a wrong-sized reply, a timeout or
         * an unexpected error. */
        bool fatal;
};
2232
2233 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2234                                  int32_t res, TDB_DATA outdata,
2235                                  void *callback)
2236 {
2237         struct get_tunable_callback_data *cd =
2238                 (struct get_tunable_callback_data *)callback;
2239         int size;
2240
2241         if (res != 0) {
2242                 /* Already handled in fail callback */
2243                 return;
2244         }
2245
2246         if (outdata.dsize != sizeof(uint32_t)) {
2247                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2248                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2249                                  (int)outdata.dsize));
2250                 cd->fatal = true;
2251                 return;
2252         }
2253
2254         size = talloc_array_length(cd->out);
2255         if (pnn >= size) {
2256                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2257                                  cd->tunable, pnn, size));
2258                 return;
2259         }
2260
2261                 
2262         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2263 }
2264
2265 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2266                                        int32_t res, TDB_DATA outdata,
2267                                        void *callback)
2268 {
2269         struct get_tunable_callback_data *cd =
2270                 (struct get_tunable_callback_data *)callback;
2271
2272         switch (res) {
2273         case -ETIME:
2274                 DEBUG(DEBUG_ERR,
2275                       ("Timed out getting tunable \"%s\" from node %d\n",
2276                        cd->tunable, pnn));
2277                 cd->fatal = true;
2278                 break;
2279         case -EINVAL:
2280         case -1:
2281                 DEBUG(DEBUG_WARNING,
2282                       ("Tunable \"%s\" not implemented on node %d\n",
2283                        cd->tunable, pnn));
2284                 break;
2285         default:
2286                 DEBUG(DEBUG_ERR,
2287                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2288                        cd->tunable, pnn));
2289                 cd->fatal = true;
2290         }
2291 }
2292
2293 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2294                                         TALLOC_CTX *tmp_ctx,
2295                                         struct ctdb_node_map *nodemap,
2296                                         const char *tunable,
2297                                         uint32_t default_value)
2298 {
2299         TDB_DATA data;
2300         struct ctdb_control_get_tunable *t;
2301         uint32_t *nodes;
2302         uint32_t *tvals;
2303         struct get_tunable_callback_data callback_data;
2304         int i;
2305
2306         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2307         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2308         for (i=0; i<nodemap->num; i++) {
2309                 tvals[i] = default_value;
2310         }
2311                 
2312         callback_data.out = tvals;
2313         callback_data.tunable = tunable;
2314         callback_data.fatal = false;
2315
2316         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2317         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2318         t = (struct ctdb_control_get_tunable *)data.dptr;
2319         t->length = strlen(tunable)+1;
2320         memcpy(t->name, tunable, t->length);
2321         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2322         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2323                                       nodes, 0, TAKEOVER_TIMEOUT(),
2324                                       false, data,
2325                                       get_tunable_callback,
2326                                       get_tunable_fail_callback,
2327                                       &callback_data) != 0) {
2328                 if (callback_data.fatal) {
2329                         talloc_free(tvals);
2330                         tvals = NULL;
2331                 }
2332         }
2333         talloc_free(nodes);
2334         talloc_free(data.dptr);
2335
2336         return tvals;
2337 }
2338
/* State shared between the runstate query and its reply/failure
 * callbacks.
 */
struct get_runstate_callback_data {
        /* talloc array indexed by pnn where reply values are stored. */
        enum ctdb_runstate *out;
        /* Set by the callbacks on a wrong-sized reply or a timeout. */
        bool fatal;
};
2343
2344 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2345                                   int32_t res, TDB_DATA outdata,
2346                                   void *callback_data)
2347 {
2348         struct get_runstate_callback_data *cd =
2349                 (struct get_runstate_callback_data *)callback_data;
2350         int size;
2351
2352         if (res != 0) {
2353                 /* Already handled in fail callback */
2354                 return;
2355         }
2356
2357         if (outdata.dsize != sizeof(uint32_t)) {
2358                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2359                                  pnn, (int)sizeof(uint32_t),
2360                                  (int)outdata.dsize));
2361                 cd->fatal = true;
2362                 return;
2363         }
2364
2365         size = talloc_array_length(cd->out);
2366         if (pnn >= size) {
2367                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2368                                  pnn, size));
2369                 return;
2370         }
2371
2372         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2373 }
2374
2375 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2376                                        int32_t res, TDB_DATA outdata,
2377                                        void *callback)
2378 {
2379         struct get_runstate_callback_data *cd =
2380                 (struct get_runstate_callback_data *)callback;
2381
2382         switch (res) {
2383         case -ETIME:
2384                 DEBUG(DEBUG_ERR,
2385                       ("Timed out getting runstate from node %d\n", pnn));
2386                 cd->fatal = true;
2387                 break;
2388         default:
2389                 DEBUG(DEBUG_WARNING,
2390                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2391                        pnn));
2392         }
2393 }
2394
2395 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2396                                                     TALLOC_CTX *tmp_ctx,
2397                                                     struct ctdb_node_map *nodemap,
2398                                                     enum ctdb_runstate default_value)
2399 {
2400         uint32_t *nodes;
2401         enum ctdb_runstate *rs;
2402         struct get_runstate_callback_data callback_data;
2403         int i;
2404
2405         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2406         CTDB_NO_MEMORY_NULL(ctdb, rs);
2407         for (i=0; i<nodemap->num; i++) {
2408                 rs[i] = default_value;
2409         }
2410
2411         callback_data.out = rs;
2412         callback_data.fatal = false;
2413
2414         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2415         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2416                                       nodes, 0, TAKEOVER_TIMEOUT(),
2417                                       true, tdb_null,
2418                                       get_runstate_callback,
2419                                       get_runstate_fail_callback,
2420                                       &callback_data) != 0) {
2421                 if (callback_data.fatal) {
2422                         free(rs);
2423                         rs = NULL;
2424                 }
2425         }
2426         talloc_free(nodes);
2427
2428         return rs;
2429 }
2430
2431 /* Set internal flags for IP allocation:
2432  *   Clear ip flags
2433  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2434  *   Set NOIPHOST ip flag for each INACTIVE node
2435  *   if all nodes are disabled:
2436  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2437  *   else
2438  *     Set NOIPHOST ip flags for disabled nodes
2439  */
2440 static struct ctdb_ipflags *
2441 set_ipflags_internal(struct ctdb_context *ctdb,
2442                      TALLOC_CTX *tmp_ctx,
2443                      struct ctdb_node_map *nodemap,
2444                      uint32_t *tval_noiptakeover,
2445                      uint32_t *tval_noiphostonalldisabled,
2446                      enum ctdb_runstate *runstate)
2447 {
2448         int i;
2449         struct ctdb_ipflags *ipflags;
2450
2451         /* Clear IP flags - implicit due to talloc_zero */
2452         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2453         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2454
2455         for (i=0;i<nodemap->num;i++) {
2456                 /* Can not take IPs on node with NoIPTakeover set */
2457                 if (tval_noiptakeover[i] != 0) {
2458                         ipflags[i].noiptakeover = true;
2459                 }
2460
2461                 /* Can not host IPs on node not in RUNNING state */
2462                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2463                         ipflags[i].noiphost = true;
2464                         continue;
2465                 }
2466                 /* Can not host IPs on INACTIVE node */
2467                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2468                         ipflags[i].noiphost = true;
2469                 }
2470         }
2471
2472         if (all_nodes_are_disabled(nodemap)) {
2473                 /* If all nodes are disabled, can not host IPs on node
2474                  * with NoIPHostOnAllDisabled set
2475                  */
2476                 for (i=0;i<nodemap->num;i++) {
2477                         if (tval_noiphostonalldisabled[i] != 0) {
2478                                 ipflags[i].noiphost = true;
2479                         }
2480                 }
2481         } else {
2482                 /* If some nodes are not disabled, then can not host
2483                  * IPs on DISABLED node
2484                  */
2485                 for (i=0;i<nodemap->num;i++) {
2486                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2487                                 ipflags[i].noiphost = true;
2488                         }
2489                 }
2490         }
2491
2492         return ipflags;
2493 }
2494
2495 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2496                                         TALLOC_CTX *tmp_ctx,
2497                                         struct ctdb_node_map *nodemap)
2498 {
2499         uint32_t *tval_noiptakeover;
2500         uint32_t *tval_noiphostonalldisabled;
2501         struct ctdb_ipflags *ipflags;
2502         enum ctdb_runstate *runstate;
2503
2504
2505         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2506                                                    "NoIPTakeover", 0);
2507         if (tval_noiptakeover == NULL) {
2508                 return NULL;
2509         }
2510
2511         tval_noiphostonalldisabled =
2512                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2513                                        "NoIPHostOnAllDisabled", 0);
2514         if (tval_noiphostonalldisabled == NULL) {
2515                 /* Caller frees tmp_ctx */
2516                 return NULL;
2517         }
2518
2519         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2520          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2521          * reasonable behaviour on a mixed cluster during upgrade.
2522          */
2523         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2524                                            CTDB_RUNSTATE_RUNNING);
2525         if (runstate == NULL) {
2526                 /* Caller frees tmp_ctx */
2527                 return NULL;
2528         }
2529
2530         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2531                                        tval_noiptakeover,
2532                                        tval_noiphostonalldisabled,
2533                                        runstate);
2534
2535         talloc_free(tval_noiptakeover);
2536         talloc_free(tval_noiphostonalldisabled);
2537         talloc_free(runstate);
2538
2539         return ipflags;
2540 }
2541
/* State handed to iprealloc_fail_callback() by ctdb_takeover_run() */
struct iprealloc_callback_data {
        /* Indexed by pnn; true for nodes that should be retried */
        bool *retry_nodes;
        /* Incremented each time a node is flagged for retry */
        int retry_count;
        /* Callback invoked when the control timed out */
        client_async_callback fail_callback;
        /* Opaque argument passed through to fail_callback */
        void *fail_callback_data;
        /* Used to look up the flags of the failing node */
        struct ctdb_node_map *nodemap;
};
2549
2550 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2551                                         int32_t res, TDB_DATA outdata,
2552                                         void *callback)
2553 {
2554         int numnodes;
2555         struct iprealloc_callback_data *cd =
2556                 (struct iprealloc_callback_data *)callback;
2557
2558         switch (res) {
2559         case -ETIME:
2560                 /* If the control timed out then that's a real error,
2561                  * so call the real fail callback
2562                  */
2563                 cd->fail_callback(ctdb, pnn, res, outdata,
2564                                   cd->fail_callback_data);
2565                 break;
2566         default:
2567                 /* If not a timeout then either the ipreallocated
2568                  * eventscript (or some setup) failed.  This might
2569                  * have failed because the IPREALLOCATED control isn't
2570                  * implemented - right now there is no way of knowing
2571                  * because the error codes are all folded down to -1.
2572                  * Consider retrying using EVENTSCRIPT control...
2573                  */
2574
2575                 numnodes = talloc_array_length(cd->retry_nodes);
2576                 if (pnn > numnodes) {
2577                         DEBUG(DEBUG_ERR,
2578                               ("ipreallocated failure from node %d, but only %d nodes in nodemap\n",
2579                                pnn, numnodes));
2580                         return;
2581                 }
2582
2583                 /* Can't run the "ipreallocated" event on a INACTIVE node */
2584                 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2585                         DEBUG(DEBUG_ERR,
2586                               ("ipreallocated failure from node %d, but node is inactive - not flagging a retry\n",
2587                                pnn));
2588                         return;
2589                 }
2590
2591                 DEBUG(DEBUG_WARNING,
2592                       ("ipreallocated failure from node %d, flagging retry\n",
2593                        pnn));
2594                 cd->retry_nodes[pnn] = true;
2595                 cd->retry_count++;
2596         }
2597 }
2598
/* State handed to takeover_run_fail_callback() by ctdb_takeover_run() */
struct takeover_callback_data {
        /* Indexed by position in nodemap->nodes[]; true once a failure
         * of that node has been passed on to fail_callback
         */
        bool *node_failed;
        /* Callback invoked on the first failure seen for a node */
        client_async_callback fail_callback;
        /* Opaque argument passed through to fail_callback */
        void *fail_callback_data;
        /* Used to map a failing pnn to its position in the node map */
        struct ctdb_node_map *nodemap;
};
2605
2606 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2607                                        uint32_t node_pnn, int32_t res,
2608                                        TDB_DATA outdata, void *callback_data)
2609 {
2610         struct takeover_callback_data *cd =
2611                 talloc_get_type_abort(callback_data,
2612                                       struct takeover_callback_data);
2613         int i;
2614
2615         for (i = 0; i < cd->nodemap->num; i++) {
2616                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2617                         break;
2618                 }
2619         }
2620
2621         if (i == cd->nodemap->num) {
2622                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2623                 return;
2624         }
2625
2626         if (!cd->node_failed[i]) {
2627                 cd->node_failed[i] = true;
2628                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2629                                   cd->fail_callback_data);
2630         }
2631 }
2632
2633 /*
2634   make any IP alias changes for public addresses that are necessary 
2635  */
2636 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2637                       uint32_t *force_rebalance_nodes,
2638                       client_async_callback fail_callback, void *callback_data)
2639 {
2640         int i, j, ret;
2641         struct ctdb_public_ip ip;
2642         struct ctdb_public_ipv4 ipv4;
2643         uint32_t *nodes;
2644         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2645         TDB_DATA data;
2646         struct timeval timeout;
2647         struct client_async_data *async_data;
2648         struct ctdb_client_control_state *state;
2649         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2650         struct ctdb_ipflags *ipflags;
2651         struct takeover_callback_data *takeover_data;
2652         struct iprealloc_callback_data iprealloc_data;
2653         bool *retry_data;
2654
2655         /*
2656          * ip failover is completely disabled, just send out the 
2657          * ipreallocated event.
2658          */
2659         if (ctdb->tunable.disable_ip_failover != 0) {
2660                 goto ipreallocated;
2661         }
2662
2663         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2664         if (ipflags == NULL) {
2665                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2666                 talloc_free(tmp_ctx);
2667                 return -1;
2668         }
2669
2670         ZERO_STRUCT(ip);
2671
2672         /* Do the IP reassignment calculations */
2673         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2674
2675         /* Now tell all nodes to release any public IPs should not
2676          * host.  This will be a NOOP on nodes that don't currently
2677          * hold the given IP.
2678          */
2679         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2680         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2681
2682         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2683                                                        bool, nodemap->num);
2684         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2685         takeover_data->fail_callback = fail_callback;
2686         takeover_data->fail_callback_data = callback_data;
2687         takeover_data->nodemap = nodemap;
2688
2689         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2690         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2691
2692         async_data->fail_callback = takeover_run_fail_callback;
2693         async_data->callback_data = takeover_data;
2694
2695         for (i=0;i<nodemap->num;i++) {
2696                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2697                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2698                         continue;
2699                 }
2700
2701                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2702                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2703                                 /* This node should be serving this
2704                                    vnn so dont tell it to release the ip
2705                                 */
2706                                 continue;
2707                         }
2708                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2709                                 ipv4.pnn = tmp_ip->pnn;
2710                                 ipv4.sin = tmp_ip->addr.ip;
2711
2712                                 timeout = TAKEOVER_TIMEOUT();
2713                                 data.dsize = sizeof(ipv4);
2714                                 data.dptr  = (uint8_t *)&ipv4;
2715                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2716                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2717                                                 data, async_data,
2718                                                 &timeout, NULL);
2719                         } else {
2720                                 ip.pnn  = tmp_ip->pnn;
2721                                 ip.addr = tmp_ip->addr;
2722
2723                                 timeout = TAKEOVER_TIMEOUT();
2724                                 data.dsize = sizeof(ip);
2725                                 data.dptr  = (uint8_t *)&ip;
2726                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2727                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2728                                                 data, async_data,
2729                                                 &timeout, NULL);
2730                         }
2731
2732                         if (state == NULL) {
2733                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2734                                 talloc_free(tmp_ctx);
2735                                 return -1;
2736                         }
2737                 
2738                         ctdb_client_async_add(async_data, state);
2739                 }
2740         }
2741         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2742                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2743                 talloc_free(tmp_ctx);
2744                 return -1;
2745         }
2746         talloc_free(async_data);
2747
2748
2749         /* tell all nodes to get their own IPs */
2750         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2751         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2752
2753         async_data->fail_callback = fail_callback;
2754         async_data->callback_data = callback_data;
2755
2756         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2757                 if (tmp_ip->pnn == -1) {
2758                         /* this IP won't be taken over */
2759                         continue;
2760                 }
2761
2762                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2763                         ipv4.pnn = tmp_ip->pnn;
2764                         ipv4.sin = tmp_ip->addr.ip;
2765
2766                         timeout = TAKEOVER_TIMEOUT();
2767                         data.dsize = sizeof(ipv4);
2768                         data.dptr  = (uint8_t *)&ipv4;
2769                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2770                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2771                                         data, async_data,
2772                                         &timeout, NULL);
2773                 } else {
2774                         ip.pnn  = tmp_ip->pnn;
2775                         ip.addr = tmp_ip->addr;
2776
2777                         timeout = TAKEOVER_TIMEOUT();
2778                         data.dsize = sizeof(ip);
2779                         data.dptr  = (uint8_t *)&ip;
2780                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2781                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2782                                         data, async_data,
2783                                         &timeout, NULL);
2784                 }
2785                 if (state == NULL) {
2786                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2787                         talloc_free(tmp_ctx);
2788                         return -1;
2789                 }
2790                 
2791                 ctdb_client_async_add(async_data, state);
2792         }
2793         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2794                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2795                 talloc_free(tmp_ctx);
2796                 return -1;
2797         }
2798
2799 ipreallocated:
2800         /* 
2801          * Tell all nodes to run eventscripts to process the
2802          * "ipreallocated" event.  This can do a lot of things,
2803          * including restarting services to reconfigure them if public
2804          * IPs have moved.  Once upon a time this event only used to
2805          * update natwg.
2806          */
2807         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2808         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2809         iprealloc_data.retry_nodes = retry_data;
2810         iprealloc_data.retry_count = 0;
2811         iprealloc_data.fail_callback = fail_callback;
2812         iprealloc_data.fail_callback_data = callback_data;
2813         iprealloc_data.nodemap = nodemap;
2814
2815         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2816         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2817                                         nodes, 0, TAKEOVER_TIMEOUT(),
2818                                         false, tdb_null,
2819                                         NULL, iprealloc_fail_callback,
2820                                         &iprealloc_data);
2821         if (ret != 0) {
2822                 /* If the control failed then we should retry to any
2823                  * nodes flagged by iprealloc_fail_callback using the
2824                  * EVENTSCRIPT control.  This is a best-effort at
2825                  * backward compatiblity when running a mixed cluster
2826                  * where some nodes have not yet been upgraded to
2827                  * support the IPREALLOCATED control.
2828                  */
2829                 DEBUG(DEBUG_WARNING,
2830                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2831
2832                 nodes = talloc_array(tmp_ctx, uint32_t,
2833                                      iprealloc_data.retry_count);
2834                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2835
2836                 j = 0;
2837                 for (i=0; i<nodemap->num; i++) {
2838                         if (iprealloc_data.retry_nodes[i]) {
2839                                 nodes[j] = i;
2840                                 j++;
2841                         }
2842                 }
2843
2844                 data.dptr  = discard_const("ipreallocated");
2845                 data.dsize = strlen((char *)data.dptr) + 1; 
2846                 ret = ctdb_client_async_control(ctdb,
2847                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2848                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2849                                                 false, data,
2850                                                 NULL, fail_callback,
2851                                                 callback_data);
2852                 if (ret != 0) {
2853                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2854                 }
2855         }
2856
2857         talloc_free(tmp_ctx);
2858         return ret;
2859 }
2860
2861
2862 /*
2863   destroy a ctdb_client_ip structure
2864  */
2865 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2866 {
2867         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2868                 ctdb_addr_to_str(&ip->addr),
2869                 ntohs(ip->addr.ip.sin_port),
2870                 ip->client_id));
2871
2872         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2873         return 0;
2874 }
2875
2876 /*
2877   called by a client to inform us of a TCP connection that it is managing
2878   that should tickled with an ACK when IP takeover is done
2879   we handle both the old ipv4 style of packets as well as the new ipv4/6
2880   pdus.
2881  */
2882 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2883                                 TDB_DATA indata)
2884 {
2885         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2886         struct ctdb_control_tcp *old_addr = NULL;
2887         struct ctdb_control_tcp_addr new_addr;
2888         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2889         struct ctdb_tcp_list *tcp;
2890         struct ctdb_tcp_connection t;
2891         int ret;
2892         TDB_DATA data;
2893         struct ctdb_client_ip *ip;
2894         struct ctdb_vnn *vnn;
2895         ctdb_sock_addr addr;
2896
2897         switch (indata.dsize) {
2898         case sizeof(struct ctdb_control_tcp):
2899                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2900                 ZERO_STRUCT(new_addr);
2901                 tcp_sock = &new_addr;
2902                 tcp_sock->src.ip  = old_addr->src;
2903                 tcp_sock->dest.ip = old_addr->dest;
2904                 break;
2905         case sizeof(struct ctdb_control_tcp_addr):
2906                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2907                 break;
2908         default:
2909                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2910                                  "to ctdb_control_tcp_client. size was %d but "
2911                                  "only allowed sizes are %lu and %lu\n",
2912                                  (int)indata.dsize,
2913                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2914                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2915                 return -1;
2916         }
2917
2918         addr = tcp_sock->src;
2919         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2920         addr = tcp_sock->dest;
2921         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2922
2923         ZERO_STRUCT(addr);
2924         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2925         vnn = find_public_ip_vnn(ctdb, &addr);
2926         if (vnn == NULL) {
2927                 switch (addr.sa.sa_family) {
2928                 case AF_INET:
2929                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2930                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2931                                         ctdb_addr_to_str(&addr)));
2932                         }
2933                         break;
2934                 case AF_INET6:
2935                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2936                                 ctdb_addr_to_str(&addr)));
2937                         break;
2938                 default:
2939                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2940                 }
2941
2942                 return 0;
2943         }
2944
2945         if (vnn->pnn != ctdb->pnn) {
2946                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2947                         ctdb_addr_to_str(&addr),
2948                         client_id, client->pid));
2949                 /* failing this call will tell smbd to die */
2950                 return -1;
2951         }
2952
2953         ip = talloc(client, struct ctdb_client_ip);
2954         CTDB_NO_MEMORY(ctdb, ip);
2955
2956         ip->ctdb      = ctdb;
2957         ip->addr      = addr;
2958         ip->client_id = client_id;
2959         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2960         DLIST_ADD(ctdb->client_ip_list, ip);
2961
2962         tcp = talloc(client, struct ctdb_tcp_list);
2963         CTDB_NO_MEMORY(ctdb, tcp);
2964
2965         tcp->connection.src_addr = tcp_sock->src;
2966         tcp->connection.dst_addr = tcp_sock->dest;
2967
2968         DLIST_ADD(client->tcp_list, tcp);
2969
2970         t.src_addr = tcp_sock->src;
2971         t.dst_addr = tcp_sock->dest;
2972
2973         data.dptr = (uint8_t *)&t;
2974         data.dsize = sizeof(t);
2975
2976         switch (addr.sa.sa_family) {
2977         case AF_INET:
2978                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2979                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2980                         ctdb_addr_to_str(&tcp_sock->src),
2981                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2982                 break;
2983         case AF_INET6:
2984                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2985                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2986                         ctdb_addr_to_str(&tcp_sock->src),
2987                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2988                 break;
2989         default:
2990                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2991         }
2992
2993
2994         /* tell all nodes about this tcp connection */
2995         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2996                                        CTDB_CONTROL_TCP_ADD,
2997                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2998         if (ret != 0) {
2999                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3000                 return -1;
3001         }
3002
3003         return 0;
3004 }
3005
3006 /*
3007   find a tcp address on a list
3008  */
3009 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3010                                            struct ctdb_tcp_connection *tcp)
3011 {
3012         int i;
3013
3014         if (array == NULL) {
3015                 return NULL;
3016         }
3017
3018         for (i=0;i<array->num;i++) {
3019                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3020                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3021                         return &array->connections[i];
3022                 }
3023         }
3024         return NULL;
3025 }
3026
3027
3028
3029 /*
3030   called by a daemon to inform us of a TCP connection that one of its
3031   clients managing that should tickled with an ACK when IP takeover is
3032   done
3033  */
3034 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3035 {
3036         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3037         struct ctdb_tcp_array *tcparray;
3038         struct ctdb_tcp_connection tcp;
3039         struct ctdb_vnn *vnn;
3040
3041         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3042         if (vnn == NULL) {
3043                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3044                         ctdb_addr_to_str(&p->dst_addr)));
3045
3046                 return -1;
3047         }
3048
3049
3050         tcparray = vnn->tcp_array;
3051
3052         /* If this is the first tickle */
3053         if (tcparray == NULL) {
3054                 tcparray = talloc_size(ctdb->nodes, 
3055                         offsetof(struct ctdb_tcp_array, connections) +
3056                         sizeof(struct ctdb_tcp_connection) * 1);
3057                 CTDB_NO_MEMORY(ctdb, tcparray);
3058                 vnn->tcp_array = tcparray;
3059
3060                 tcparray->num = 0;
3061                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3062                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3063
3064                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3065                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3066                 tcparray->num++;
3067
3068                 if (tcp_update_needed) {
3069                         vnn->tcp_update_needed = true;
3070                 }
3071                 return 0;
3072         }
3073
3074
3075         /* Do we already have this tickle ?*/
3076         tcp.src_addr = p->src_addr;
3077         tcp.dst_addr = p->dst_addr;
3078         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
3079                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3080                         ctdb_addr_to_str(&tcp.dst_addr),
3081                         ntohs(tcp.dst_addr.ip.sin_port),
3082                         vnn->pnn));
3083                 return 0;
3084         }
3085
3086         /* A new tickle, we must add it to the array */
3087         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3088                                         struct ctdb_tcp_connection,
3089                                         tcparray->num+1);
3090         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3091
3092         vnn->tcp_array = tcparray;
3093         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3094         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3095         tcparray->num++;
3096                                 
3097         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3098                 ctdb_addr_to_str(&tcp.dst_addr),
3099                 ntohs(tcp.dst_addr.ip.sin_port),
3100                 vnn->pnn));
3101
3102         if (tcp_update_needed) {
3103                 vnn->tcp_update_needed = true;
3104         }
3105
3106         return 0;
3107 }
3108
3109
3110 /*
3111   called by a daemon to inform us of a TCP connection that one of its
3112   clients managing that should tickled with an ACK when IP takeover is
3113   done
3114  */
3115 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3116 {
3117         struct ctdb_tcp_connection *tcpp;
3118         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3119
3120         if (vnn == NULL) {
3121                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3122                         ctdb_addr_to_str(&conn->dst_addr)));
3123                 return;
3124         }
3125
3126         /* if the array is empty we cant remove it
3127            and we dont need to do anything
3128          */
3129         if (vnn->tcp_array == NULL) {
3130                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3131                         ctdb_addr_to_str(&conn->dst_addr),
3132                         ntohs(conn->dst_addr.ip.sin_port)));
3133                 return;
3134         }
3135
3136
3137         /* See if we know this connection
3138            if we dont know this connection  then we dont need to do anything
3139          */
3140         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3141         if (tcpp == NULL) {
3142                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3143                         ctdb_addr_to_str(&conn->dst_addr),
3144                         ntohs(conn->dst_addr.ip.sin_port)));
3145                 return;
3146         }
3147
3148
3149         /* We need to remove this entry from the array.
3150            Instead of allocating a new array and copying data to it
3151            we cheat and just copy the last entry in the existing array
3152            to the entry that is to be removed and just shring the 
3153            ->num field
3154          */
3155         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3156         vnn->tcp_array->num--;
3157
3158         /* If we deleted the last entry we also need to remove the entire array
3159          */
3160         if (vnn->tcp_array->num == 0) {
3161                 talloc_free(vnn->tcp_array);
3162                 vnn->tcp_array = NULL;
3163         }               
3164
3165         vnn->tcp_update_needed = true;
3166
3167         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3168                 ctdb_addr_to_str(&conn->src_addr),
3169                 ntohs(conn->src_addr.ip.sin_port)));
3170 }
3171
3172
3173 /*
3174   called by a daemon to inform us of a TCP connection that one of its
3175   clients used are no longer needed in the tickle database
3176  */
3177 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3178 {
3179         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3180
3181         ctdb_remove_tcp_connection(ctdb, conn);
3182
3183         return 0;
3184 }
3185
3186
/*
  called when a daemon restarts.  The intent is to send all tickles
  for all public addresses we are serving to the new node immediately;
  that is not implemented yet, so this is a no-op that returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/* XXX here we should send all tickles we are serving to the new node */
	(void)ctdb;
	(void)vnn;

	return 0;
}
3196
3197
3198 /*
3199   called when a client structure goes away - hook to remove
3200   elements from the tcp_list in all daemons
3201  */
3202 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3203 {
3204         while (client->tcp_list) {
3205                 struct ctdb_tcp_list *tcp = client->tcp_list;
3206                 DLIST_REMOVE(client->tcp_list, tcp);
3207                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3208         }
3209 }
3210
3211
3212 /*
3213   release all IPs on shutdown
3214  */
3215 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3216 {
3217         struct ctdb_vnn *vnn;
3218         int count = 0;
3219
3220         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3221                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3222                         ctdb_vnn_unassign_iface(ctdb, vnn);
3223                         continue;
3224                 }
3225                 if (!vnn->iface) {
3226                         continue;
3227                 }
3228
3229                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3230                                     ctdb_addr_to_str(&vnn->public_address),
3231                                     vnn->public_netmask_bits,
3232                                     ctdb_vnn_iface_string(vnn)));
3233
3234                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3235                                   ctdb_vnn_iface_string(vnn),
3236                                   ctdb_addr_to_str(&vnn->public_address),
3237                                   vnn->public_netmask_bits);
3238                 release_kill_clients(ctdb, &vnn->public_address);
3239                 ctdb_vnn_unassign_iface(ctdb, vnn);
3240                 count++;
3241         }
3242
3243         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3244 }
3245
3246
3247 /*
3248   get list of public IPs
3249  */
3250 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3251                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3252 {
3253         int i, num, len;
3254         struct ctdb_all_public_ips *ips;
3255         struct ctdb_vnn *vnn;
3256         bool only_available = false;
3257
3258         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3259                 only_available = true;
3260         }
3261
3262         /* count how many public ip structures we have */
3263         num = 0;
3264         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3265                 num++;
3266         }
3267
3268         len = offsetof(struct ctdb_all_public_ips, ips) + 
3269                 num*sizeof(struct ctdb_public_ip);
3270         ips = talloc_zero_size(outdata, len);
3271         CTDB_NO_MEMORY(ctdb, ips);
3272
3273         i = 0;
3274         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3275                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3276                         continue;
3277                 }
3278                 ips->ips[i].pnn  = vnn->pnn;
3279                 ips->ips[i].addr = vnn->public_address;
3280                 i++;
3281         }
3282         ips->num = i;
3283         len = offsetof(struct ctdb_all_public_ips, ips) +
3284                 i*sizeof(struct ctdb_public_ip);
3285
3286         outdata->dsize = len;
3287         outdata->dptr  = (uint8_t *)ips;
3288
3289         return 0;
3290 }
3291
3292
3293 /*
3294   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3295  */
3296 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3297                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3298 {
3299         int i, num, len;
3300         struct ctdb_all_public_ipsv4 *ips;
3301         struct ctdb_vnn *vnn;
3302
3303         /* count how many public ip structures we have */
3304         num = 0;
3305         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3306                 if (vnn->public_address.sa.sa_family != AF_INET) {
3307                         continue;
3308                 }
3309                 num++;
3310         }
3311
3312         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3313                 num*sizeof(struct ctdb_public_ipv4);
3314         ips = talloc_zero_size(outdata, len);
3315         CTDB_NO_MEMORY(ctdb, ips);
3316
3317         outdata->dsize = len;
3318         outdata->dptr  = (uint8_t *)ips;
3319
3320         ips->num = num;
3321         i = 0;
3322         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3323                 if (vnn->public_address.sa.sa_family != AF_INET) {
3324                         continue;
3325                 }
3326                 ips->ips[i].pnn = vnn->pnn;
3327                 ips->ips[i].sin = vnn->public_address.ip;
3328                 i++;
3329         }
3330
3331         return 0;
3332 }
3333
3334 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3335                                         struct ctdb_req_control *c,
3336                                         TDB_DATA indata,
3337                                         TDB_DATA *outdata)
3338 {
3339         int i, num, len;
3340         ctdb_sock_addr *addr;
3341         struct ctdb_control_public_ip_info *info;
3342         struct ctdb_vnn *vnn;
3343
3344         addr = (ctdb_sock_addr *)indata.dptr;
3345
3346         vnn = find_public_ip_vnn(ctdb, addr);
3347         if (vnn == NULL) {
3348                 /* if it is not a public ip   it could be our 'single ip' */
3349                 if (ctdb->single_ip_vnn) {
3350                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3351                                 vnn = ctdb->single_ip_vnn;
3352                         }
3353                 }
3354         }
3355         if (vnn == NULL) {
3356                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3357                                  "'%s'not a public address\n",
3358                                  ctdb_addr_to_str(addr)));
3359                 return -1;
3360         }
3361
3362         /* count how many public ip structures we have */
3363         num = 0;
3364         for (;vnn->ifaces[num];) {
3365                 num++;
3366         }
3367
3368         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3369                 num*sizeof(struct ctdb_control_iface_info);
3370         info = talloc_zero_size(outdata, len);
3371         CTDB_NO_MEMORY(ctdb, info);
3372
3373         info->ip.addr = vnn->public_address;
3374         info->ip.pnn = vnn->pnn;
3375         info->active_idx = 0xFFFFFFFF;
3376
3377         for (i=0; vnn->ifaces[i]; i++) {
3378                 struct ctdb_iface *cur;
3379
3380                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3381                 if (cur == NULL) {
3382                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3383                                            vnn->ifaces[i]));
3384                         return -1;
3385                 }
3386                 if (vnn->iface == cur) {
3387                         info->active_idx = i;
3388                 }
3389                 strcpy(info->ifaces[i].name, cur->name);
3390                 info->ifaces[i].link_state = cur->link_up;
3391                 info->ifaces[i].references = cur->references;
3392         }
3393         info->num = i;
3394         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3395                 i*sizeof(struct ctdb_control_iface_info);
3396
3397         outdata->dsize = len;
3398         outdata->dptr  = (uint8_t *)info;
3399
3400         return 0;
3401 }
3402
3403 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3404                                 struct ctdb_req_control *c,
3405                                 TDB_DATA *outdata)
3406 {
3407         int i, num, len;
3408         struct ctdb_control_get_ifaces *ifaces;
3409         struct ctdb_iface *cur;
3410
3411         /* count how many public ip structures we have */
3412         num = 0;
3413         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3414                 num++;
3415         }
3416
3417         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3418                 num*sizeof(struct ctdb_control_iface_info);
3419         ifaces = talloc_zero_size(outdata, len);
3420         CTDB_NO_MEMORY(ctdb, ifaces);
3421
3422         i = 0;
3423         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3424                 strcpy(ifaces->ifaces[i].name, cur->name);
3425                 ifaces->ifaces[i].link_state = cur->link_up;
3426                 ifaces->ifaces[i].references = cur->references;
3427                 i++;
3428         }
3429         ifaces->num = i;
3430         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3431                 i*sizeof(struct ctdb_control_iface_info);
3432
3433         outdata->dsize = len;
3434         outdata->dptr  = (uint8_t *)ifaces;
3435
3436         return 0;
3437 }
3438
3439 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3440                                     struct ctdb_req_control *c,
3441                                     TDB_DATA indata)
3442 {
3443         struct ctdb_control_iface_info *info;
3444         struct ctdb_iface *iface;
3445         bool link_up = false;
3446
3447         info = (struct ctdb_control_iface_info *)indata.dptr;
3448
3449         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3450                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3451                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3452                                   len, len, info->name));
3453                 return -1;
3454         }
3455
3456         switch (info->link_state) {
3457         case 0:
3458                 link_up = false;
3459                 break;
3460         case 1:
3461                 link_up = true;
3462                 break;
3463         default:
3464                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3465                                   (unsigned int)info->link_state));
3466                 return -1;
3467         }
3468
3469         if (info->references != 0) {
3470                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3471                                   (unsigned int)info->references));
3472                 return -1;
3473         }
3474
3475         iface = ctdb_find_iface(ctdb, info->name);
3476         if (iface == NULL) {
3477                 return -1;
3478         }
3479
3480         if (link_up == iface->link_up) {
3481                 return 0;
3482         }
3483
3484         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3485               ("iface[%s] has changed it's link status %s => %s\n",
3486                iface->name,
3487                iface->link_up?"up":"down",
3488                link_up?"up":"down"));
3489
3490         iface->link_up = link_up;
3491         return 0;
3492 }
3493
3494
3495 /* 
3496    structure containing the listening socket and the list of tcp connections
3497    that the ctdb daemon is to kill
3498 */
3499 struct ctdb_kill_tcp {
3500         struct ctdb_vnn *vnn;          /* public address this kill list was created for; its ->killtcp points back here */
3501         struct ctdb_context *ctdb;     /* ctdb context, used to reach the event context and the vnn list */
3502         int capture_fd;                /* capture socket from ctdb_sys_open_capture_socket(), -1 until it is opened */
3503         struct fd_event *fde;          /* read event on capture_fd (capture_tcp_handler), NULL until registered */
3504         trbt_tree_t *connections;      /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
3505         void *private_data;            /* filled in by ctdb_sys_open_capture_socket(), passed to ctdb_sys_read_tcp_packet() */
3506 };
3507
3508 /*
3509   a tcp connection that is to be killed
3510  */
3511 struct ctdb_killtcp_con {
3512         ctdb_sock_addr src_addr;       /* source address of the connection, after ctdb_canonicalize_ip() */
3513         ctdb_sock_addr dst_addr;       /* destination address of the connection, after ctdb_canonicalize_ip() */
3514         int count;                     /* tickles sent by the periodic timer so far; the entry is dropped once this reaches 5 */
3515         struct ctdb_kill_tcp *killtcp; /* kill list this entry belongs to */
3516 };
3517
3518 /* this function is used to create a key to represent this socketpair
3519    in the killtcp tree.
3520    this key is used to insert and lookup matching socketpairs that are
3521    to be tickled and RST
3522 */
3523 #define KILLTCP_KEYLEN  10
3524 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3525 {
3526         static uint32_t key[KILLTCP_KEYLEN];
3527
3528         bzero(key, sizeof(key));
3529
3530         if (src->sa.sa_family != dst->sa.sa_family) {
3531                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3532                 return key;
3533         }
3534         
3535         switch (src->sa.sa_family) {
3536         case AF_INET:
3537                 key[0]  = dst->ip.sin_addr.s_addr;
3538                 key[1]  = src->ip.sin_addr.s_addr;
3539                 key[2]  = dst->ip.sin_port;
3540                 key[3]  = src->ip.sin_port;
3541                 break;
3542         case AF_INET6: {
3543                 uint32_t *dst6_addr32 =
3544                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3545                 uint32_t *src6_addr32 =
3546                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3547                 key[0]  = dst6_addr32[3];
3548                 key[1]  = src6_addr32[3];
3549                 key[2]  = dst6_addr32[2];
3550                 key[3]  = src6_addr32[2];
3551                 key[4]  = dst6_addr32[1];
3552                 key[5]  = src6_addr32[1];
3553                 key[6]  = dst6_addr32[0];
3554                 key[7]  = src6_addr32[0];
3555                 key[8]  = dst->ip6.sin6_port;
3556                 key[9]  = src->ip6.sin6_port;
3557                 break;
3558         }
3559         default:
3560                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3561                 return key;
3562         }
3563
3564         return key;
3565 }
3566
3567 /*
3568   called when we get a read event on the raw socket
3569  */
3570 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3571                                 uint16_t flags, void *private_data)
3572 {
3573         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3574         struct ctdb_killtcp_con *con;
3575         ctdb_sock_addr src, dst;
3576         uint32_t ack_seq, seq;
3577
3578         if (!(flags & EVENT_FD_READ)) {
3579                 return;
3580         }
3581
3582         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3583                                 killtcp->private_data,
3584                                 &src, &dst,
3585                                 &ack_seq, &seq) != 0) {
3586                 /* probably a non-tcp ACK packet */
3587                 return;
3588         }
3589
3590         /* check if we have this guy in our list of connections
3591            to kill
3592         */
3593         con = trbt_lookuparray32(killtcp->connections, 
3594                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3595         if (con == NULL) {
3596                 /* no this was some other packet we can just ignore */
3597                 return;
3598         }
3599
3600         /* This one has been tickled !
3601            now reset him and remove him from the list.
3602          */
3603         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3604                 ntohs(con->dst_addr.ip.sin_port),
3605                 ctdb_addr_to_str(&con->src_addr),
3606                 ntohs(con->src_addr.ip.sin_port)));
3607
3608         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3609         talloc_free(con);
3610 }
3611
3612
3613 /* when traversing the list of all tcp connections to send tickle acks to
3614    (so that we can capture the ack coming back and kill the connection
3615     by a RST)
3616    this callback is called for each connection we are currently trying to kill
3617 */
3618 static int tickle_connection_traverse(void *param, void *data)
3619 {
3620         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3621
3622         /* have tried too many times, just give up */
3623         if (con->count >= 5) {
3624                 /* can't delete in traverse: reparent to delete_cons */
3625                 talloc_steal(param, con);
3626                 return 0;
3627         }
3628
3629         /* othervise, try tickling it again */
3630         con->count++;
3631         ctdb_sys_send_tcp(
3632                 (ctdb_sock_addr *)&con->dst_addr,
3633                 (ctdb_sock_addr *)&con->src_addr,
3634                 0, 0, 0);
3635         return 0;
3636 }
3637
3638
3639 /* 
3640    called every second until all sentenced connections have been reset
3641  */
3642 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3643                                               struct timeval t, void *private_data)
3644 {
3645         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3646         void *delete_cons = talloc_new(NULL);
3647
3648         /* loop over all connections sending tickle ACKs */
3649         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3650
3651         /* now we've finished traverse, it's safe to do deletion. */
3652         talloc_free(delete_cons);
3653
3654         /* If there are no more connections to kill we can remove the
3655            entire killtcp structure
3656          */
3657         if ( (killtcp->connections == NULL) || 
3658              (killtcp->connections->root == NULL) ) {
3659                 talloc_free(killtcp);
3660                 return;
3661         }
3662
3663         /* try tickling them again in a seconds time
3664          */
3665         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3666                         ctdb_tickle_sentenced_connections, killtcp);
3667 }
3668
3669 /*
3670   destroy the killtcp structure
3671  */
3672 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3673 {
3674         struct ctdb_vnn *tmpvnn;
3675
3676         /* verify that this vnn is still active */
3677         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3678                 if (tmpvnn == killtcp->vnn) {
3679                         break;
3680                 }
3681         }
3682
3683         if (tmpvnn == NULL) {
3684                 return 0;
3685         }
3686
3687         if (killtcp->vnn->killtcp != killtcp) {
3688                 return 0;
3689         }
3690
3691         killtcp->vnn->killtcp = NULL;
3692
3693         return 0;
3694 }
3695
3696
/* insert callback for the killtcp tree: unconditionally use the new
   connection structure ('parm') for the node, whatever was stored
   there before ('data').

   The old structure, if there was one, is deliberately not freed
   here: it is talloc_stolen by the same node in the tree anyway and
   will be deleted when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
3708
3709 /*
3710   add a tcp socket to the list of connections we want to RST
3711  */
3712 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3713                                        ctdb_sock_addr *s,
3714                                        ctdb_sock_addr *d)
3715 {
3716         ctdb_sock_addr src, dst;
3717         struct ctdb_kill_tcp *killtcp;
3718         struct ctdb_killtcp_con *con;
3719         struct ctdb_vnn *vnn;
3720
3721         ctdb_canonicalize_ip(s, &src);
3722         ctdb_canonicalize_ip(d, &dst);
3723
3724         vnn = find_public_ip_vnn(ctdb, &dst);
3725         if (vnn == NULL) {
3726                 vnn = find_public_ip_vnn(ctdb, &src);
3727         }
3728         if (vnn == NULL) {
3729                 /* if it is not a public ip   it could be our 'single ip' */
3730                 if (ctdb->single_ip_vnn) {
3731                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3732                                 vnn = ctdb->single_ip_vnn;
3733                         }
3734                 }
3735         }
3736         if (vnn == NULL) {
3737                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3738                 return -1;
3739         }
3740
3741         killtcp = vnn->killtcp;
3742         
3743         /* If this is the first connection to kill we must allocate
3744            a new structure
3745          */
3746         if (killtcp == NULL) {
3747                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3748                 CTDB_NO_MEMORY(ctdb, killtcp);
3749
3750                 killtcp->vnn         = vnn;
3751                 killtcp->ctdb        = ctdb;
3752                 killtcp->capture_fd  = -1;
3753                 killtcp->connections = trbt_create(killtcp, 0);
3754
3755                 vnn->killtcp         = killtcp;
3756                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3757         }
3758
3759
3760
3761         /* create a structure that describes this connection we want to
3762            RST and store it in killtcp->connections
3763         */
3764         con = talloc(killtcp, struct ctdb_killtcp_con);
3765         CTDB_NO_MEMORY(ctdb, con);
3766         con->src_addr = src;
3767         con->dst_addr = dst;
3768         con->count    = 0;
3769         con->killtcp  = killtcp;
3770
3771
3772         trbt_insertarray32_callback(killtcp->connections,
3773                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3774                         add_killtcp_callback, con);
3775
3776         /* 
3777            If we dont have a socket to listen on yet we must create it
3778          */
3779         if (killtcp->capture_fd == -1) {
3780                 const char *iface = ctdb_vnn_iface_string(vnn);
3781                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3782                 if (killtcp->capture_fd == -1) {
3783                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3784                                           "socket on iface '%s' for killtcp (%s)\n",
3785                                           iface, strerror(errno)));
3786                         goto failed;
3787                 }
3788         }
3789
3790
3791         if (killtcp->fde == NULL) {
3792                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3793                                             EVENT_FD_READ,
3794                                             capture_tcp_handler, killtcp);
3795                 tevent_fd_set_auto_close(killtcp->fde);
3796
3797                 /* We also need to set up some events to tickle all these connections
3798                    until they are all reset
3799                 */
3800                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3801                                 ctdb_tickle_sentenced_connections, killtcp);
3802         }
3803
3804         /* tickle him once now */
3805         ctdb_sys_send_tcp(
3806                 &con->dst_addr,
3807                 &con->src_addr,
3808                 0, 0, 0);
3809
3810         return 0;
3811
3812 failed:
3813         talloc_free(vnn->killtcp);
3814         vnn->killtcp = NULL;
3815         return -1;
3816 }
3817
3818 /*
3819   kill a TCP connection.
3820  */
3821 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3822 {
3823         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3824
3825         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3826 }
3827
3828 /*
3829   called by a daemon to inform us of the entire list of TCP tickles for
3830   a particular public address.
3831   this control should only be sent by the node that is currently serving
3832   that public address.
3833  */
3834 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3835 {
3836         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3837         struct ctdb_tcp_array *tcparray;
3838         struct ctdb_vnn *vnn;
3839
3840         /* We must at least have tickles.num or else we cant verify the size
3841            of the received data blob
3842          */
3843         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3844                                         tickles.connections)) {
3845                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3846                 return -1;
3847         }
3848
3849         /* verify that the size of data matches what we expect */
3850         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3851                                 tickles.connections)
3852                          + sizeof(struct ctdb_tcp_connection)
3853                                  * list->tickles.num) {
3854                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3855                 return -1;
3856         }       
3857
3858         vnn = find_public_ip_vnn(ctdb, &list->addr);
3859         if (vnn == NULL) {
3860                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3861                         ctdb_addr_to_str(&list->addr)));
3862
3863                 return 1;
3864         }
3865
3866         /* remove any old ticklelist we might have */
3867         talloc_free(vnn->tcp_array);
3868         vnn->tcp_array = NULL;
3869
3870         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3871         CTDB_NO_MEMORY(ctdb, tcparray);
3872
3873         tcparray->num = list->tickles.num;
3874
3875         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3876         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3877
3878         memcpy(tcparray->connections, &list->tickles.connections[0], 
3879                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3880
3881         /* We now have a new fresh tickle list array for this vnn */
3882         vnn->tcp_array = talloc_steal(vnn, tcparray);
3883         
3884         return 0;
3885 }
3886
3887 /*
3888   called to return the full list of tickles for the puclic address associated 
3889   with the provided vnn
3890  */
3891 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3892 {
3893         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3894         struct ctdb_control_tcp_tickle_list *list;
3895         struct ctdb_tcp_array *tcparray;
3896         int num;
3897         struct ctdb_vnn *vnn;
3898
3899         vnn = find_public_ip_vnn(ctdb, addr);
3900         if (vnn == NULL) {
3901                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3902                         ctdb_addr_to_str(addr)));
3903
3904                 return 1;
3905         }
3906
3907         tcparray = vnn->tcp_array;
3908         if (tcparray) {
3909                 num = tcparray->num;
3910         } else {
3911                 num = 0;
3912         }
3913
3914         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3915                                 tickles.connections)
3916                         + sizeof(struct ctdb_tcp_connection) * num;
3917
3918         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3919         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3920         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3921
3922         list->addr = *addr;
3923         list->tickles.num = num;
3924         if (num) {
3925                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3926                         sizeof(struct ctdb_tcp_connection) * num);
3927         }
3928
3929         return 0;
3930 }
3931
3932
3933 /*
3934   set the list of all tcp tickles for a public address
3935  */
3936 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3937                               struct timeval timeout, uint32_t destnode, 
3938                               ctdb_sock_addr *addr,
3939                               struct ctdb_tcp_array *tcparray)
3940 {
3941         int ret, num;
3942         TDB_DATA data;
3943         struct ctdb_control_tcp_tickle_list *list;
3944
3945         if (tcparray) {
3946                 num = tcparray->num;
3947         } else {
3948                 num = 0;
3949         }
3950
3951         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3952                                 tickles.connections) +
3953                         sizeof(struct ctdb_tcp_connection) * num;
3954         data.dptr = talloc_size(ctdb, data.dsize);
3955         CTDB_NO_MEMORY(ctdb, data.dptr);
3956
3957         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3958         list->addr = *addr;
3959         list->tickles.num = num;
3960         if (tcparray) {
3961                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3962         }
3963
3964         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3965                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3966                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3967         if (ret != 0) {
3968                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3969                 return -1;
3970         }
3971
3972         talloc_free(data.dptr);
3973
3974         return ret;
3975 }
3976
3977
3978 /*
3979   perform tickle updates if required
3980  */
3981 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3982                                 struct timed_event *te, 
3983                                 struct timeval t, void *private_data)
3984 {
3985         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3986         int ret;
3987         struct ctdb_vnn *vnn;
3988
3989         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3990                 /* we only send out updates for public addresses that 
3991                    we have taken over
3992                  */
3993                 if (ctdb->pnn != vnn->pnn) {
3994                         continue;
3995                 }
3996                 /* We only send out the updates if we need to */
3997                 if (!vnn->tcp_update_needed) {
3998                         continue;
3999                 }
4000                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
4001                                 TAKEOVER_TIMEOUT(),
4002                                 CTDB_BROADCAST_CONNECTED,
4003                                 &vnn->public_address,
4004                                 vnn->tcp_array);
4005                 if (ret != 0) {
4006                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4007                                 ctdb_addr_to_str(&vnn->public_address)));
4008                 }
4009         }
4010
4011         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4012                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4013                              ctdb_update_tcp_tickles, ctdb);
4014 }               
4015         
4016
4017 /*
4018   start periodic update of tcp tickles
4019  */
4020 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4021 {
4022         ctdb->tickle_update_context = talloc_new(ctdb);
4023
4024         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4025                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4026                              ctdb_update_tcp_tickles, ctdb);
4027 }
4028
4029
4030
4031
/* State carried across the repeated send_gratious_arp() timer callbacks */
struct control_gratious_arp {
	/* daemon context; its event loop (ctdb->ev) runs the timers */
	struct ctdb_context *ctdb;
	/* address handed to ctdb_sys_send_arp() */
	ctdb_sock_addr addr;
	/* interface name handed to ctdb_sys_send_arp() */
	const char *iface;
	/* number of sends attempted so far; compared with CTDB_ARP_REPEAT */
	int count;
};
4038
4039 /*
4040   send a control_gratuitous arp
4041  */
4042 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4043                                   struct timeval t, void *private_data)
4044 {
4045         int ret;
4046         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4047                                                         struct control_gratious_arp);
4048
4049         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4050         if (ret != 0) {
4051                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4052                                  arp->iface, strerror(errno)));
4053         }
4054
4055
4056         arp->count++;
4057         if (arp->count == CTDB_ARP_REPEAT) {
4058                 talloc_free(arp);
4059                 return;
4060         }
4061
4062         event_add_timed(arp->ctdb->ev, arp, 
4063                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4064                         send_gratious_arp, arp);
4065 }
4066
4067
4068 /*
4069   send a gratious arp 
4070  */
4071 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4072 {
4073         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4074         struct control_gratious_arp *arp;
4075
4076         /* verify the size of indata */
4077         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4078                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4079                                  (unsigned)indata.dsize, 
4080                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4081                 return -1;
4082         }
4083         if (indata.dsize != 
4084                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4085                 + gratious_arp->len ) ){
4086
4087                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4088                         "but should be %u bytes\n", 
4089                          (unsigned)indata.dsize, 
4090                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4091                 return -1;
4092         }
4093
4094
4095         arp = talloc(ctdb, struct control_gratious_arp);
4096         CTDB_NO_MEMORY(ctdb, arp);
4097
4098         arp->ctdb  = ctdb;
4099         arp->addr   = gratious_arp->addr;
4100         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4101         CTDB_NO_MEMORY(ctdb, arp->iface);
4102         arp->count = 0;
4103         
4104         event_add_timed(arp->ctdb->ev, arp, 
4105                         timeval_zero(), send_gratious_arp, arp);
4106
4107         return 0;
4108 }
4109
4110 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4111 {
4112         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4113         int ret;
4114
4115         /* verify the size of indata */
4116         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4117                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4118                 return -1;
4119         }
4120         if (indata.dsize != 
4121                 ( offsetof(struct ctdb_control_ip_iface, iface)
4122                 + pub->len ) ){
4123
4124                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4125                         "but should be %u bytes\n", 
4126                          (unsigned)indata.dsize, 
4127                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4128                 return -1;
4129         }
4130
4131         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4132
4133         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4134
4135         if (ret != 0) {
4136                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4137                 return -1;
4138         }
4139
4140         return 0;
4141 }
4142
/*
  Completion callback for the releaseip event started by
  ctdb_control_del_public_address().

  private_data is the talloc context that was handed to the event; it
  is simply freed.  ctdb and status are not looked at.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *cleanup_ctx = private_data;

	(void)ctdb;
	(void)status;

	talloc_free(cleanup_ctx);
}
4151
4152 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4153 {
4154         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4155         struct ctdb_vnn *vnn;
4156         int ret;
4157
4158         /* verify the size of indata */
4159         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4160                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4161                 return -1;
4162         }
4163         if (indata.dsize != 
4164                 ( offsetof(struct ctdb_control_ip_iface, iface)
4165                 + pub->len ) ){
4166
4167                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4168                         "but should be %u bytes\n", 
4169                          (unsigned)indata.dsize, 
4170                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4171                 return -1;
4172         }
4173
4174         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4175
4176         /* walk over all public addresses until we find a match */
4177         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4178                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4179                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4180
4181                         DLIST_REMOVE(ctdb->vnn, vnn);
4182                         talloc_steal(mem_ctx, vnn);
4183                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
4184                         if (vnn->pnn != ctdb->pnn) {
4185                                 if (vnn->iface != NULL) {
4186                                         ctdb_vnn_unassign_iface(ctdb, vnn);
4187                                 }
4188                                 talloc_free(mem_ctx);
4189                                 return 0;
4190                         }
4191                         vnn->pnn = -1;
4192
4193                         ret = ctdb_event_script_callback(ctdb, 
4194                                          mem_ctx, delete_ip_callback, mem_ctx,
4195                                          false,
4196                                          CTDB_EVENT_RELEASE_IP,
4197                                          "%s %s %u",
4198                                          ctdb_vnn_iface_string(vnn),
4199                                          ctdb_addr_to_str(&vnn->public_address),
4200                                          vnn->public_netmask_bits);
4201                         if (vnn->iface != NULL) {
4202                                 ctdb_vnn_unassign_iface(ctdb, vnn);
4203                         }
4204                         if (ret != 0) {
4205                                 return -1;
4206                         }
4207                         return 0;
4208                 }
4209         }
4210
4211         return -1;
4212 }
4213
4214
/* State handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* control request that the callback replies to */
	struct ctdb_req_control *c;
};
4218
4219 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4220                                         int status, void *p)
4221 {
4222         struct ipreallocated_callback_state *state =
4223                 talloc_get_type(p, struct ipreallocated_callback_state);
4224
4225         if (status != 0) {
4226                 DEBUG(DEBUG_ERR,
4227                       (" \"ipreallocated\" event script failed (status %d)\n",
4228                        status));
4229                 if (status == -ETIME) {
4230                         ctdb_ban_self(ctdb);
4231                 }
4232         }
4233
4234         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4235         talloc_free(state);
4236 }
4237
4238 /* A control to run the ipreallocated event */
4239 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4240                                    struct ctdb_req_control *c,
4241                                    bool *async_reply)
4242 {
4243         int ret;
4244         struct ipreallocated_callback_state *state;
4245
4246         state = talloc(ctdb, struct ipreallocated_callback_state);
4247         CTDB_NO_MEMORY(ctdb, state);
4248
4249         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4250
4251         ret = ctdb_event_script_callback(ctdb, state,
4252                                          ctdb_ipreallocated_callback, state,
4253                                          false, CTDB_EVENT_IPREALLOCATED,
4254                                          "%s", "");
4255
4256         if (ret != 0) {
4257                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4258                 talloc_free(state);
4259                 return -1;
4260         }
4261
4262         /* tell the control that we will be reply asynchronously */
4263         state->c    = talloc_steal(state, c);
4264         *async_reply = true;
4265
4266         return 0;
4267 }
4268
4269
4270 /* This function is called from the recovery daemon to verify that a remote
4271    node has the expected ip allocation.
4272    This is verified against ctdb->ip_tree
4273 */
4274 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4275                                 struct ctdb_all_public_ips *ips,
4276                                 uint32_t pnn)
4277 {
4278         struct ctdb_public_ip_list *tmp_ip; 
4279         int i;
4280
4281         if (ctdb->ip_tree == NULL) {
4282                 /* dont know the expected allocation yet, assume remote node
4283                    is correct. */
4284                 return 0;
4285         }
4286
4287         if (ips == NULL) {
4288                 return 0;
4289         }
4290
4291         for (i=0; i<ips->num; i++) {
4292                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4293                 if (tmp_ip == NULL) {
4294                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4295                         return -1;
4296                 }
4297
4298                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4299                         continue;
4300                 }
4301
4302                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4303                         DEBUG(DEBUG_ERR,
4304                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4305                                pnn,
4306                                ctdb_addr_to_str(&ips->ips[i].addr),
4307                                ips->ips[i].pnn, tmp_ip->pnn));
4308                         return -1;
4309                 }
4310         }
4311
4312         return 0;
4313 }
4314
4315 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4316 {
4317         struct ctdb_public_ip_list *tmp_ip; 
4318
4319         if (ctdb->ip_tree == NULL) {
4320                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4321                 return -1;
4322         }
4323
4324         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4325         if (tmp_ip == NULL) {
4326                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4327                 return -1;
4328         }
4329
4330         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4331         tmp_ip->pnn = ip->pnn;
4332
4333         return 0;
4334 }
4335
4336
/* State of one reloadips request while its child process is running */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* pending control; the destructor replies to it if non-NULL */
	struct ctdb_req_control *c;
	/* status sent in the reply: starts at -1, set from the child's
	   result byte by ctdb_reloadips_child_handler() */
	int status;
	/* pipe from child to parent: the parent reads fd[0], the child
	   writes its result to fd[1] */
	int fd[2];
	/* pid of the forked child; sent SIGKILL by the destructor */
	pid_t child;
	/* fd event watching fd[0] for the child's result */
	struct fd_event *fde;
};
4345
4346 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4347 {
4348         if (h == h->ctdb->reload_ips) {
4349                 h->ctdb->reload_ips = NULL;
4350         }
4351         if (h->c != NULL) {
4352                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4353                 h->c = NULL;
4354         }
4355         ctdb_kill(h->ctdb, h->child, SIGKILL);
4356         return 0;
4357 }
4358
4359 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4360                                 struct timed_event *te,
4361                                 struct timeval t, void *private_data)
4362 {
4363         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4364
4365         talloc_free(h);
4366 }       
4367
4368 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4369                              uint16_t flags, void *private_data)
4370 {
4371         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4372
4373         char res;
4374         int ret;
4375
4376         ret = read(h->fd[0], &res, 1);
4377         if (ret < 1 || res != 0) {
4378                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4379                 res = 1;
4380         }
4381         h->status = res;
4382
4383         talloc_free(h);
4384 }
4385
4386 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4387 {
4388         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4389         struct ctdb_all_public_ips *ips;
4390         struct ctdb_vnn *vnn;
4391         struct client_async_data *async_data;
4392         struct timeval timeout;
4393         TDB_DATA data;
4394         struct ctdb_client_control_state *state;
4395         bool first_add;
4396         int i, ret;
4397
4398         CTDB_NO_MEMORY(ctdb, mem_ctx);
4399
4400         /* Read IPs from local node */
4401         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4402                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4403         if (ret != 0) {
4404                 DEBUG(DEBUG_ERR,
4405                       ("Unable to fetch public IPs from local node\n"));
4406                 talloc_free(mem_ctx);
4407                 return -1;
4408         }
4409
4410         /* Read IPs file - this is safe since this is a child process */
4411         ctdb->vnn = NULL;
4412         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4413                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4414                 talloc_free(mem_ctx);
4415                 return -1;
4416         }
4417
4418         async_data = talloc_zero(mem_ctx, struct client_async_data);
4419         CTDB_NO_MEMORY(ctdb, async_data);
4420
4421         /* Compare IPs between node and file for IPs to be deleted */
4422         for (i = 0; i < ips->num; i++) {
4423                 /* */
4424                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4425                         if (ctdb_same_ip(&vnn->public_address,
4426                                          &ips->ips[i].addr)) {
4427                                 /* IP is still in file */
4428                                 break;
4429                         }
4430                 }
4431
4432                 if (vnn == NULL) {
4433                         /* Delete IP ips->ips[i] */
4434                         struct ctdb_control_ip_iface *pub;
4435
4436                         DEBUG(DEBUG_NOTICE,
4437                               ("IP %s no longer configured, deleting it\n",
4438                                ctdb_addr_to_str(&ips->ips[i].addr)));
4439
4440                         pub = talloc_zero(mem_ctx,
4441                                           struct ctdb_control_ip_iface);
4442                         CTDB_NO_MEMORY(ctdb, pub);
4443
4444                         pub->addr  = ips->ips[i].addr;
4445                         pub->mask  = 0;
4446                         pub->len   = 0;
4447
4448                         timeout = TAKEOVER_TIMEOUT();
4449
4450                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4451                                               iface) + pub->len;
4452                         data.dptr = (uint8_t *)pub;
4453
4454                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4455                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4456                                                   0, data, async_data,
4457                                                   &timeout, NULL);
4458                         if (state == NULL) {
4459                                 DEBUG(DEBUG_ERR,
4460                                       (__location__
4461                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4462                                 goto failed;
4463                         }
4464
4465                 }
4466         }
4467
4468         /* Compare IPs between node and file for IPs to be added */
4469         first_add = true;
4470         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4471                 for (i = 0; i < ips->num; i++) {
4472                         if (ctdb_same_ip(&vnn->public_address,
4473                                          &ips->ips[i].addr)) {
4474                                 /* IP already on node */
4475                                 break;
4476                         }
4477                 }
4478                 if (i == ips->num) {
4479                         /* Add IP ips->ips[i] */
4480                         struct ctdb_control_ip_iface *pub;
4481                         const char *ifaces = NULL;
4482                         uint32_t len;
4483                         int iface = 0;
4484
4485                         DEBUG(DEBUG_NOTICE,
4486                               ("New IP %s configured, adding it\n",
4487                                ctdb_addr_to_str(&vnn->public_address)));
4488                         if (first_add) {
4489                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4490
4491                                 data.dsize = sizeof(pnn);
4492                                 data.dptr  = (uint8_t *)&pnn;
4493
4494                                 ret = ctdb_client_send_message(
4495                                         ctdb,
4496                                         CTDB_BROADCAST_CONNECTED,
4497                                         CTDB_SRVID_REBALANCE_NODE,
4498                                         data);
4499                                 if (ret != 0) {
4500                                         DEBUG(DEBUG_WARNING,
4501                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4502                                 }
4503
4504                                 first_add = false;
4505                         }
4506
4507                         ifaces = vnn->ifaces[0];
4508                         iface = 1;
4509                         while (vnn->ifaces[iface] != NULL) {
4510                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4511                                                          vnn->ifaces[iface]);
4512                                 iface++;
4513                         }
4514
4515                         len   = strlen(ifaces) + 1;
4516                         pub = talloc_zero_size(mem_ctx,
4517                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4518                         CTDB_NO_MEMORY(ctdb, pub);
4519
4520                         pub->addr  = vnn->public_address;
4521                         pub->mask  = vnn->public_netmask_bits;
4522                         pub->len   = len;
4523                         memcpy(&pub->iface[0], ifaces, pub->len);
4524
4525                         timeout = TAKEOVER_TIMEOUT();
4526
4527                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4528                                               iface) + pub->len;
4529                         data.dptr = (uint8_t *)pub;
4530
4531                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4532                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4533                                                   0, data, async_data,
4534                                                   &timeout, NULL);
4535                         if (state == NULL) {
4536                                 DEBUG(DEBUG_ERR,
4537                                       (__location__
4538                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4539                                 goto failed;
4540                         }
4541                 }
4542         }
4543
4544         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4545                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4546                 goto failed;
4547         }
4548
4549         talloc_free(mem_ctx);
4550         return 0;
4551
4552 failed:
4553         talloc_free(mem_ctx);
4554         return -1;
4555 }
4556
4557 /* This control is sent to force the node to re-read the public addresses file
4558    and drop any addresses we should nnot longer host, and add new addresses
4559    that we are now able to host
4560 */
4561 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4562 {
4563         struct ctdb_reloadips_handle *h;
4564         pid_t parent = getpid();
4565
4566         if (ctdb->reload_ips != NULL) {
4567                 talloc_free(ctdb->reload_ips);
4568                 ctdb->reload_ips = NULL;
4569         }
4570
4571         h = talloc(ctdb, struct ctdb_reloadips_handle);
4572         CTDB_NO_MEMORY(ctdb, h);
4573         h->ctdb     = ctdb;
4574         h->c        = NULL;
4575         h->status   = -1;
4576         
4577         if (pipe(h->fd) == -1) {
4578                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4579                 talloc_free(h);
4580                 return -1;
4581         }
4582
4583         h->child = ctdb_fork(ctdb);
4584         if (h->child == (pid_t)-1) {
4585                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4586                 close(h->fd[0]);
4587                 close(h->fd[1]);
4588                 talloc_free(h);
4589                 return -1;
4590         }
4591
4592         /* child process */
4593         if (h->child == 0) {
4594                 signed char res = 0;
4595
4596                 close(h->fd[0]);
4597                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4598
4599                 ctdb_set_process_name("ctdb_reloadips");
4600                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4601                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4602                         res = -1;
4603                 } else {
4604                         res = ctdb_reloadips_child(ctdb);
4605                         if (res != 0) {
4606                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4607                         }
4608                 }
4609
4610                 write(h->fd[1], &res, 1);
4611                 /* make sure we die when our parent dies */
4612                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4613                         sleep(5);
4614                 }
4615                 _exit(0);
4616         }
4617
4618         h->c             = talloc_steal(h, c);
4619
4620         close(h->fd[1]);
4621         set_close_on_exec(h->fd[0]);
4622
4623         talloc_set_destructor(h, ctdb_reloadips_destructor);
4624
4625
4626         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4627                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4628                         (void *)h);
4629         tevent_fd_set_auto_close(h->fde);
4630
4631         event_add_timed(ctdb->ev, h,
4632                         timeval_current_ofs(120, 0),
4633                         ctdb_reloadips_timeout_event, h);
4634
4635         /* we reply later */
4636         *async_reply = true;
4637         return 0;
4638 }