ctdb-ipalloc: Tidy up create_merged_ip_list()
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/*
  Absolute deadline built from the takeover_timeout tunable (passed as
  the first argument of timeval_current_ofs(), presumably seconds).
  Needs a variable called 'ctdb' in scope at the point of use.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
  Gratuitous ARP schedule used by ctdb_control_send_arp(): the delay
  between rounds and the number of rounds sent before giving up.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
49
/*
  Flags used in IP allocation algorithms.
 */
struct ctdb_ipflags {
        /* NOTE(review): presumably "node may not take over IPs" - confirm */
        bool noiptakeover;
        /* NOTE(review): presumably "node may not host IPs" - confirm */
        bool noiphost;
};
55
/*
  Selects which IP allocation algorithm is used (see
  struct ipalloc_state.algorithm).
 */
enum ipalloc_algorithm {
        IPALLOC_DETERMINISTIC    = 0,
        IPALLOC_NONDETERMINISTIC = 1,
        IPALLOC_LCP2             = 2,
};
61
62 struct ipalloc_state {
63         uint32_t num;
64
65         /* Arrays with data for each node */
66         struct ctdb_public_ip_list_old **known_public_ips;
67         struct ctdb_public_ip_list_old **available_public_ips;
68
69         enum ipalloc_algorithm algorithm;
70         uint32_t no_ip_failback;
71 };
72
/*
  One local network interface that public addresses can be placed on.
  Entries are kept in the doubly linked list ctdb->ifaces.
 */
struct ctdb_interface {
        /* List linkage (managed through the DLIST_* macros) */
        struct ctdb_interface *prev;
        struct ctdb_interface *next;
        /* Interface name; this is what lookups compare with strcmp() */
        const char *name;
        /* Interfaces whose link is down are never picked for an address */
        bool link_up;
        /* Number of public addresses currently assigned to this interface */
        uint32_t references;
};
79
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
81 {
82         if (vnn->iface) {
83                 return vnn->iface->name;
84         }
85
86         return "__none__";
87 }
88
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
90 {
91         struct ctdb_interface *i;
92
93         /* Verify that we don't have an entry for this ip yet */
94         for (i=ctdb->ifaces;i;i=i->next) {
95                 if (strcmp(i->name, iface) == 0) {
96                         return 0;
97                 }
98         }
99
100         /* create a new structure for this interface */
101         i = talloc_zero(ctdb, struct ctdb_interface);
102         CTDB_NO_MEMORY_FATAL(ctdb, i);
103         i->name = talloc_strdup(i, iface);
104         CTDB_NO_MEMORY(ctdb, i->name);
105
106         i->link_up = true;
107
108         DLIST_ADD(ctdb->ifaces, i);
109
110         return 0;
111 }
112
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
114                                         const char *name)
115 {
116         int n;
117
118         for (n = 0; vnn->ifaces[n] != NULL; n++) {
119                 if (strcmp(name, vnn->ifaces[n]) == 0) {
120                         return true;
121                 }
122         }
123
124         return false;
125 }
126
127 /* If any interfaces now have no possible IPs then delete them.  This
128  * implementation is naive (i.e. simple) rather than clever
129  * (i.e. complex).  Given that this is run on delip and that operation
130  * is rare, this doesn't need to be efficient - it needs to be
131  * foolproof.  One alternative is reference counting, where the logic
132  * is distributed and can, therefore, be broken in multiple places.
133  * Another alternative is to build a red-black tree of interfaces that
134  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135  * once) and then walking ctdb->ifaces once and deleting those not in
136  * the tree.  Let's go to one of those if the naive implementation
137  * causes problems...  :-)
138  */
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140                                         struct ctdb_vnn *vnn)
141 {
142         struct ctdb_interface *i, *next;
143
144         /* For each interface, check if there's an IP using it. */
145         for (i = ctdb->ifaces; i != NULL; i = next) {
146                 struct ctdb_vnn *tv;
147                 bool found;
148                 next = i->next;
149
150                 /* Only consider interfaces named in the given VNN. */
151                 if (!vnn_has_interface_with_name(vnn, i->name)) {
152                         continue;
153                 }
154
155                 /* Is the "single IP" on this interface? */
156                 if ((ctdb->single_ip_vnn != NULL) &&
157                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159                         /* Found, next interface please... */
160                         continue;
161                 }
162                 /* Search for a vnn with this interface. */
163                 found = false;
164                 for (tv=ctdb->vnn; tv; tv=tv->next) {
165                         if (vnn_has_interface_with_name(tv, i->name)) {
166                                 found = true;
167                                 break;
168                         }
169                 }
170
171                 if (!found) {
172                         /* None of the VNNs are using this interface. */
173                         DLIST_REMOVE(ctdb->ifaces, i);
174                         talloc_free(i);
175                 }
176         }
177 }
178
179
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
181                                               const char *iface)
182 {
183         struct ctdb_interface *i;
184
185         for (i=ctdb->ifaces;i;i=i->next) {
186                 if (strcmp(i->name, iface) == 0) {
187                         return i;
188                 }
189         }
190
191         return NULL;
192 }
193
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195                                                   struct ctdb_vnn *vnn)
196 {
197         int i;
198         struct ctdb_interface *cur = NULL;
199         struct ctdb_interface *best = NULL;
200
201         for (i=0; vnn->ifaces[i]; i++) {
202
203                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
204                 if (cur == NULL) {
205                         continue;
206                 }
207
208                 if (!cur->link_up) {
209                         continue;
210                 }
211
212                 if (best == NULL) {
213                         best = cur;
214                         continue;
215                 }
216
217                 if (cur->references < best->references) {
218                         best = cur;
219                         continue;
220                 }
221         }
222
223         return best;
224 }
225
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227                                      struct ctdb_vnn *vnn)
228 {
229         struct ctdb_interface *best = NULL;
230
231         if (vnn->iface) {
232                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                                    "still assigned to iface '%s'\n",
234                                    ctdb_addr_to_str(&vnn->public_address),
235                                    ctdb_vnn_iface_string(vnn)));
236                 return 0;
237         }
238
239         best = ctdb_vnn_best_iface(ctdb, vnn);
240         if (best == NULL) {
241                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242                                   "cannot assign to iface any iface\n",
243                                   ctdb_addr_to_str(&vnn->public_address)));
244                 return -1;
245         }
246
247         vnn->iface = best;
248         best->references++;
249         vnn->pnn = ctdb->pnn;
250
251         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252                            "now assigned to iface '%s' refs[%d]\n",
253                            ctdb_addr_to_str(&vnn->public_address),
254                            ctdb_vnn_iface_string(vnn),
255                            best->references));
256         return 0;
257 }
258
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260                                     struct ctdb_vnn *vnn)
261 {
262         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263                            "now unassigned (old iface '%s' refs[%d])\n",
264                            ctdb_addr_to_str(&vnn->public_address),
265                            ctdb_vnn_iface_string(vnn),
266                            vnn->iface?vnn->iface->references:0));
267         if (vnn->iface) {
268                 vnn->iface->references--;
269         }
270         vnn->iface = NULL;
271         if (vnn->pnn == ctdb->pnn) {
272                 vnn->pnn = -1;
273         }
274 }
275
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277                                struct ctdb_vnn *vnn)
278 {
279         int i;
280
281         /* Nodes that are not RUNNING can not host IPs */
282         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
283                 return false;
284         }
285
286         if (vnn->delete_pending) {
287                 return false;
288         }
289
290         if (vnn->iface && vnn->iface->link_up) {
291                 return true;
292         }
293
294         for (i=0; vnn->ifaces[i]; i++) {
295                 struct ctdb_interface *cur;
296
297                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
298                 if (cur == NULL) {
299                         continue;
300                 }
301
302                 if (cur->link_up) {
303                         return true;
304                 }
305         }
306
307         return false;
308 }
309
310 struct ctdb_takeover_arp {
311         struct ctdb_context *ctdb;
312         uint32_t count;
313         ctdb_sock_addr addr;
314         struct ctdb_tcp_array *tcparray;
315         struct ctdb_vnn *vnn;
316 };
317
318
319 /*
320   lists of tcp endpoints
321  */
322 struct ctdb_tcp_list {
323         struct ctdb_tcp_list *prev, *next;
324         struct ctdb_connection connection;
325 };
326
327 /*
328   list of clients to kill on IP release
329  */
330 struct ctdb_client_ip {
331         struct ctdb_client_ip *prev, *next;
332         struct ctdb_context *ctdb;
333         ctdb_sock_addr addr;
334         uint32_t client_id;
335 };
336
337
338 /*
339   send a gratuitous arp
340  */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342                                   struct tevent_timer *te,
343                                   struct timeval t, void *private_data)
344 {
345         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
346                                                         struct ctdb_takeover_arp);
347         int i, ret;
348         struct ctdb_tcp_array *tcparray;
349         const char *iface = ctdb_vnn_iface_string(arp->vnn);
350
351         ret = ctdb_sys_send_arp(&arp->addr, iface);
352         if (ret != 0) {
353                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354                                   iface, strerror(errno)));
355         }
356
357         tcparray = arp->tcparray;
358         if (tcparray) {
359                 for (i=0;i<tcparray->num;i++) {
360                         struct ctdb_connection *tcon;
361
362                         tcon = &tcparray->connections[i];
363                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
365                                 ctdb_addr_to_str(&tcon->src),
366                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
367                         ret = ctdb_sys_send_tcp(
368                                 &tcon->src,
369                                 &tcon->dst,
370                                 0, 0, 0);
371                         if (ret != 0) {
372                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373                                         ctdb_addr_to_str(&tcon->src)));
374                         }
375                 }
376         }
377
378         arp->count++;
379
380         if (arp->count == CTDB_ARP_REPEAT) {
381                 talloc_free(arp);
382                 return;
383         }
384
385         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387                          ctdb_control_send_arp, arp);
388 }
389
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391                                        struct ctdb_vnn *vnn)
392 {
393         struct ctdb_takeover_arp *arp;
394         struct ctdb_tcp_array *tcparray;
395
396         if (!vnn->takeover_ctx) {
397                 vnn->takeover_ctx = talloc_new(vnn);
398                 if (!vnn->takeover_ctx) {
399                         return -1;
400                 }
401         }
402
403         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
404         if (!arp) {
405                 return -1;
406         }
407
408         arp->ctdb = ctdb;
409         arp->addr = vnn->public_address;
410         arp->vnn  = vnn;
411
412         tcparray = vnn->tcp_array;
413         if (tcparray) {
414                 /* add all of the known tcp connections for this IP to the
415                    list of tcp connections to send tickle acks for */
416                 arp->tcparray = talloc_steal(arp, tcparray);
417
418                 vnn->tcp_array = NULL;
419                 vnn->tcp_update_needed = true;
420         }
421
422         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423                          timeval_zero(), ctdb_control_send_arp, arp);
424
425         return 0;
426 }
427
428 struct takeover_callback_state {
429         struct ctdb_req_control_old *c;
430         ctdb_sock_addr *addr;
431         struct ctdb_vnn *vnn;
432 };
433
/*
  State carried from ctdb_do_takeip() to ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
        /* The TAKEOVER_IP control request to reply to */
        struct ctdb_req_control_old *c;
        /* The VNN whose address is being taken */
        struct ctdb_vnn *vnn;
};
438
439 /*
440   called when takeip event finishes
441  */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
443                                     void *private_data)
444 {
445         struct ctdb_do_takeip_state *state =
446                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
447         int32_t ret;
448         TDB_DATA data;
449
450         if (status != 0) {
451                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
452         
453                 if (status == -ETIME) {
454                         ctdb_ban_self(ctdb);
455                 }
456                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457                                  ctdb_addr_to_str(&state->vnn->public_address),
458                                  ctdb_vnn_iface_string(state->vnn)));
459                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
460
461                 node->flags |= NODE_FLAGS_UNHEALTHY;
462                 talloc_free(state);
463                 return;
464         }
465
466         if (ctdb->do_checkpublicip) {
467
468         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469         if (ret != 0) {
470                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471                 talloc_free(state);
472                 return;
473         }
474
475         }
476
477         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478         data.dsize = strlen((char *)data.dptr) + 1;
479         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
480
481         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
482
483
484         /* the control succeeded */
485         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
486         talloc_free(state);
487         return;
488 }
489
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
491 {
492         state->vnn->update_in_flight = false;
493         return 0;
494 }
495
496 /*
497   take over an ip address
498  */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500                               struct ctdb_req_control_old *c,
501                               struct ctdb_vnn *vnn)
502 {
503         int ret;
504         struct ctdb_do_takeip_state *state;
505
506         if (vnn->update_in_flight) {
507                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508                                     "update for this IP already in flight\n",
509                                     ctdb_addr_to_str(&vnn->public_address),
510                                     vnn->public_netmask_bits));
511                 return -1;
512         }
513
514         ret = ctdb_vnn_assign_iface(ctdb, vnn);
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517                                  "assign a usable interface\n",
518                                  ctdb_addr_to_str(&vnn->public_address),
519                                  vnn->public_netmask_bits));
520                 return -1;
521         }
522
523         state = talloc(vnn, struct ctdb_do_takeip_state);
524         CTDB_NO_MEMORY(ctdb, state);
525
526         state->c = talloc_steal(ctdb, c);
527         state->vnn   = vnn;
528
529         vnn->update_in_flight = true;
530         talloc_set_destructor(state, ctdb_takeip_destructor);
531
532         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533                             ctdb_addr_to_str(&vnn->public_address),
534                             vnn->public_netmask_bits,
535                             ctdb_vnn_iface_string(vnn)));
536
537         ret = ctdb_event_script_callback(ctdb,
538                                          state,
539                                          ctdb_do_takeip_callback,
540                                          state,
541                                          CTDB_EVENT_TAKE_IP,
542                                          "%s %s %u",
543                                          ctdb_vnn_iface_string(vnn),
544                                          ctdb_addr_to_str(&vnn->public_address),
545                                          vnn->public_netmask_bits);
546
547         if (ret != 0) {
548                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549                         ctdb_addr_to_str(&vnn->public_address),
550                         ctdb_vnn_iface_string(vnn)));
551                 talloc_free(state);
552                 return -1;
553         }
554
555         return 0;
556 }
557
/*
  State carried from ctdb_do_updateip() to ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
        /* The control request to reply to */
        struct ctdb_req_control_old *c;
        /* Interface the address was on before the move; restored if
         * the move fails */
        struct ctdb_interface *old;
        /* The VNN whose address is being moved */
        struct ctdb_vnn *vnn;
};
563
564 /*
565   called when updateip event finishes
566  */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
568                                       void *private_data)
569 {
570         struct ctdb_do_updateip_state *state =
571                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
572         int32_t ret;
573
574         if (status != 0) {
575                 if (status == -ETIME) {
576                         ctdb_ban_self(ctdb);
577                 }
578                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579                         ctdb_addr_to_str(&state->vnn->public_address),
580                         state->old->name,
581                         ctdb_vnn_iface_string(state->vnn)));
582
583                 /*
584                  * All we can do is reset the old interface
585                  * and let the next run fix it
586                  */
587                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588                 state->vnn->iface = state->old;
589                 state->vnn->iface->references++;
590
591                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
592                 talloc_free(state);
593                 return;
594         }
595
596         if (ctdb->do_checkpublicip) {
597
598         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
599         if (ret != 0) {
600                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
601                 talloc_free(state);
602                 return;
603         }
604
605         }
606
607         /* the control succeeded */
608         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
609         talloc_free(state);
610         return;
611 }
612
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
614 {
615         state->vnn->update_in_flight = false;
616         return 0;
617 }
618
619 /*
620   update (move) an ip address
621  */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623                                 struct ctdb_req_control_old *c,
624                                 struct ctdb_vnn *vnn)
625 {
626         int ret;
627         struct ctdb_do_updateip_state *state;
628         struct ctdb_interface *old = vnn->iface;
629         const char *new_name;
630
631         if (vnn->update_in_flight) {
632                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633                                     "update for this IP already in flight\n",
634                                     ctdb_addr_to_str(&vnn->public_address),
635                                     vnn->public_netmask_bits));
636                 return -1;
637         }
638
639         ctdb_vnn_unassign_iface(ctdb, vnn);
640         ret = ctdb_vnn_assign_iface(ctdb, vnn);
641         if (ret != 0) {
642                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643                                  "assin a usable interface (old iface '%s')\n",
644                                  ctdb_addr_to_str(&vnn->public_address),
645                                  vnn->public_netmask_bits,
646                                  old->name));
647                 return -1;
648         }
649
650         new_name = ctdb_vnn_iface_string(vnn);
651         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652                 /* A benign update from one interface onto itself.
653                  * no need to run the eventscripts in this case, just return
654                  * success.
655                  */
656                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
657                 return 0;
658         }
659
660         state = talloc(vnn, struct ctdb_do_updateip_state);
661         CTDB_NO_MEMORY(ctdb, state);
662
663         state->c = talloc_steal(ctdb, c);
664         state->old = old;
665         state->vnn = vnn;
666
667         vnn->update_in_flight = true;
668         talloc_set_destructor(state, ctdb_updateip_destructor);
669
670         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671                             "interface %s to %s\n",
672                             ctdb_addr_to_str(&vnn->public_address),
673                             vnn->public_netmask_bits,
674                             old->name,
675                             new_name));
676
677         ret = ctdb_event_script_callback(ctdb,
678                                          state,
679                                          ctdb_do_updateip_callback,
680                                          state,
681                                          CTDB_EVENT_UPDATE_IP,
682                                          "%s %s %s %u",
683                                          state->old->name,
684                                          new_name,
685                                          ctdb_addr_to_str(&vnn->public_address),
686                                          vnn->public_netmask_bits);
687         if (ret != 0) {
688                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689                                  ctdb_addr_to_str(&vnn->public_address),
690                                  old->name, new_name));
691                 talloc_free(state);
692                 return -1;
693         }
694
695         return 0;
696 }
697
698 /*
699   Find the vnn of the node that has a public ip address
700   returns -1 if the address is not known as a public address
701  */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
703 {
704         struct ctdb_vnn *vnn;
705
706         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707                 if (ctdb_same_ip(&vnn->public_address, addr)) {
708                         return vnn;
709                 }
710         }
711
712         return NULL;
713 }
714
715 /*
716   take over an ip address
717  */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719                                  struct ctdb_req_control_old *c,
720                                  TDB_DATA indata,
721                                  bool *async_reply)
722 {
723         int ret;
724         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725         struct ctdb_vnn *vnn;
726         bool have_ip = false;
727         bool do_updateip = false;
728         bool do_takeip = false;
729         struct ctdb_interface *best_iface = NULL;
730
731         if (pip->pnn != ctdb->pnn) {
732                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733                                  "with pnn %d, but we're node %d\n",
734                                  ctdb_addr_to_str(&pip->addr),
735                                  pip->pnn, ctdb->pnn));
736                 return -1;
737         }
738
739         /* update out vnn list */
740         vnn = find_public_ip_vnn(ctdb, &pip->addr);
741         if (vnn == NULL) {
742                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743                         ctdb_addr_to_str(&pip->addr)));
744                 return 0;
745         }
746
747         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748                 have_ip = ctdb_sys_have_ip(&pip->addr);
749         }
750         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751         if (best_iface == NULL) {
752                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753                                  "a usable interface (old %s, have_ip %d)\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  vnn->public_netmask_bits,
756                                  ctdb_vnn_iface_string(vnn),
757                                  have_ip));
758                 return -1;
759         }
760
761         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
763                 have_ip = false;
764         }
765
766
767         if (vnn->iface == NULL && have_ip) {
768                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770                                  ctdb_addr_to_str(&vnn->public_address)));
771                 return 0;
772         }
773
774         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776                                   "and we have it on iface[%s], but it was assigned to node %d"
777                                   "and we are node %d, banning ourself\n",
778                                  ctdb_addr_to_str(&vnn->public_address),
779                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
780                 ctdb_ban_self(ctdb);
781                 return -1;
782         }
783
784         if (vnn->pnn == -1 && have_ip) {
785                 vnn->pnn = ctdb->pnn;
786                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787                                   "and we already have it on iface[%s], update local daemon\n",
788                                  ctdb_addr_to_str(&vnn->public_address),
789                                   ctdb_vnn_iface_string(vnn)));
790                 return 0;
791         }
792
793         if (vnn->iface) {
794                 if (vnn->iface != best_iface) {
795                         if (!vnn->iface->link_up) {
796                                 do_updateip = true;
797                         } else if (vnn->iface->references > (best_iface->references + 1)) {
798                                 /* only move when the rebalance gains something */
799                                         do_updateip = true;
800                         }
801                 }
802         }
803
804         if (!have_ip) {
805                 if (do_updateip) {
806                         ctdb_vnn_unassign_iface(ctdb, vnn);
807                         do_updateip = false;
808                 }
809                 do_takeip = true;
810         }
811
812         if (do_takeip) {
813                 ret = ctdb_do_takeip(ctdb, c, vnn);
814                 if (ret != 0) {
815                         return -1;
816                 }
817         } else if (do_updateip) {
818                 ret = ctdb_do_updateip(ctdb, c, vnn);
819                 if (ret != 0) {
820                         return -1;
821                 }
822         } else {
823                 /*
824                  * The interface is up and the kernel known the ip
825                  * => do nothing
826                  */
827                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828                         ctdb_addr_to_str(&pip->addr),
829                         vnn->public_netmask_bits,
830                         ctdb_vnn_iface_string(vnn)));
831                 return 0;
832         }
833
834         /* tell ctdb_control.c that we will be replying asynchronously */
835         *async_reply = true;
836
837         return 0;
838 }
839
840 /*
841   kill any clients that are registered with a IP that is being released
842  */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
844 {
845         struct ctdb_client_ip *ip;
846
847         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848                 ctdb_addr_to_str(addr)));
849
850         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851                 ctdb_sock_addr tmp_addr;
852
853                 tmp_addr = ip->addr;
854                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
855                         ip->client_id,
856                         ctdb_addr_to_str(&ip->addr)));
857
858                 if (ctdb_same_ip(&tmp_addr, addr)) {
859                         struct ctdb_client *client = reqid_find(ctdb->idr,
860                                                                 ip->client_id,
861                                                                 struct ctdb_client);
862                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
863                                 ip->client_id,
864                                 ctdb_addr_to_str(&ip->addr),
865                                 client->pid));
866
867                         if (client->pid != 0) {
868                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869                                         (unsigned)client->pid,
870                                         ctdb_addr_to_str(addr),
871                                         ip->client_id));
872                                 kill(client->pid, SIGKILL);
873                         }
874                 }
875         }
876 }
877
/*
  Drop a public address (vnn) from this node: unlink it from ctdb->vnn,
  release its interface assignment, let ctdb_remove_orphaned_ifaces()
  clean up (presumably interfaces only this vnn referenced - confirm
  against that helper) and finally free the vnn itself.

  The vnn pointer is invalid once this returns.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        /* must be last: the calls above still use vnn */
        talloc_free(vnn);
}
885
886 /*
887   called when releaseip event finishes
888  */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
890                                 void *private_data)
891 {
892         struct takeover_callback_state *state = 
893                 talloc_get_type(private_data, struct takeover_callback_state);
894         TDB_DATA data;
895
896         if (status == -ETIME) {
897                 ctdb_ban_self(ctdb);
898         }
899
900         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901                 if  (ctdb_sys_have_ip(state->addr)) {
902                         DEBUG(DEBUG_ERR,
903                               ("IP %s still hosted during release IP callback, failing\n",
904                                ctdb_addr_to_str(state->addr)));
905                         ctdb_request_control_reply(ctdb, state->c,
906                                                    NULL, -1, NULL);
907                         talloc_free(state);
908                         return;
909                 }
910         }
911
912         /* send a message to all clients of this node telling them
913            that the cluster has been reconfigured and they should
914            release any sockets on this IP */
915         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917         data.dsize = strlen((char *)data.dptr)+1;
918
919         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
920
921         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
922
923         /* kill clients that have registered with this IP */
924         release_kill_clients(ctdb, state->addr);
925
926         ctdb_vnn_unassign_iface(ctdb, state->vnn);
927
928         /* Process the IP if it has been marked for deletion */
929         if (state->vnn->delete_pending) {
930                 do_delete_ip(ctdb, state->vnn);
931                 state->vnn = NULL;
932         }
933
934         /* the control succeeded */
935         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
936         talloc_free(state);
937 }
938
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
940 {
941         if (state->vnn != NULL) {
942                 state->vnn->update_in_flight = false;
943         }
944         return 0;
945 }
946
947 /*
948   release an ip address
949  */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
951                                 struct ctdb_req_control_old *c,
952                                 TDB_DATA indata, 
953                                 bool *async_reply)
954 {
955         int ret;
956         struct takeover_callback_state *state;
957         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958         struct ctdb_vnn *vnn;
959         char *iface;
960
961         /* update our vnn list */
962         vnn = find_public_ip_vnn(ctdb, &pip->addr);
963         if (vnn == NULL) {
964                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965                         ctdb_addr_to_str(&pip->addr)));
966                 return 0;
967         }
968         vnn->pnn = pip->pnn;
969
970         /* stop any previous arps */
971         talloc_free(vnn->takeover_ctx);
972         vnn->takeover_ctx = NULL;
973
974         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975          * lazy multicast to drop an IP from any node that isn't the
976          * intended new node.  The following causes makes ctdbd ignore
977          * a release for any address it doesn't host.
978          */
979         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980                 if (!ctdb_sys_have_ip(&pip->addr)) {
981                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982                                 ctdb_addr_to_str(&pip->addr),
983                                 vnn->public_netmask_bits,
984                                 ctdb_vnn_iface_string(vnn)));
985                         ctdb_vnn_unassign_iface(ctdb, vnn);
986                         return 0;
987                 }
988         } else {
989                 if (vnn->iface == NULL) {
990                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991                                            ctdb_addr_to_str(&pip->addr),
992                                            vnn->public_netmask_bits));
993                         return 0;
994                 }
995         }
996
997         /* There is a potential race between take_ip and us because we
998          * update the VNN via a callback that run when the
999          * eventscripts have been run.  Avoid the race by allowing one
1000          * update to be in flight at a time.
1001          */
1002         if (vnn->update_in_flight) {
1003                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004                                     "update for this IP already in flight\n",
1005                                     ctdb_addr_to_str(&vnn->public_address),
1006                                     vnn->public_netmask_bits));
1007                 return -1;
1008         }
1009
1010         iface = strdup(ctdb_vnn_iface_string(vnn));
1011
1012         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1013                 ctdb_addr_to_str(&pip->addr),
1014                 vnn->public_netmask_bits,
1015                 iface,
1016                 pip->pnn));
1017
1018         state = talloc(ctdb, struct takeover_callback_state);
1019         if (state == NULL) {
1020                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021                                __FILE__, __LINE__);
1022                 free(iface);
1023                 return -1;
1024         }
1025
1026         state->c = talloc_steal(state, c);
1027         state->addr = talloc(state, ctdb_sock_addr);       
1028         if (state->addr == NULL) {
1029                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030                                __FILE__, __LINE__);
1031                 free(iface);
1032                 talloc_free(state);
1033                 return -1;
1034         }
1035         *state->addr = pip->addr;
1036         state->vnn   = vnn;
1037
1038         vnn->update_in_flight = true;
1039         talloc_set_destructor(state, ctdb_releaseip_destructor);
1040
1041         ret = ctdb_event_script_callback(ctdb, 
1042                                          state, release_ip_callback, state,
1043                                          CTDB_EVENT_RELEASE_IP,
1044                                          "%s %s %u",
1045                                          iface,
1046                                          ctdb_addr_to_str(&pip->addr),
1047                                          vnn->public_netmask_bits);
1048         free(iface);
1049         if (ret != 0) {
1050                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051                         ctdb_addr_to_str(&pip->addr),
1052                         ctdb_vnn_iface_string(vnn)));
1053                 talloc_free(state);
1054                 return -1;
1055         }
1056
1057         /* tell the control that we will be reply asynchronously */
1058         *async_reply = true;
1059         return 0;
1060 }
1061
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063                                    ctdb_sock_addr *addr,
1064                                    unsigned mask, const char *ifaces,
1065                                    bool check_address)
1066 {
1067         struct ctdb_vnn      *vnn;
1068         uint32_t num = 0;
1069         char *tmp;
1070         const char *iface;
1071         int i;
1072         int ret;
1073
1074         tmp = strdup(ifaces);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 if (!ctdb_sys_check_iface_exists(iface)) {
1077                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1078                         free(tmp);
1079                         return -1;
1080                 }
1081         }
1082         free(tmp);
1083
1084         /* Verify that we don't have an entry for this ip yet */
1085         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1088                                 ctdb_addr_to_str(addr)));
1089                         return -1;
1090                 }               
1091         }
1092
1093         /* create a new vnn structure for this ip address */
1094         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097         tmp = talloc_strdup(vnn, ifaces);
1098         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1104                 num++;
1105         }
1106         talloc_free(tmp);
1107         vnn->ifaces[num] = NULL;
1108         vnn->public_address      = *addr;
1109         vnn->public_netmask_bits = mask;
1110         vnn->pnn                 = -1;
1111         if (check_address) {
1112                 if (ctdb_sys_have_ip(addr)) {
1113                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114                         vnn->pnn = ctdb->pnn;
1115                 }
1116         }
1117
1118         for (i=0; vnn->ifaces[i]; i++) {
1119                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1120                 if (ret != 0) {
1121                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122                                            "for public_address[%s]\n",
1123                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1124                         talloc_free(vnn);
1125                         return -1;
1126                 }
1127         }
1128
1129         DLIST_ADD(ctdb->vnn, vnn);
1130
1131         return 0;
1132 }
1133
1134 /*
1135   setup the public address lists from a file
1136 */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1138 {
1139         char **lines;
1140         int nlines;
1141         int i;
1142
1143         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144         if (lines == NULL) {
1145                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1146                 return -1;
1147         }
1148         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1149                 nlines--;
1150         }
1151
1152         for (i=0;i<nlines;i++) {
1153                 unsigned mask;
1154                 ctdb_sock_addr addr;
1155                 const char *addrstr;
1156                 const char *ifaces;
1157                 char *tok, *line;
1158
1159                 line = lines[i];
1160                 while ((*line == ' ') || (*line == '\t')) {
1161                         line++;
1162                 }
1163                 if (*line == '#') {
1164                         continue;
1165                 }
1166                 if (strcmp(line, "") == 0) {
1167                         continue;
1168                 }
1169                 tok = strtok(line, " \t");
1170                 addrstr = tok;
1171                 tok = strtok(NULL, " \t");
1172                 if (tok == NULL) {
1173                         if (NULL == ctdb->default_public_interface) {
1174                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1175                                          i+1));
1176                                 talloc_free(lines);
1177                                 return -1;
1178                         }
1179                         ifaces = ctdb->default_public_interface;
1180                 } else {
1181                         ifaces = tok;
1182                 }
1183
1184                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1186                         talloc_free(lines);
1187                         return -1;
1188                 }
1189                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1191                         talloc_free(lines);
1192                         return -1;
1193                 }
1194         }
1195
1196
1197         talloc_free(lines);
1198         return 0;
1199 }
1200
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1202                               const char *iface,
1203                               const char *ip)
1204 {
1205         struct ctdb_vnn *svnn;
1206         struct ctdb_interface *cur = NULL;
1207         bool ok;
1208         int ret;
1209
1210         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211         CTDB_NO_MEMORY(ctdb, svnn);
1212
1213         svnn->ifaces = talloc_array(svnn, const char *, 2);
1214         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217         svnn->ifaces[1] = NULL;
1218
1219         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1220         if (!ok) {
1221                 talloc_free(svnn);
1222                 return -1;
1223         }
1224
1225         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1226         if (ret != 0) {
1227                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228                                    "for single_ip[%s]\n",
1229                                    svnn->ifaces[0],
1230                                    ctdb_addr_to_str(&svnn->public_address)));
1231                 talloc_free(svnn);
1232                 return -1;
1233         }
1234
1235         /* assume the single public ip interface is initially "good" */
1236         cur = ctdb_find_iface(ctdb, iface);
1237         if (cur == NULL) {
1238                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1239                 return -1;
1240         }
1241         cur->link_up = true;
1242
1243         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1244         if (ret != 0) {
1245                 talloc_free(svnn);
1246                 return -1;
1247         }
1248
1249         ctdb->single_ip_vnn = svnn;
1250         return 0;
1251 }
1252
/* One public IP address in the singly linked list used by the IP
 * allocation code in this file. */
struct public_ip_list {
        /* next entry, NULL at the end of the list */
        struct public_ip_list *next;
        /* node the address is assigned to; code in this file stores and
         * compares against -1 to mean "unassigned" */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1258
1259 /* Given a physical node, return the number of
1260    public addresses that is currently assigned to this node.
1261 */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1263 {
1264         int num=0;
1265
1266         for (;ips;ips=ips->next) {
1267                 if (ips->pnn == pnn) {
1268                         num++;
1269                 }
1270         }
1271         return num;
1272 }
1273
1274
1275 /* Can the given node host the given IP: is the public IP known to the
1276  * node and is NOIPHOST unset?
1277 */
1278 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1279                              struct ctdb_ipflags ipflags,
1280                              struct public_ip_list *ip)
1281 {
1282         struct ctdb_public_ip_list_old *public_ips;
1283         int i;
1284
1285         if (ipflags.noiphost) {
1286                 return false;
1287         }
1288
1289         public_ips = ctdb->ipalloc_state->available_public_ips[pnn];
1290
1291         if (public_ips == NULL) {
1292                 return false;
1293         }
1294
1295         for (i=0; i<public_ips->num; i++) {
1296                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1297                         /* yes, this node can serve this public ip */
1298                         return true;
1299                 }
1300         }
1301
1302         return false;
1303 }
1304
1305 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1306                                  struct ctdb_ipflags ipflags,
1307                                  struct public_ip_list *ip)
1308 {
1309         if (ipflags.noiptakeover) {
1310                 return false;
1311         }
1312
1313         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1314 }
1315
1316 /* search the node lists list for a node to takeover this ip.
1317    pick the node that currently are serving the least number of ips
1318    so that the ips get spread out evenly.
1319 */
1320 static int find_takeover_node(struct ctdb_context *ctdb,
1321                               struct ctdb_ipflags *ipflags,
1322                               struct public_ip_list *ip,
1323                               struct public_ip_list *all_ips)
1324 {
1325         int pnn, min=0, num;
1326         int i, numnodes;
1327
1328         numnodes = talloc_array_length(ipflags);
1329         pnn    = -1;
1330         for (i=0; i<numnodes; i++) {
1331                 /* verify that this node can serve this ip */
1332                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1333                         /* no it couldnt   so skip to the next node */
1334                         continue;
1335                 }
1336
1337                 num = node_ip_coverage(i, all_ips);
1338                 /* was this the first node we checked ? */
1339                 if (pnn == -1) {
1340                         pnn = i;
1341                         min  = num;
1342                 } else {
1343                         if (num < min) {
1344                                 pnn = i;
1345                                 min  = num;
1346                         }
1347                 }
1348         }       
1349         if (pnn == -1) {
1350                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1351                         ctdb_addr_to_str(&ip->addr)));
1352
1353                 return -1;
1354         }
1355
1356         ip->pnn = pnn;
1357         return 0;
1358 }
1359
1360 #define IP_KEYLEN       4
1361 static uint32_t *ip_key(ctdb_sock_addr *ip)
1362 {
1363         static uint32_t key[IP_KEYLEN];
1364
1365         bzero(key, sizeof(key));
1366
1367         switch (ip->sa.sa_family) {
1368         case AF_INET:
1369                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1370                 break;
1371         case AF_INET6: {
1372                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1373                 key[0]  = htonl(s6_a32[0]);
1374                 key[1]  = htonl(s6_a32[1]);
1375                 key[2]  = htonl(s6_a32[2]);
1376                 key[3]  = htonl(s6_a32[3]);
1377                 break;
1378         }
1379         default:
1380                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1381                 return key;
1382         }
1383
1384         return key;
1385 }
1386
1387 static void *add_ip_callback(void *parm, void *data)
1388 {
1389         struct public_ip_list *this_ip = parm;
1390         struct public_ip_list *prev_ip = data;
1391
1392         if (prev_ip == NULL) {
1393                 return parm;
1394         }
1395         if (this_ip->pnn == -1) {
1396                 this_ip->pnn = prev_ip->pnn;
1397         }
1398
1399         return parm;
1400 }
1401
1402 static int getips_count_callback(void *param, void *data)
1403 {
1404         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1405         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1406
1407         new_ip->next = *ip_list;
1408         *ip_list     = new_ip;
1409         return 0;
1410 }
1411
/* Forward declaration: used by ctdb_reload_remote_public_ips() below,
 * defined elsewhere in this file. */
static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
                                       struct ctdb_public_ip_list_old *ips,
                                       uint32_t pnn);
1415
1416 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1417                                          struct ipalloc_state *ipalloc_state,
1418                                          struct ctdb_node_map_old *nodemap)
1419 {
1420         int j;
1421         int ret;
1422
1423         if (ipalloc_state->num != nodemap->num) {
1424                 DEBUG(DEBUG_ERR,
1425                       (__location__
1426                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1427                        ipalloc_state->num, nodemap->num));
1428                 return -1;
1429         }
1430
1431         for (j=0; j<nodemap->num; j++) {
1432                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1433                         continue;
1434                 }
1435
1436                 /* Retrieve the list of known public IPs from the node */
1437                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1438                                         TAKEOVER_TIMEOUT(),
1439                                         j,
1440                                         ctdb->nodes,
1441                                         0,
1442                                         &ipalloc_state->known_public_ips[j]);
1443                 if (ret != 0) {
1444                         DEBUG(DEBUG_ERR,
1445                               ("Failed to read known public IPs from node: %u\n",
1446                                j));
1447                         return -1;
1448                 }
1449
1450                 if (ctdb->do_checkpublicip) {
1451                         verify_remote_ip_allocation(ctdb,
1452                                                     ipalloc_state->known_public_ips[j],
1453                                                     j);
1454                 }
1455
1456                 /* Retrieve the list of available public IPs from the node */
1457                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1458                                         TAKEOVER_TIMEOUT(),
1459                                         j,
1460                                         ctdb->nodes,
1461                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1462                                         &ipalloc_state->available_public_ips[j]);
1463                 if (ret != 0) {
1464                         DEBUG(DEBUG_ERR,
1465                               ("Failed to read available public IPs from node: %u\n",
1466                                j));
1467                         return -1;
1468                 }
1469         }
1470
1471         return 0;
1472 }
1473
1474 static struct public_ip_list *
1475 create_merged_ip_list(struct ctdb_context *ctdb)
1476 {
1477         int i, j;
1478         struct public_ip_list *ip_list;
1479         struct ctdb_public_ip_list_old *public_ips;
1480
1481         TALLOC_FREE(ctdb->ip_tree);
1482         ctdb->ip_tree = trbt_create(ctdb, 0);
1483
1484         for (i=0; i < ctdb->num_nodes; i++) {
1485                 public_ips = ctdb->ipalloc_state->known_public_ips[i];
1486
1487                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1488                         continue;
1489                 }
1490
1491                 /* there were no public ips for this node */
1492                 if (public_ips == NULL) {
1493                         continue;
1494                 }
1495
1496                 for (j=0; j < public_ips->num; j++) {
1497                         struct public_ip_list *tmp_ip;
1498
1499                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1500                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1501                         /* Do not use information about IP addresses hosted
1502                          * on other nodes, it may not be accurate */
1503                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1504                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1505                         } else {
1506                                 tmp_ip->pnn = -1;
1507                         }
1508                         tmp_ip->addr = public_ips->ips[j].addr;
1509                         tmp_ip->next = NULL;
1510
1511                         trbt_insertarray32_callback(ctdb->ip_tree,
1512                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1513                                 add_ip_callback,
1514                                 tmp_ip);
1515                 }
1516         }
1517
1518         ip_list = NULL;
1519         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1520
1521         return ip_list;
1522 }
1523
1524 /* 
1525  * This is the length of the longtest common prefix between the IPs.
1526  * It is calculated by XOR-ing the 2 IPs together and counting the
1527  * number of leading zeroes.  The implementation means that all
1528  * addresses end up being 128 bits long.
1529  *
1530  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1531  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1532  * lots of nodes and IP addresses?
1533  */
1534 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1535 {
1536         uint32_t ip1_k[IP_KEYLEN];
1537         uint32_t *t;
1538         int i;
1539         uint32_t x;
1540
1541         uint32_t distance = 0;
1542
1543         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1544         t = ip_key(ip2);
1545         for (i=0; i<IP_KEYLEN; i++) {
1546                 x = ip1_k[i] ^ t[i];
1547                 if (x == 0) {
1548                         distance += 32;
1549                 } else {
1550                         /* Count number of leading zeroes. 
1551                          * FIXME? This could be optimised...
1552                          */
1553                         while ((x & (1 << 31)) == 0) {
1554                                 x <<= 1;
1555                                 distance += 1;
1556                         }
1557                 }
1558         }
1559
1560         return distance;
1561 }
1562
1563 /* Calculate the IP distance for the given IP relative to IPs on the
1564    given node.  The ips argument is generally the all_ips variable
1565    used in the main part of the algorithm.
1566  */
1567 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1568                                   struct public_ip_list *ips,
1569                                   int pnn)
1570 {
1571         struct public_ip_list *t;
1572         uint32_t d;
1573
1574         uint32_t sum = 0;
1575
1576         for (t=ips; t != NULL; t=t->next) {
1577                 if (t->pnn != pnn) {
1578                         continue;
1579                 }
1580
1581                 /* Optimisation: We never calculate the distance
1582                  * between an address and itself.  This allows us to
1583                  * calculate the effect of removing an address from a
1584                  * node by simply calculating the distance between
1585                  * that address and all of the exitsing addresses.
1586                  * Moreover, we assume that we're only ever dealing
1587                  * with addresses from all_ips so we can identify an
1588                  * address via a pointer rather than doing a more
1589                  * expensive address comparison. */
1590                 if (&(t->addr) == ip) {
1591                         continue;
1592                 }
1593
1594                 d = ip_distance(ip, &(t->addr));
1595                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1596         }
1597
1598         return sum;
1599 }
1600
1601 /* Return the LCP2 imbalance metric for addresses currently assigned
1602    to the given node.
1603  */
1604 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1605 {
1606         struct public_ip_list *t;
1607
1608         uint32_t imbalance = 0;
1609
1610         for (t=all_ips; t!=NULL; t=t->next) {
1611                 if (t->pnn != pnn) {
1612                         continue;
1613                 }
1614                 /* Pass the rest of the IPs rather than the whole
1615                    all_ips input list.
1616                 */
1617                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1618         }
1619
1620         return imbalance;
1621 }
1622
1623 /* Allocate any unassigned IPs just by looping through the IPs and
1624  * finding the best node for each.
1625  */
1626 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1627                                       struct ctdb_ipflags *ipflags,
1628                                       struct public_ip_list *all_ips)
1629 {
1630         struct public_ip_list *tmp_ip;
1631
1632         /* loop over all ip's and find a physical node to cover for 
1633            each unassigned ip.
1634         */
1635         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1636                 if (tmp_ip->pnn == -1) {
1637                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1638                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1639                                         ctdb_addr_to_str(&tmp_ip->addr)));
1640                         }
1641                 }
1642         }
1643 }
1644
1645 /* Basic non-deterministic rebalancing algorithm.
1646  */
1647 static void basic_failback(struct ctdb_context *ctdb,
1648                            struct ctdb_ipflags *ipflags,
1649                            struct public_ip_list *all_ips,
1650                            int num_ips)
1651 {
1652         int i, numnodes;
1653         int maxnode, maxnum, minnode, minnum, num, retries;
1654         struct public_ip_list *tmp_ip;
1655
1656         numnodes = talloc_array_length(ipflags);
1657         retries = 0;
1658
1659 try_again:
1660         maxnum=0;
1661         minnum=0;
1662
1663         /* for each ip address, loop over all nodes that can serve
1664            this ip and make sure that the difference between the node
1665            serving the most and the node serving the least ip's are
1666            not greater than 1.
1667         */
1668         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1669                 if (tmp_ip->pnn == -1) {
1670                         continue;
1671                 }
1672
1673                 /* Get the highest and lowest number of ips's served by any 
1674                    valid node which can serve this ip.
1675                 */
1676                 maxnode = -1;
1677                 minnode = -1;
1678                 for (i=0; i<numnodes; i++) {
1679                         /* only check nodes that can actually serve this ip */
1680                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1681                                 /* no it couldnt   so skip to the next node */
1682                                 continue;
1683                         }
1684
1685                         num = node_ip_coverage(i, all_ips);
1686                         if (maxnode == -1) {
1687                                 maxnode = i;
1688                                 maxnum  = num;
1689                         } else {
1690                                 if (num > maxnum) {
1691                                         maxnode = i;
1692                                         maxnum  = num;
1693                                 }
1694                         }
1695                         if (minnode == -1) {
1696                                 minnode = i;
1697                                 minnum  = num;
1698                         } else {
1699                                 if (num < minnum) {
1700                                         minnode = i;
1701                                         minnum  = num;
1702                                 }
1703                         }
1704                 }
1705                 if (maxnode == -1) {
1706                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1707                                 ctdb_addr_to_str(&tmp_ip->addr)));
1708
1709                         continue;
1710                 }
1711
1712                 /* if the spread between the smallest and largest coverage by
1713                    a node is >=2 we steal one of the ips from the node with
1714                    most coverage to even things out a bit.
1715                    try to do this a limited number of times since we dont
1716                    want to spend too much time balancing the ip coverage.
1717                 */
1718                 if ( (maxnum > minnum+1)
1719                      && (retries < (num_ips + 5)) ){
1720                         struct public_ip_list *tmp;
1721
1722                         /* Reassign one of maxnode's VNNs */
1723                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1724                                 if (tmp->pnn == maxnode) {
1725                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1726                                         retries++;
1727                                         goto try_again;;
1728                                 }
1729                         }
1730                 }
1731         }
1732 }
1733
1734 static void lcp2_init(struct ctdb_context *tmp_ctx,
1735                       struct ctdb_ipflags *ipflags,
1736                       struct public_ip_list *all_ips,
1737                       uint32_t *force_rebalance_nodes,
1738                       uint32_t **lcp2_imbalances,
1739                       bool **rebalance_candidates)
1740 {
1741         int i, numnodes;
1742         struct public_ip_list *tmp_ip;
1743
1744         numnodes = talloc_array_length(ipflags);
1745
1746         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1747         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1748         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1750
1751         for (i=0; i<numnodes; i++) {
1752                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1753                 /* First step: assume all nodes are candidates */
1754                 (*rebalance_candidates)[i] = true;
1755         }
1756
1757         /* 2nd step: if a node has IPs assigned then it must have been
1758          * healthy before, so we remove it from consideration.  This
1759          * is overkill but is all we have because we don't maintain
1760          * state between takeover runs.  An alternative would be to
1761          * keep state and invalidate it every time the recovery master
1762          * changes.
1763          */
1764         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1765                 if (tmp_ip->pnn != -1) {
1766                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1767                 }
1768         }
1769
1770         /* 3rd step: if a node is forced to re-balance then
1771            we allow failback onto the node */
1772         if (force_rebalance_nodes == NULL) {
1773                 return;
1774         }
1775         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1776                 uint32_t pnn = force_rebalance_nodes[i];
1777                 if (pnn >= numnodes) {
1778                         DEBUG(DEBUG_ERR,
1779                               (__location__ "unknown node %u\n", pnn));
1780                         continue;
1781                 }
1782
1783                 DEBUG(DEBUG_NOTICE,
1784                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1785                 (*rebalance_candidates)[pnn] = true;
1786         }
1787 }
1788
1789 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1790  * the IP/node combination that will cost the least.
1791  */
1792 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1793                                      struct ctdb_ipflags *ipflags,
1794                                      struct public_ip_list *all_ips,
1795                                      uint32_t *lcp2_imbalances)
1796 {
1797         struct public_ip_list *tmp_ip;
1798         int dstnode, numnodes;
1799
1800         int minnode;
1801         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1802         struct public_ip_list *minip;
1803
1804         bool should_loop = true;
1805         bool have_unassigned = true;
1806
1807         numnodes = talloc_array_length(ipflags);
1808
1809         while (have_unassigned && should_loop) {
1810                 should_loop = false;
1811
1812                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1813                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1814
1815                 minnode = -1;
1816                 mindsum = 0;
1817                 minip = NULL;
1818
1819                 /* loop over each unassigned ip. */
1820                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1821                         if (tmp_ip->pnn != -1) {
1822                                 continue;
1823                         }
1824
1825                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1826                                 /* only check nodes that can actually takeover this ip */
1827                                 if (!can_node_takeover_ip(ctdb, dstnode,
1828                                                           ipflags[dstnode],
1829                                                           tmp_ip)) {
1830                                         /* no it couldnt   so skip to the next node */
1831                                         continue;
1832                                 }
1833
1834                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1835                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1836                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1837                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1838                                                    dstnode,
1839                                                    dstimbl - lcp2_imbalances[dstnode]));
1840
1841
1842                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1843                                         minnode = dstnode;
1844                                         minimbl = dstimbl;
1845                                         mindsum = dstdsum;
1846                                         minip = tmp_ip;
1847                                         should_loop = true;
1848                                 }
1849                         }
1850                 }
1851
1852                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1853
1854                 /* If we found one then assign it to the given node. */
1855                 if (minnode != -1) {
1856                         minip->pnn = minnode;
1857                         lcp2_imbalances[minnode] = minimbl;
1858                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1859                                           ctdb_addr_to_str(&(minip->addr)),
1860                                           minnode,
1861                                           mindsum));
1862                 }
1863
1864                 /* There might be a better way but at least this is clear. */
1865                 have_unassigned = false;
1866                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1867                         if (tmp_ip->pnn == -1) {
1868                                 have_unassigned = true;
1869                         }
1870                 }
1871         }
1872
1873         /* We know if we have an unassigned addresses so we might as
1874          * well optimise.
1875          */
1876         if (have_unassigned) {
1877                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1878                         if (tmp_ip->pnn == -1) {
1879                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1880                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1881                         }
1882                 }
1883         }
1884 }
1885
1886 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1887  * to move IPs from, determines the best IP/destination node
1888  * combination to move from the source node.
1889  */
1890 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1891                                     struct ctdb_ipflags *ipflags,
1892                                     struct public_ip_list *all_ips,
1893                                     int srcnode,
1894                                     uint32_t *lcp2_imbalances,
1895                                     bool *rebalance_candidates)
1896 {
1897         int dstnode, mindstnode, numnodes;
1898         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1899         uint32_t minsrcimbl, mindstimbl;
1900         struct public_ip_list *minip;
1901         struct public_ip_list *tmp_ip;
1902
1903         /* Find an IP and destination node that best reduces imbalance. */
1904         srcimbl = 0;
1905         minip = NULL;
1906         minsrcimbl = 0;
1907         mindstnode = -1;
1908         mindstimbl = 0;
1909
1910         numnodes = talloc_array_length(ipflags);
1911
1912         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1913         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1914                            srcnode, lcp2_imbalances[srcnode]));
1915
1916         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1917                 /* Only consider addresses on srcnode. */
1918                 if (tmp_ip->pnn != srcnode) {
1919                         continue;
1920                 }
1921
1922                 /* What is this IP address costing the source node? */
1923                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1924                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1925
1926                 /* Consider this IP address would cost each potential
1927                  * destination node.  Destination nodes are limited to
1928                  * those that are newly healthy, since we don't want
1929                  * to do gratuitous failover of IPs just to make minor
1930                  * balance improvements.
1931                  */
1932                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1933                         if (!rebalance_candidates[dstnode]) {
1934                                 continue;
1935                         }
1936
1937                         /* only check nodes that can actually takeover this ip */
1938                         if (!can_node_takeover_ip(ctdb, dstnode,
1939                                                   ipflags[dstnode], tmp_ip)) {
1940                                 /* no it couldnt   so skip to the next node */
1941                                 continue;
1942                         }
1943
1944                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1945                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1946                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1947                                            srcnode, -srcdsum,
1948                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1949                                            dstnode, dstdsum));
1950
1951                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1952                             (dstdsum < srcdsum) &&                      \
1953                             ((mindstnode == -1) ||                              \
1954                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1955
1956                                 minip = tmp_ip;
1957                                 minsrcimbl = srcimbl;
1958                                 mindstnode = dstnode;
1959                                 mindstimbl = dstimbl;
1960                         }
1961                 }
1962         }
1963         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1964
1965         if (mindstnode != -1) {
1966                 /* We found a move that makes things better... */
1967                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1968                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1969                                   ctdb_addr_to_str(&(minip->addr)),
1970                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1971
1972
1973                 lcp2_imbalances[srcnode] = minsrcimbl;
1974                 lcp2_imbalances[mindstnode] = mindstimbl;
1975                 minip->pnn = mindstnode;
1976
1977                 return true;
1978         }
1979
1980         return false;
1981         
1982 }
1983
/* Pairs a node number with its LCP2 imbalance so the two can be
 * sorted together.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.
 *
 * Orders by imbalance, largest first: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.  The pnn field
 * does not take part in the comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2002
2003 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2004  * node with the highest LCP2 imbalance, and then determines the best
2005  * IP/destination node combination to move from the source node.
2006  */
2007 static void lcp2_failback(struct ctdb_context *ctdb,
2008                           struct ctdb_ipflags *ipflags,
2009                           struct public_ip_list *all_ips,
2010                           uint32_t *lcp2_imbalances,
2011                           bool *rebalance_candidates)
2012 {
2013         int i, numnodes;
2014         struct lcp2_imbalance_pnn * lips;
2015         bool again;
2016
2017         numnodes = talloc_array_length(ipflags);
2018
2019 try_again:
2020         /* Put the imbalances and nodes into an array, sort them and
2021          * iterate through candidates.  Usually the 1st one will be
2022          * used, so this doesn't cost much...
2023          */
2024         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2025         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2026         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2027         for (i=0; i<numnodes; i++) {
2028                 lips[i].imbalance = lcp2_imbalances[i];
2029                 lips[i].pnn = i;
2030                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2031         }
2032         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2033               lcp2_cmp_imbalance_pnn);
2034
2035         again = false;
2036         for (i=0; i<numnodes; i++) {
2037                 /* This means that all nodes had 0 or 1 addresses, so
2038                  * can't be imbalanced.
2039                  */
2040                 if (lips[i].imbalance == 0) {
2041                         break;
2042                 }
2043
2044                 if (lcp2_failback_candidate(ctdb,
2045                                             ipflags,
2046                                             all_ips,
2047                                             lips[i].pnn,
2048                                             lcp2_imbalances,
2049                                             rebalance_candidates)) {
2050                         again = true;
2051                         break;
2052                 }
2053         }
2054
2055         talloc_free(lips);
2056         if (again) {
2057                 goto try_again;
2058         }
2059 }
2060
2061 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2062                                     struct ctdb_ipflags *ipflags,
2063                                     struct public_ip_list *all_ips)
2064 {
2065         struct public_ip_list *tmp_ip;
2066
2067         /* verify that the assigned nodes can serve that public ip
2068            and set it to -1 if not
2069         */
2070         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2071                 if (tmp_ip->pnn == -1) {
2072                         continue;
2073                 }
2074                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2075                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2076                         /* this node can not serve this ip. */
2077                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2078                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2079                                            tmp_ip->pnn));
2080                         tmp_ip->pnn = -1;
2081                 }
2082         }
2083 }
2084
2085 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2086                                        struct ctdb_ipflags *ipflags,
2087                                        struct public_ip_list *all_ips)
2088 {
2089         struct public_ip_list *tmp_ip;
2090         int i, numnodes;
2091
2092         numnodes = talloc_array_length(ipflags);
2093
2094         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2095        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2096         *  always be allocated the same way for a specific set of
2097         *  available/unavailable nodes.
2098         */
2099
2100         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2101                 tmp_ip->pnn = i % numnodes;
2102         }
2103
2104         /* IP failback doesn't make sense with deterministic
2105          * IPs, since the modulo step above implicitly fails
2106          * back IPs to their "home" node.
2107          */
2108         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2109                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2110         }
2111
2112         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2113
2114         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2115
2116         /* No failback here! */
2117 }
2118
2119 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2120                                           struct ctdb_ipflags *ipflags,
2121                                           struct public_ip_list *all_ips)
2122 {
2123         /* This should be pushed down into basic_failback. */
2124         struct public_ip_list *tmp_ip;
2125         int num_ips = 0;
2126         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2127                 num_ips++;
2128         }
2129
2130         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2131
2132         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2133
2134         /* If we don't want IPs to fail back then don't rebalance IPs. */
2135         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2136                 return;
2137         }
2138
2139         /* Now, try to make sure the ip adresses are evenly distributed
2140            across the nodes.
2141         */
2142         basic_failback(ctdb, ipflags, all_ips, num_ips);
2143 }
2144
2145 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2146                           struct ctdb_ipflags *ipflags,
2147                           struct public_ip_list *all_ips,
2148                           uint32_t *force_rebalance_nodes)
2149 {
2150         uint32_t *lcp2_imbalances;
2151         bool *rebalance_candidates;
2152         int numnodes, num_rebalance_candidates, i;
2153
2154         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2155
2156         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2157
2158         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2159                   &lcp2_imbalances, &rebalance_candidates);
2160
2161         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2162
2163         /* If we don't want IPs to fail back then don't rebalance IPs. */
2164         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2165                 goto finished;
2166         }
2167
2168         /* It is only worth continuing if we have suitable target
2169          * nodes to transfer IPs to.  This check is much cheaper than
2170          * continuing on...
2171          */
2172         numnodes = talloc_array_length(ipflags);
2173         num_rebalance_candidates = 0;
2174         for (i=0; i<numnodes; i++) {
2175                 if (rebalance_candidates[i]) {
2176                         num_rebalance_candidates++;
2177                 }
2178         }
2179         if (num_rebalance_candidates == 0) {
2180                 goto finished;
2181         }
2182
2183         /* Now, try to make sure the ip adresses are evenly distributed
2184            across the nodes.
2185         */
2186         lcp2_failback(ctdb, ipflags, all_ips,
2187                       lcp2_imbalances, rebalance_candidates);
2188
2189 finished:
2190         talloc_free(tmp_ctx);
2191 }
2192
2193 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2194 {
2195         int i;
2196
2197         for (i=0;i<nodemap->num;i++) {
2198                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2199                         /* Found one completely healthy node */
2200                         return false;
2201                 }
2202         }
2203
2204         return true;
2205 }
2206
2207 /* The calculation part of the IP allocation algorithm. */
2208 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2209                                    struct ctdb_ipflags *ipflags,
2210                                    struct public_ip_list *all_ips,
2211                                    uint32_t *force_rebalance_nodes)
2212 {
2213         switch (ctdb->ipalloc_state->algorithm) {
2214         case IPALLOC_LCP2:
2215                 ip_alloc_lcp2(ctdb, ipflags, all_ips, force_rebalance_nodes);
2216                 break;
2217         case IPALLOC_DETERMINISTIC:
2218                 ip_alloc_deterministic_ips(ctdb, ipflags, all_ips);
2219                 break;
2220         case IPALLOC_NONDETERMINISTIC:
2221                 ip_alloc_nondeterministic_ips(ctdb, ipflags, all_ips);
2222                break;
2223         }
2224
2225         /* at this point ->pnn is the node which will own each IP
2226            or -1 if there is no node that can cover this ip
2227         */
2228
2229         return;
2230 }
2231
2232 struct get_tunable_callback_data {
2233         const char *tunable;
2234         uint32_t *out;
2235         bool fatal;
2236 };
2237
2238 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2239                                  int32_t res, TDB_DATA outdata,
2240                                  void *callback)
2241 {
2242         struct get_tunable_callback_data *cd =
2243                 (struct get_tunable_callback_data *)callback;
2244         int size;
2245
2246         if (res != 0) {
2247                 /* Already handled in fail callback */
2248                 return;
2249         }
2250
2251         if (outdata.dsize != sizeof(uint32_t)) {
2252                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2253                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2254                                  (int)outdata.dsize));
2255                 cd->fatal = true;
2256                 return;
2257         }
2258
2259         size = talloc_array_length(cd->out);
2260         if (pnn >= size) {
2261                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2262                                  cd->tunable, pnn, size));
2263                 return;
2264         }
2265
2266                 
2267         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2268 }
2269
2270 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2271                                        int32_t res, TDB_DATA outdata,
2272                                        void *callback)
2273 {
2274         struct get_tunable_callback_data *cd =
2275                 (struct get_tunable_callback_data *)callback;
2276
2277         switch (res) {
2278         case -ETIME:
2279                 DEBUG(DEBUG_ERR,
2280                       ("Timed out getting tunable \"%s\" from node %d\n",
2281                        cd->tunable, pnn));
2282                 cd->fatal = true;
2283                 break;
2284         case -EINVAL:
2285         case -1:
2286                 DEBUG(DEBUG_WARNING,
2287                       ("Tunable \"%s\" not implemented on node %d\n",
2288                        cd->tunable, pnn));
2289                 break;
2290         default:
2291                 DEBUG(DEBUG_ERR,
2292                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2293                        cd->tunable, pnn));
2294                 cd->fatal = true;
2295         }
2296 }
2297
2298 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2299                                         TALLOC_CTX *tmp_ctx,
2300                                         struct ctdb_node_map_old *nodemap,
2301                                         const char *tunable,
2302                                         uint32_t default_value)
2303 {
2304         TDB_DATA data;
2305         struct ctdb_control_get_tunable *t;
2306         uint32_t *nodes;
2307         uint32_t *tvals;
2308         struct get_tunable_callback_data callback_data;
2309         int i;
2310
2311         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2312         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2313         for (i=0; i<nodemap->num; i++) {
2314                 tvals[i] = default_value;
2315         }
2316                 
2317         callback_data.out = tvals;
2318         callback_data.tunable = tunable;
2319         callback_data.fatal = false;
2320
2321         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2322         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2323         t = (struct ctdb_control_get_tunable *)data.dptr;
2324         t->length = strlen(tunable)+1;
2325         memcpy(t->name, tunable, t->length);
2326         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2327         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2328                                       nodes, 0, TAKEOVER_TIMEOUT(),
2329                                       false, data,
2330                                       get_tunable_callback,
2331                                       get_tunable_fail_callback,
2332                                       &callback_data) != 0) {
2333                 if (callback_data.fatal) {
2334                         talloc_free(tvals);
2335                         tvals = NULL;
2336                 }
2337         }
2338         talloc_free(nodes);
2339         talloc_free(data.dptr);
2340
2341         return tvals;
2342 }
2343
2344 /* Set internal flags for IP allocation:
2345  *   Clear ip flags
2346  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2347  *   Set NOIPHOST ip flag for each INACTIVE node
2348  *   if all nodes are disabled:
2349  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2350  *   else
2351  *     Set NOIPHOST ip flags for disabled nodes
2352  */
2353 static struct ctdb_ipflags *
2354 set_ipflags_internal(struct ctdb_context *ctdb,
2355                      TALLOC_CTX *tmp_ctx,
2356                      struct ctdb_node_map_old *nodemap,
2357                      uint32_t *tval_noiptakeover,
2358                      uint32_t *tval_noiphostonalldisabled)
2359 {
2360         int i;
2361         struct ctdb_ipflags *ipflags;
2362
2363         /* Clear IP flags - implicit due to talloc_zero */
2364         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2365         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2366
2367         for (i=0;i<nodemap->num;i++) {
2368                 /* Can not take IPs on node with NoIPTakeover set */
2369                 if (tval_noiptakeover[i] != 0) {
2370                         ipflags[i].noiptakeover = true;
2371                 }
2372
2373                 /* Can not host IPs on INACTIVE node */
2374                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2375                         ipflags[i].noiphost = true;
2376                 }
2377         }
2378
2379         if (all_nodes_are_disabled(nodemap)) {
2380                 /* If all nodes are disabled, can not host IPs on node
2381                  * with NoIPHostOnAllDisabled set
2382                  */
2383                 for (i=0;i<nodemap->num;i++) {
2384                         if (tval_noiphostonalldisabled[i] != 0) {
2385                                 ipflags[i].noiphost = true;
2386                         }
2387                 }
2388         } else {
2389                 /* If some nodes are not disabled, then can not host
2390                  * IPs on DISABLED node
2391                  */
2392                 for (i=0;i<nodemap->num;i++) {
2393                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2394                                 ipflags[i].noiphost = true;
2395                         }
2396                 }
2397         }
2398
2399         return ipflags;
2400 }
2401
2402 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2403                                         TALLOC_CTX *tmp_ctx,
2404                                         struct ctdb_node_map_old *nodemap)
2405 {
2406         uint32_t *tval_noiptakeover;
2407         uint32_t *tval_noiphostonalldisabled;
2408         struct ctdb_ipflags *ipflags;
2409
2410
2411         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2412                                                    "NoIPTakeover", 0);
2413         if (tval_noiptakeover == NULL) {
2414                 return NULL;
2415         }
2416
2417         tval_noiphostonalldisabled =
2418                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2419                                        "NoIPHostOnAllDisabled", 0);
2420         if (tval_noiphostonalldisabled == NULL) {
2421                 /* Caller frees tmp_ctx */
2422                 return NULL;
2423         }
2424
2425         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2426                                        tval_noiptakeover,
2427                                        tval_noiphostonalldisabled);
2428
2429         talloc_free(tval_noiptakeover);
2430         talloc_free(tval_noiphostonalldisabled);
2431
2432         return ipflags;
2433 }
2434
2435 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2436                                                  TALLOC_CTX *mem_ctx)
2437 {
2438         struct ipalloc_state *ipalloc_state =
2439                 talloc_zero(mem_ctx, struct ipalloc_state);
2440         if (ipalloc_state == NULL) {
2441                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2442                 return NULL;
2443         }
2444
2445         ipalloc_state->num = ctdb->num_nodes;
2446         ipalloc_state->known_public_ips =
2447                 talloc_zero_array(ipalloc_state,
2448                                   struct ctdb_public_ip_list_old *,
2449                                   ipalloc_state->num);
2450         if (ipalloc_state->known_public_ips == NULL) {
2451                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2452                 talloc_free(ipalloc_state);
2453                 return NULL;
2454         }
2455         ipalloc_state->available_public_ips =
2456                 talloc_zero_array(ipalloc_state,
2457                                   struct ctdb_public_ip_list_old *,
2458                                   ipalloc_state->num);
2459         if (ipalloc_state->available_public_ips == NULL) {
2460                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2461                 talloc_free(ipalloc_state);
2462                 return NULL;
2463         }
2464
2465         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2466                 ipalloc_state->algorithm = IPALLOC_LCP2;
2467         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2468                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2469         } else {
2470                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2471         }
2472
2473         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2474
2475         return ipalloc_state;
2476 }
2477
2478 struct iprealloc_callback_data {
2479         bool *retry_nodes;
2480         int retry_count;
2481         client_async_callback fail_callback;
2482         void *fail_callback_data;
2483         struct ctdb_node_map_old *nodemap;
2484 };
2485
2486 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2487                                         int32_t res, TDB_DATA outdata,
2488                                         void *callback)
2489 {
2490         int numnodes;
2491         struct iprealloc_callback_data *cd =
2492                 (struct iprealloc_callback_data *)callback;
2493
2494         numnodes = talloc_array_length(cd->retry_nodes);
2495         if (pnn > numnodes) {
2496                 DEBUG(DEBUG_ERR,
2497                       ("ipreallocated failure from node %d, "
2498                        "but only %d nodes in nodemap\n",
2499                        pnn, numnodes));
2500                 return;
2501         }
2502
2503         /* Can't run the "ipreallocated" event on a INACTIVE node */
2504         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2505                 DEBUG(DEBUG_WARNING,
2506                       ("ipreallocated failed on inactive node %d, ignoring\n",
2507                        pnn));
2508                 return;
2509         }
2510
2511         switch (res) {
2512         case -ETIME:
2513                 /* If the control timed out then that's a real error,
2514                  * so call the real fail callback
2515                  */
2516                 if (cd->fail_callback) {
2517                         cd->fail_callback(ctdb, pnn, res, outdata,
2518                                           cd->fail_callback_data);
2519                 } else {
2520                         DEBUG(DEBUG_WARNING,
2521                               ("iprealloc timed out but no callback registered\n"));
2522                 }
2523                 break;
2524         default:
2525                 /* If not a timeout then either the ipreallocated
2526                  * eventscript (or some setup) failed.  This might
2527                  * have failed because the IPREALLOCATED control isn't
2528                  * implemented - right now there is no way of knowing
2529                  * because the error codes are all folded down to -1.
2530                  * Consider retrying using EVENTSCRIPT control...
2531                  */
2532                 DEBUG(DEBUG_WARNING,
2533                       ("ipreallocated failure from node %d, flagging retry\n",
2534                        pnn));
2535                 cd->retry_nodes[pnn] = true;
2536                 cd->retry_count++;
2537         }
2538 }
2539
2540 struct takeover_callback_data {
2541         bool *node_failed;
2542         client_async_callback fail_callback;
2543         void *fail_callback_data;
2544         struct ctdb_node_map_old *nodemap;
2545 };
2546
2547 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2548                                        uint32_t node_pnn, int32_t res,
2549                                        TDB_DATA outdata, void *callback_data)
2550 {
2551         struct takeover_callback_data *cd =
2552                 talloc_get_type_abort(callback_data,
2553                                       struct takeover_callback_data);
2554         int i;
2555
2556         for (i = 0; i < cd->nodemap->num; i++) {
2557                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2558                         break;
2559                 }
2560         }
2561
2562         if (i == cd->nodemap->num) {
2563                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2564                 return;
2565         }
2566
2567         if (!cd->node_failed[i]) {
2568                 cd->node_failed[i] = true;
2569                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2570                                   cd->fail_callback_data);
2571         }
2572 }
2573
2574 /*
2575   make any IP alias changes for public addresses that are necessary 
2576  */
2577 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2578                       uint32_t *force_rebalance_nodes,
2579                       client_async_callback fail_callback, void *callback_data)
2580 {
2581         int i, j, ret;
2582         struct ctdb_public_ip ip;
2583         uint32_t *nodes;
2584         struct public_ip_list *all_ips, *tmp_ip;
2585         TDB_DATA data;
2586         struct timeval timeout;
2587         struct client_async_data *async_data;
2588         struct ctdb_client_control_state *state;
2589         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2590         struct ctdb_ipflags *ipflags;
2591         struct ipalloc_state *ipalloc_state;
2592         struct takeover_callback_data *takeover_data;
2593         struct iprealloc_callback_data iprealloc_data;
2594         bool *retry_data;
2595         bool can_host_ips;
2596
2597         /*
2598          * ip failover is completely disabled, just send out the 
2599          * ipreallocated event.
2600          */
2601         if (ctdb->tunable.disable_ip_failover != 0) {
2602                 goto ipreallocated;
2603         }
2604
2605         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2606         if (ipalloc_state == NULL) {
2607                 talloc_free(tmp_ctx);
2608                 return -1;
2609         }
2610         ctdb->ipalloc_state = ipalloc_state;
2611
2612         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2613         if (ipflags == NULL) {
2614                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2615                 talloc_free(tmp_ctx);
2616                 return -1;
2617         }
2618
2619         /* Fetch known/available public IPs from each active node */
2620         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2621         if (ret != 0) {
2622                 talloc_free(tmp_ctx);
2623                 return -1;
2624         }
2625
2626         /* Short-circuit IP allocation if no node has available IPs */
2627         can_host_ips = false;
2628         for (i=0; i < ipalloc_state->num; i++) {
2629                 if (ipalloc_state->available_public_ips[i] != NULL) {
2630                         can_host_ips = true;
2631                 }
2632         }
2633         if (!can_host_ips) {
2634                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2635                 return 0;
2636         }
2637
2638         /* since nodes only know about those public addresses that
2639            can be served by that particular node, no single node has
2640            a full list of all public addresses that exist in the cluster.
2641            Walk over all node structures and create a merged list of
2642            all public addresses that exist in the cluster.
2643
2644            keep the tree of ips around as ctdb->ip_tree
2645         */
2646         all_ips = create_merged_ip_list(ctdb);
2647
2648         /* Do the IP reassignment calculations */
2649         ctdb_takeover_run_core(ctdb, ipflags, all_ips, force_rebalance_nodes);
2650
2651         /* Now tell all nodes to release any public IPs should not
2652          * host.  This will be a NOOP on nodes that don't currently
2653          * hold the given IP.
2654          */
2655         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2656         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2657
2658         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2659                                                        bool, nodemap->num);
2660         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2661         takeover_data->fail_callback = fail_callback;
2662         takeover_data->fail_callback_data = callback_data;
2663         takeover_data->nodemap = nodemap;
2664
2665         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2666         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2667
2668         async_data->fail_callback = takeover_run_fail_callback;
2669         async_data->callback_data = takeover_data;
2670
2671         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2672
2673         /* Send a RELEASE_IP to all nodes that should not be hosting
2674          * each IP.  For each IP, all but one of these will be
2675          * redundant.  However, the redundant ones are used to tell
2676          * nodes which node should be hosting the IP so that commands
2677          * like "ctdb ip" can display a particular nodes idea of who
2678          * is hosting what. */
2679         for (i=0;i<nodemap->num;i++) {
2680                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2681                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2682                         continue;
2683                 }
2684
2685                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2686                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2687                                 /* This node should be serving this
2688                                    vnn so don't tell it to release the ip
2689                                 */
2690                                 continue;
2691                         }
2692                         ip.pnn  = tmp_ip->pnn;
2693                         ip.addr = tmp_ip->addr;
2694
2695                         timeout = TAKEOVER_TIMEOUT();
2696                         data.dsize = sizeof(ip);
2697                         data.dptr  = (uint8_t *)&ip;
2698                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2699                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2700                                                   data, async_data,
2701                                                   &timeout, NULL);
2702                         if (state == NULL) {
2703                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2704                                 talloc_free(tmp_ctx);
2705                                 return -1;
2706                         }
2707
2708                         ctdb_client_async_add(async_data, state);
2709                 }
2710         }
2711         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2712                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2713                 talloc_free(tmp_ctx);
2714                 return -1;
2715         }
2716         talloc_free(async_data);
2717
2718
2719         /* For each IP, send a TAKOVER_IP to the node that should be
2720          * hosting it.  Many of these will often be redundant (since
2721          * the allocation won't have changed) but they can be useful
2722          * to recover from inconsistencies. */
2723         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2724         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2725
2726         async_data->fail_callback = fail_callback;
2727         async_data->callback_data = callback_data;
2728
2729         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2730                 if (tmp_ip->pnn == -1) {
2731                         /* this IP won't be taken over */
2732                         continue;
2733                 }
2734
2735                 ip.pnn  = tmp_ip->pnn;
2736                 ip.addr = tmp_ip->addr;
2737
2738                 timeout = TAKEOVER_TIMEOUT();
2739                 data.dsize = sizeof(ip);
2740                 data.dptr  = (uint8_t *)&ip;
2741                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2742                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2743                                           data, async_data, &timeout, NULL);
2744                 if (state == NULL) {
2745                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2746                         talloc_free(tmp_ctx);
2747                         return -1;
2748                 }
2749
2750                 ctdb_client_async_add(async_data, state);
2751         }
2752         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2753                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2754                 talloc_free(tmp_ctx);
2755                 return -1;
2756         }
2757
2758 ipreallocated:
2759         /*
2760          * Tell all nodes to run eventscripts to process the
2761          * "ipreallocated" event.  This can do a lot of things,
2762          * including restarting services to reconfigure them if public
2763          * IPs have moved.  Once upon a time this event only used to
2764          * update natgw.
2765          */
2766         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2767         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2768         iprealloc_data.retry_nodes = retry_data;
2769         iprealloc_data.retry_count = 0;
2770         iprealloc_data.fail_callback = fail_callback;
2771         iprealloc_data.fail_callback_data = callback_data;
2772         iprealloc_data.nodemap = nodemap;
2773
2774         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2775         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2776                                         nodes, 0, TAKEOVER_TIMEOUT(),
2777                                         false, tdb_null,
2778                                         NULL, iprealloc_fail_callback,
2779                                         &iprealloc_data);
2780         if (ret != 0) {
2781                 /* If the control failed then we should retry to any
2782                  * nodes flagged by iprealloc_fail_callback using the
2783                  * EVENTSCRIPT control.  This is a best-effort at
2784                  * backward compatiblity when running a mixed cluster
2785                  * where some nodes have not yet been upgraded to
2786                  * support the IPREALLOCATED control.
2787                  */
2788                 DEBUG(DEBUG_WARNING,
2789                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2790
2791                 nodes = talloc_array(tmp_ctx, uint32_t,
2792                                      iprealloc_data.retry_count);
2793                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2794
2795                 j = 0;
2796                 for (i=0; i<nodemap->num; i++) {
2797                         if (iprealloc_data.retry_nodes[i]) {
2798                                 nodes[j] = i;
2799                                 j++;
2800                         }
2801                 }
2802
2803                 data.dptr  = discard_const("ipreallocated");
2804                 data.dsize = strlen((char *)data.dptr) + 1; 
2805                 ret = ctdb_client_async_control(ctdb,
2806                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2807                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2808                                                 false, data,
2809                                                 NULL, fail_callback,
2810                                                 callback_data);
2811                 if (ret != 0) {
2812                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2813                 }
2814         }
2815
2816         talloc_free(tmp_ctx);
2817         return ret;
2818 }
2819
2820
2821 /*
2822   destroy a ctdb_client_ip structure
2823  */
2824 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2825 {
2826         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2827                 ctdb_addr_to_str(&ip->addr),
2828                 ntohs(ip->addr.ip.sin_port),
2829                 ip->client_id));
2830
2831         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2832         return 0;
2833 }
2834
2835 /*
2836   called by a client to inform us of a TCP connection that it is managing
2837   that should tickled with an ACK when IP takeover is done
2838  */
2839 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2840                                 TDB_DATA indata)
2841 {
2842         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2843         struct ctdb_connection *tcp_sock = NULL;
2844         struct ctdb_tcp_list *tcp;
2845         struct ctdb_connection t;
2846         int ret;
2847         TDB_DATA data;
2848         struct ctdb_client_ip *ip;
2849         struct ctdb_vnn *vnn;
2850         ctdb_sock_addr addr;
2851
2852         /* If we don't have public IPs, tickles are useless */
2853         if (ctdb->vnn == NULL) {
2854                 return 0;
2855         }
2856
2857         tcp_sock = (struct ctdb_connection *)indata.dptr;
2858
2859         addr = tcp_sock->src;
2860         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2861         addr = tcp_sock->dst;
2862         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2863
2864         ZERO_STRUCT(addr);
2865         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2866         vnn = find_public_ip_vnn(ctdb, &addr);
2867         if (vnn == NULL) {
2868                 switch (addr.sa.sa_family) {
2869                 case AF_INET:
2870                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2871                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2872                                         ctdb_addr_to_str(&addr)));
2873                         }
2874                         break;
2875                 case AF_INET6:
2876                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2877                                 ctdb_addr_to_str(&addr)));
2878                         break;
2879                 default:
2880                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2881                 }
2882
2883                 return 0;
2884         }
2885
2886         if (vnn->pnn != ctdb->pnn) {
2887                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2888                         ctdb_addr_to_str(&addr),
2889                         client_id, client->pid));
2890                 /* failing this call will tell smbd to die */
2891                 return -1;
2892         }
2893
2894         ip = talloc(client, struct ctdb_client_ip);
2895         CTDB_NO_MEMORY(ctdb, ip);
2896
2897         ip->ctdb      = ctdb;
2898         ip->addr      = addr;
2899         ip->client_id = client_id;
2900         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2901         DLIST_ADD(ctdb->client_ip_list, ip);
2902
2903         tcp = talloc(client, struct ctdb_tcp_list);
2904         CTDB_NO_MEMORY(ctdb, tcp);
2905
2906         tcp->connection.src = tcp_sock->src;
2907         tcp->connection.dst = tcp_sock->dst;
2908
2909         DLIST_ADD(client->tcp_list, tcp);
2910
2911         t.src = tcp_sock->src;
2912         t.dst = tcp_sock->dst;
2913
2914         data.dptr = (uint8_t *)&t;
2915         data.dsize = sizeof(t);
2916
2917         switch (addr.sa.sa_family) {
2918         case AF_INET:
2919                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2920                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2921                         ctdb_addr_to_str(&tcp_sock->src),
2922                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2923                 break;
2924         case AF_INET6:
2925                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2926                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2927                         ctdb_addr_to_str(&tcp_sock->src),
2928                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2929                 break;
2930         default:
2931                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2932         }
2933
2934
2935         /* tell all nodes about this tcp connection */
2936         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2937                                        CTDB_CONTROL_TCP_ADD,
2938                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2939         if (ret != 0) {
2940                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2941                 return -1;
2942         }
2943
2944         return 0;
2945 }
2946
2947 /*
2948   find a tcp address on a list
2949  */
2950 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2951                                            struct ctdb_connection *tcp)
2952 {
2953         int i;
2954
2955         if (array == NULL) {
2956                 return NULL;
2957         }
2958
2959         for (i=0;i<array->num;i++) {
2960                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2961                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2962                         return &array->connections[i];
2963                 }
2964         }
2965         return NULL;
2966 }
2967
2968
2969
2970 /*
2971   called by a daemon to inform us of a TCP connection that one of its
2972   clients managing that should tickled with an ACK when IP takeover is
2973   done
2974  */
2975 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2976 {
2977         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2978         struct ctdb_tcp_array *tcparray;
2979         struct ctdb_connection tcp;
2980         struct ctdb_vnn *vnn;
2981
2982         /* If we don't have public IPs, tickles are useless */
2983         if (ctdb->vnn == NULL) {
2984                 return 0;
2985         }
2986
2987         vnn = find_public_ip_vnn(ctdb, &p->dst);
2988         if (vnn == NULL) {
2989                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2990                         ctdb_addr_to_str(&p->dst)));
2991
2992                 return -1;
2993         }
2994
2995
2996         tcparray = vnn->tcp_array;
2997
2998         /* If this is the first tickle */
2999         if (tcparray == NULL) {
3000                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3001                 CTDB_NO_MEMORY(ctdb, tcparray);
3002                 vnn->tcp_array = tcparray;
3003
3004                 tcparray->num = 0;
3005                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3006                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3007
3008                 tcparray->connections[tcparray->num].src = p->src;
3009                 tcparray->connections[tcparray->num].dst = p->dst;
3010                 tcparray->num++;
3011
3012                 if (tcp_update_needed) {
3013                         vnn->tcp_update_needed = true;
3014                 }
3015                 return 0;
3016         }
3017
3018
3019         /* Do we already have this tickle ?*/
3020         tcp.src = p->src;
3021         tcp.dst = p->dst;
3022         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3023                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3024                         ctdb_addr_to_str(&tcp.dst),
3025                         ntohs(tcp.dst.ip.sin_port),
3026                         vnn->pnn));
3027                 return 0;
3028         }
3029
3030         /* A new tickle, we must add it to the array */
3031         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3032                                         struct ctdb_connection,
3033                                         tcparray->num+1);
3034         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3035
3036         tcparray->connections[tcparray->num].src = p->src;
3037         tcparray->connections[tcparray->num].dst = p->dst;
3038         tcparray->num++;
3039
3040         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3041                 ctdb_addr_to_str(&tcp.dst),
3042                 ntohs(tcp.dst.ip.sin_port),
3043                 vnn->pnn));
3044
3045         if (tcp_update_needed) {
3046                 vnn->tcp_update_needed = true;
3047         }
3048
3049         return 0;
3050 }
3051
3052
3053 /*
3054   called by a daemon to inform us of a TCP connection that one of its
3055   clients managing that should tickled with an ACK when IP takeover is
3056   done
3057  */
3058 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3059 {
3060         struct ctdb_connection *tcpp;
3061         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3062
3063         if (vnn == NULL) {
3064                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3065                         ctdb_addr_to_str(&conn->dst)));
3066                 return;
3067         }
3068
3069         /* if the array is empty we cant remove it
3070            and we don't need to do anything
3071          */
3072         if (vnn->tcp_array == NULL) {
3073                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3074                         ctdb_addr_to_str(&conn->dst),
3075                         ntohs(conn->dst.ip.sin_port)));
3076                 return;
3077         }
3078
3079
3080         /* See if we know this connection
3081            if we don't know this connection  then we dont need to do anything
3082          */
3083         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3084         if (tcpp == NULL) {
3085                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3086                         ctdb_addr_to_str(&conn->dst),
3087                         ntohs(conn->dst.ip.sin_port)));
3088                 return;
3089         }
3090
3091
3092         /* We need to remove this entry from the array.
3093            Instead of allocating a new array and copying data to it
3094            we cheat and just copy the last entry in the existing array
3095            to the entry that is to be removed and just shring the 
3096            ->num field
3097          */
3098         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3099         vnn->tcp_array->num--;
3100
3101         /* If we deleted the last entry we also need to remove the entire array
3102          */
3103         if (vnn->tcp_array->num == 0) {
3104                 talloc_free(vnn->tcp_array);
3105                 vnn->tcp_array = NULL;
3106         }               
3107
3108         vnn->tcp_update_needed = true;
3109
3110         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3111                 ctdb_addr_to_str(&conn->src),
3112                 ntohs(conn->src.ip.sin_port)));
3113 }
3114
3115
3116 /*
3117   called by a daemon to inform us of a TCP connection that one of its
3118   clients used are no longer needed in the tickle database
3119  */
3120 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3121 {
3122         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3123
3124         /* If we don't have public IPs, tickles are useless */
3125         if (ctdb->vnn == NULL) {
3126                 return 0;
3127         }
3128
3129         ctdb_remove_connection(ctdb, conn);
3130
3131         return 0;
3132 }
3133
3134
3135 /*
3136   Called when another daemon starts - causes all tickles for all
3137   public addresses we are serving to be sent to the new node on the
3138   next check.  This actually causes the next scheduled call to
3139   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3140   doesn't require careful error handling.
3141  */
3142 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3143 {
3144         struct ctdb_vnn *vnn;
3145
3146         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3147                            (unsigned long) pnn));
3148
3149         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3150                 vnn->tcp_update_needed = true;
3151         }
3152
3153         return 0;
3154 }
3155
3156
3157 /*
3158   called when a client structure goes away - hook to remove
3159   elements from the tcp_list in all daemons
3160  */
3161 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3162 {
3163         while (client->tcp_list) {
3164                 struct ctdb_tcp_list *tcp = client->tcp_list;
3165                 DLIST_REMOVE(client->tcp_list, tcp);
3166                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3167         }
3168 }
3169
3170
3171 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3172 {
3173         struct ctdb_vnn *vnn;
3174         int count = 0;
3175
3176         if (ctdb->tunable.disable_ip_failover == 1) {
3177                 return;
3178         }
3179
3180         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3181                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3182                         ctdb_vnn_unassign_iface(ctdb, vnn);
3183                         continue;
3184                 }
3185                 if (!vnn->iface) {
3186                         continue;
3187                 }
3188
3189                 /* Don't allow multiple releases at once.  Some code,
3190                  * particularly ctdb_tickle_sentenced_connections() is
3191                  * not re-entrant */
3192                 if (vnn->update_in_flight) {
3193                         DEBUG(DEBUG_WARNING,
3194                               (__location__
3195                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3196                                     ctdb_addr_to_str(&vnn->public_address),
3197                                     vnn->public_netmask_bits,
3198                                     ctdb_vnn_iface_string(vnn)));
3199                         continue;
3200                 }
3201                 vnn->update_in_flight = true;
3202
3203                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3204                                     ctdb_addr_to_str(&vnn->public_address),
3205                                     vnn->public_netmask_bits,
3206                                     ctdb_vnn_iface_string(vnn)));
3207
3208                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3209                                   ctdb_vnn_iface_string(vnn),
3210                                   ctdb_addr_to_str(&vnn->public_address),
3211                                   vnn->public_netmask_bits);
3212                 release_kill_clients(ctdb, &vnn->public_address);
3213                 ctdb_vnn_unassign_iface(ctdb, vnn);
3214                 vnn->update_in_flight = false;
3215                 count++;
3216         }
3217
3218         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3219 }
3220
3221
3222 /*
3223   get list of public IPs
3224  */
3225 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3226                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3227 {
3228         int i, num, len;
3229         struct ctdb_public_ip_list_old *ips;
3230         struct ctdb_vnn *vnn;
3231         bool only_available = false;
3232
3233         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3234                 only_available = true;
3235         }
3236
3237         /* count how many public ip structures we have */
3238         num = 0;
3239         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3240                 num++;
3241         }
3242
3243         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3244                 num*sizeof(struct ctdb_public_ip);
3245         ips = talloc_zero_size(outdata, len);
3246         CTDB_NO_MEMORY(ctdb, ips);
3247
3248         i = 0;
3249         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3250                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3251                         continue;
3252                 }
3253                 ips->ips[i].pnn  = vnn->pnn;
3254                 ips->ips[i].addr = vnn->public_address;
3255                 i++;
3256         }
3257         ips->num = i;
3258         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3259                 i*sizeof(struct ctdb_public_ip);
3260
3261         outdata->dsize = len;
3262         outdata->dptr  = (uint8_t *)ips;
3263
3264         return 0;
3265 }
3266
3267
3268 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3269                                         struct ctdb_req_control_old *c,
3270                                         TDB_DATA indata,
3271                                         TDB_DATA *outdata)
3272 {
3273         int i, num, len;
3274         ctdb_sock_addr *addr;
3275         struct ctdb_public_ip_info_old *info;
3276         struct ctdb_vnn *vnn;
3277
3278         addr = (ctdb_sock_addr *)indata.dptr;
3279
3280         vnn = find_public_ip_vnn(ctdb, addr);
3281         if (vnn == NULL) {
3282                 /* if it is not a public ip   it could be our 'single ip' */
3283                 if (ctdb->single_ip_vnn) {
3284                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3285                                 vnn = ctdb->single_ip_vnn;
3286                         }
3287                 }
3288         }
3289         if (vnn == NULL) {
3290                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3291                                  "'%s'not a public address\n",
3292                                  ctdb_addr_to_str(addr)));
3293                 return -1;
3294         }
3295
3296         /* count how many public ip structures we have */
3297         num = 0;
3298         for (;vnn->ifaces[num];) {
3299                 num++;
3300         }
3301
3302         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3303                 num*sizeof(struct ctdb_iface);
3304         info = talloc_zero_size(outdata, len);
3305         CTDB_NO_MEMORY(ctdb, info);
3306
3307         info->ip.addr = vnn->public_address;
3308         info->ip.pnn = vnn->pnn;
3309         info->active_idx = 0xFFFFFFFF;
3310
3311         for (i=0; vnn->ifaces[i]; i++) {
3312                 struct ctdb_interface *cur;
3313
3314                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3315                 if (cur == NULL) {
3316                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3317                                            vnn->ifaces[i]));
3318                         return -1;
3319                 }
3320                 if (vnn->iface == cur) {
3321                         info->active_idx = i;
3322                 }
3323                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3324                 info->ifaces[i].link_state = cur->link_up;
3325                 info->ifaces[i].references = cur->references;
3326         }
3327         info->num = i;
3328         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3329                 i*sizeof(struct ctdb_iface);
3330
3331         outdata->dsize = len;
3332         outdata->dptr  = (uint8_t *)info;
3333
3334         return 0;
3335 }
3336
3337 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3338                                 struct ctdb_req_control_old *c,
3339                                 TDB_DATA *outdata)
3340 {
3341         int i, num, len;
3342         struct ctdb_iface_list_old *ifaces;
3343         struct ctdb_interface *cur;
3344
3345         /* count how many public ip structures we have */
3346         num = 0;
3347         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3348                 num++;
3349         }
3350
3351         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3352                 num*sizeof(struct ctdb_iface);
3353         ifaces = talloc_zero_size(outdata, len);
3354         CTDB_NO_MEMORY(ctdb, ifaces);
3355
3356         i = 0;
3357         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3358                 strcpy(ifaces->ifaces[i].name, cur->name);
3359                 ifaces->ifaces[i].link_state = cur->link_up;
3360                 ifaces->ifaces[i].references = cur->references;
3361                 i++;
3362         }
3363         ifaces->num = i;
3364         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3365                 i*sizeof(struct ctdb_iface);
3366
3367         outdata->dsize = len;
3368         outdata->dptr  = (uint8_t *)ifaces;
3369
3370         return 0;
3371 }
3372
3373 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3374                                     struct ctdb_req_control_old *c,
3375                                     TDB_DATA indata)
3376 {
3377         struct ctdb_iface *info;
3378         struct ctdb_interface *iface;
3379         bool link_up = false;
3380
3381         info = (struct ctdb_iface *)indata.dptr;
3382
3383         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3384                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3385                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3386                                   len, len, info->name));
3387                 return -1;
3388         }
3389
3390         switch (info->link_state) {
3391         case 0:
3392                 link_up = false;
3393                 break;
3394         case 1:
3395                 link_up = true;
3396                 break;
3397         default:
3398                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3399                                   (unsigned int)info->link_state));
3400                 return -1;
3401         }
3402
3403         if (info->references != 0) {
3404                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3405                                   (unsigned int)info->references));
3406                 return -1;
3407         }
3408
3409         iface = ctdb_find_iface(ctdb, info->name);
3410         if (iface == NULL) {
3411                 return -1;
3412         }
3413
3414         if (link_up == iface->link_up) {
3415                 return 0;
3416         }
3417
3418         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3419               ("iface[%s] has changed it's link status %s => %s\n",
3420                iface->name,
3421                iface->link_up?"up":"down",
3422                link_up?"up":"down"));
3423
3424         iface->link_up = link_up;
3425         return 0;
3426 }
3427
3428
/* 
   Per-vnn state for killing TCP connections: holds the capture socket
   and the set of tcp connections that the ctdb daemon is to kill.
   Allocated under the vnn by ctdb_killtcp_add_connection().
*/
struct ctdb_kill_tcp {
        /* the public address this state is attached to */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(); -1 until opened */
        int capture_fd;
        /* read event on capture_fd, handled by capture_tcp_handler() */
        struct tevent_fd *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* set by ctdb_sys_open_capture_socket() and handed back to
           ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3441
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        /* canonicalized endpoints of the connection */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of periodic tickle retries so far; the connection is
           dropped from the tree once this reaches 5 */
        int count;
        /* owning kill state */
        struct ctdb_kill_tcp *killtcp;
};
3451
3452 /* this function is used to create a key to represent this socketpair
3453    in the killtcp tree.
3454    this key is used to insert and lookup matching socketpairs that are
3455    to be tickled and RST
3456 */
3457 #define KILLTCP_KEYLEN  10
3458 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3459 {
3460         static uint32_t key[KILLTCP_KEYLEN];
3461
3462         bzero(key, sizeof(key));
3463
3464         if (src->sa.sa_family != dst->sa.sa_family) {
3465                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3466                 return key;
3467         }
3468         
3469         switch (src->sa.sa_family) {
3470         case AF_INET:
3471                 key[0]  = dst->ip.sin_addr.s_addr;
3472                 key[1]  = src->ip.sin_addr.s_addr;
3473                 key[2]  = dst->ip.sin_port;
3474                 key[3]  = src->ip.sin_port;
3475                 break;
3476         case AF_INET6: {
3477                 uint32_t *dst6_addr32 =
3478                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3479                 uint32_t *src6_addr32 =
3480                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3481                 key[0]  = dst6_addr32[3];
3482                 key[1]  = src6_addr32[3];
3483                 key[2]  = dst6_addr32[2];
3484                 key[3]  = src6_addr32[2];
3485                 key[4]  = dst6_addr32[1];
3486                 key[5]  = src6_addr32[1];
3487                 key[6]  = dst6_addr32[0];
3488                 key[7]  = src6_addr32[0];
3489                 key[8]  = dst->ip6.sin6_port;
3490                 key[9]  = src->ip6.sin6_port;
3491                 break;
3492         }
3493         default:
3494                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3495                 return key;
3496         }
3497
3498         return key;
3499 }
3500
3501 /*
3502   called when we get a read event on the raw socket
3503  */
3504 static void capture_tcp_handler(struct tevent_context *ev,
3505                                 struct tevent_fd *fde,
3506                                 uint16_t flags, void *private_data)
3507 {
3508         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3509         struct ctdb_killtcp_con *con;
3510         ctdb_sock_addr src, dst;
3511         uint32_t ack_seq, seq;
3512
3513         if (!(flags & TEVENT_FD_READ)) {
3514                 return;
3515         }
3516
3517         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3518                                 killtcp->private_data,
3519                                 &src, &dst,
3520                                 &ack_seq, &seq) != 0) {
3521                 /* probably a non-tcp ACK packet */
3522                 return;
3523         }
3524
3525         /* check if we have this guy in our list of connections
3526            to kill
3527         */
3528         con = trbt_lookuparray32(killtcp->connections, 
3529                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3530         if (con == NULL) {
3531                 /* no this was some other packet we can just ignore */
3532                 return;
3533         }
3534
3535         /* This one has been tickled !
3536            now reset him and remove him from the list.
3537          */
3538         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3539                 ntohs(con->dst_addr.ip.sin_port),
3540                 ctdb_addr_to_str(&con->src_addr),
3541                 ntohs(con->src_addr.ip.sin_port)));
3542
3543         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3544         talloc_free(con);
3545 }
3546
3547
3548 /* when traversing the list of all tcp connections to send tickle acks to
3549    (so that we can capture the ack coming back and kill the connection
3550     by a RST)
3551    this callback is called for each connection we are currently trying to kill
3552 */
3553 static int tickle_connection_traverse(void *param, void *data)
3554 {
3555         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3556
3557         /* have tried too many times, just give up */
3558         if (con->count >= 5) {
3559                 /* can't delete in traverse: reparent to delete_cons */
3560                 talloc_steal(param, con);
3561                 return 0;
3562         }
3563
3564         /* othervise, try tickling it again */
3565         con->count++;
3566         ctdb_sys_send_tcp(
3567                 (ctdb_sock_addr *)&con->dst_addr,
3568                 (ctdb_sock_addr *)&con->src_addr,
3569                 0, 0, 0);
3570         return 0;
3571 }
3572
3573
3574 /* 
3575    called every second until all sentenced connections have been reset
3576  */
3577 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3578                                               struct tevent_timer *te,
3579                                               struct timeval t, void *private_data)
3580 {
3581         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3582         void *delete_cons = talloc_new(NULL);
3583
3584         /* loop over all connections sending tickle ACKs */
3585         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3586
3587         /* now we've finished traverse, it's safe to do deletion. */
3588         talloc_free(delete_cons);
3589
3590         /* If there are no more connections to kill we can remove the
3591            entire killtcp structure
3592          */
3593         if ( (killtcp->connections == NULL) || 
3594              (killtcp->connections->root == NULL) ) {
3595                 talloc_free(killtcp);
3596                 return;
3597         }
3598
3599         /* try tickling them again in a seconds time
3600          */
3601         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3602                          timeval_current_ofs(1, 0),
3603                          ctdb_tickle_sentenced_connections, killtcp);
3604 }
3605
3606 /*
3607   destroy the killtcp structure
3608  */
3609 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3610 {
3611         struct ctdb_vnn *tmpvnn;
3612
3613         /* verify that this vnn is still active */
3614         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3615                 if (tmpvnn == killtcp->vnn) {
3616                         break;
3617                 }
3618         }
3619
3620         if (tmpvnn == NULL) {
3621                 return 0;
3622         }
3623
3624         if (killtcp->vnn->killtcp != killtcp) {
3625                 return 0;
3626         }
3627
3628         killtcp->vnn->killtcp = NULL;
3629
3630         return 0;
3631 }
3632
3633
/*
   Insert callback for the killtcp tree: always replace whatever is
   stored with the new connection structure.

   An existing entry is deliberately not freed here; it is
   talloc_stolen by the same tree node and goes away when the new data
   is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
3645
3646 /*
3647   add a tcp socket to the list of connections we want to RST
3648  */
3649 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3650                                        ctdb_sock_addr *s,
3651                                        ctdb_sock_addr *d)
3652 {
3653         ctdb_sock_addr src, dst;
3654         struct ctdb_kill_tcp *killtcp;
3655         struct ctdb_killtcp_con *con;
3656         struct ctdb_vnn *vnn;
3657
3658         ctdb_canonicalize_ip(s, &src);
3659         ctdb_canonicalize_ip(d, &dst);
3660
3661         vnn = find_public_ip_vnn(ctdb, &dst);
3662         if (vnn == NULL) {
3663                 vnn = find_public_ip_vnn(ctdb, &src);
3664         }
3665         if (vnn == NULL) {
3666                 /* if it is not a public ip   it could be our 'single ip' */
3667                 if (ctdb->single_ip_vnn) {
3668                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3669                                 vnn = ctdb->single_ip_vnn;
3670                         }
3671                 }
3672         }
3673         if (vnn == NULL) {
3674                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3675                 return -1;
3676         }
3677
3678         killtcp = vnn->killtcp;
3679         
3680         /* If this is the first connection to kill we must allocate
3681            a new structure
3682          */
3683         if (killtcp == NULL) {
3684                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3685                 CTDB_NO_MEMORY(ctdb, killtcp);
3686
3687                 killtcp->vnn         = vnn;
3688                 killtcp->ctdb        = ctdb;
3689                 killtcp->capture_fd  = -1;
3690                 killtcp->connections = trbt_create(killtcp, 0);
3691
3692                 vnn->killtcp         = killtcp;
3693                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3694         }
3695
3696
3697
3698         /* create a structure that describes this connection we want to
3699            RST and store it in killtcp->connections
3700         */
3701         con = talloc(killtcp, struct ctdb_killtcp_con);
3702         CTDB_NO_MEMORY(ctdb, con);
3703         con->src_addr = src;
3704         con->dst_addr = dst;
3705         con->count    = 0;
3706         con->killtcp  = killtcp;
3707
3708
3709         trbt_insertarray32_callback(killtcp->connections,
3710                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3711                         add_killtcp_callback, con);
3712
3713         /* 
3714            If we don't have a socket to listen on yet we must create it
3715          */
3716         if (killtcp->capture_fd == -1) {
3717                 const char *iface = ctdb_vnn_iface_string(vnn);
3718                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3719                 if (killtcp->capture_fd == -1) {
3720                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3721                                           "socket on iface '%s' for killtcp (%s)\n",
3722                                           iface, strerror(errno)));
3723                         goto failed;
3724                 }
3725         }
3726
3727
3728         if (killtcp->fde == NULL) {
3729                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3730                                              killtcp->capture_fd,
3731                                              TEVENT_FD_READ,
3732                                              capture_tcp_handler, killtcp);
3733                 tevent_fd_set_auto_close(killtcp->fde);
3734
3735                 /* We also need to set up some events to tickle all these connections
3736                    until they are all reset
3737                 */
3738                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3739                                  ctdb_tickle_sentenced_connections, killtcp);
3740         }
3741
3742         /* tickle him once now */
3743         ctdb_sys_send_tcp(
3744                 &con->dst_addr,
3745                 &con->src_addr,
3746                 0, 0, 0);
3747
3748         return 0;
3749
3750 failed:
3751         talloc_free(vnn->killtcp);
3752         vnn->killtcp = NULL;
3753         return -1;
3754 }
3755
3756 /*
3757   kill a TCP connection.
3758  */
3759 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3760 {
3761         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3762
3763         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3764 }
3765
3766 /*
3767   called by a daemon to inform us of the entire list of TCP tickles for
3768   a particular public address.
3769   this control should only be sent by the node that is currently serving
3770   that public address.
3771  */
3772 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3773 {
3774         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3775         struct ctdb_tcp_array *tcparray;
3776         struct ctdb_vnn *vnn;
3777
3778         /* We must at least have tickles.num or else we cant verify the size
3779            of the received data blob
3780          */
3781         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3782                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3783                 return -1;
3784         }
3785
3786         /* verify that the size of data matches what we expect */
3787         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3788                          + sizeof(struct ctdb_connection) * list->num) {
3789                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3790                 return -1;
3791         }
3792
3793         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3794                            ctdb_addr_to_str(&list->addr)));
3795
3796         vnn = find_public_ip_vnn(ctdb, &list->addr);
3797         if (vnn == NULL) {
3798                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3799                         ctdb_addr_to_str(&list->addr)));
3800
3801                 return 1;
3802         }
3803
3804         /* remove any old ticklelist we might have */
3805         talloc_free(vnn->tcp_array);
3806         vnn->tcp_array = NULL;
3807
3808         tcparray = talloc(vnn, struct ctdb_tcp_array);
3809         CTDB_NO_MEMORY(ctdb, tcparray);
3810
3811         tcparray->num = list->num;
3812
3813         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3814         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3815
3816         memcpy(tcparray->connections, &list->connections[0],
3817                sizeof(struct ctdb_connection)*tcparray->num);
3818
3819         /* We now have a new fresh tickle list array for this vnn */
3820         vnn->tcp_array = tcparray;
3821
3822         return 0;
3823 }
3824
3825 /*
3826   called to return the full list of tickles for the puclic address associated 
3827   with the provided vnn
3828  */
3829 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3830 {
3831         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3832         struct ctdb_tickle_list_old *list;
3833         struct ctdb_tcp_array *tcparray;
3834         int num;
3835         struct ctdb_vnn *vnn;
3836
3837         vnn = find_public_ip_vnn(ctdb, addr);
3838         if (vnn == NULL) {
3839                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3840                         ctdb_addr_to_str(addr)));
3841
3842                 return 1;
3843         }
3844
3845         tcparray = vnn->tcp_array;
3846         if (tcparray) {
3847                 num = tcparray->num;
3848         } else {
3849                 num = 0;
3850         }
3851
3852         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3853                         + sizeof(struct ctdb_connection) * num;
3854
3855         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3856         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3857         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3858
3859         list->addr = *addr;
3860         list->num = num;
3861         if (num) {
3862                 memcpy(&list->connections[0], tcparray->connections,
3863                         sizeof(struct ctdb_connection) * num);
3864         }
3865
3866         return 0;
3867 }
3868
3869
3870 /*
3871   set the list of all tcp tickles for a public address
3872  */
3873 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3874                                             ctdb_sock_addr *addr,
3875                                             struct ctdb_tcp_array *tcparray)
3876 {
3877         int ret, num;
3878         TDB_DATA data;
3879         struct ctdb_tickle_list_old *list;
3880
3881         if (tcparray) {
3882                 num = tcparray->num;
3883         } else {
3884                 num = 0;
3885         }
3886
3887         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3888                         sizeof(struct ctdb_connection) * num;
3889         data.dptr = talloc_size(ctdb, data.dsize);
3890         CTDB_NO_MEMORY(ctdb, data.dptr);
3891
3892         list = (struct ctdb_tickle_list_old *)data.dptr;
3893         list->addr = *addr;
3894         list->num = num;
3895         if (tcparray) {
3896                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3897         }
3898
3899         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3900                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3901                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3902         if (ret != 0) {
3903                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3904                 return -1;
3905         }
3906
3907         talloc_free(data.dptr);
3908
3909         return ret;
3910 }
3911
3912
3913 /*
3914   perform tickle updates if required
3915  */
3916 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3917                                     struct tevent_timer *te,
3918                                     struct timeval t, void *private_data)
3919 {
3920         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3921         int ret;
3922         struct ctdb_vnn *vnn;
3923
3924         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3925                 /* we only send out updates for public addresses that 
3926                    we have taken over
3927                  */
3928                 if (ctdb->pnn != vnn->pnn) {
3929                         continue;
3930                 }
3931                 /* We only send out the updates if we need to */
3932                 if (!vnn->tcp_update_needed) {
3933                         continue;
3934                 }
3935                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3936                                                        &vnn->public_address,
3937                                                        vnn->tcp_array);
3938                 if (ret != 0) {
3939                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3940                                 ctdb_addr_to_str(&vnn->public_address)));
3941                 } else {
3942                         DEBUG(DEBUG_INFO,
3943                               ("Sent tickle update for public address %s\n",
3944                                ctdb_addr_to_str(&vnn->public_address)));
3945                         vnn->tcp_update_needed = false;
3946                 }
3947         }
3948
3949         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3950                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3951                          ctdb_update_tcp_tickles, ctdb);
3952 }
3953
3954 /*
3955   start periodic update of tcp tickles
3956  */
3957 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3958 {
3959         ctdb->tickle_update_context = talloc_new(ctdb);
3960
3961         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3962                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3963                          ctdb_update_tcp_tickles, ctdb);
3964 }
3965
3966
3967
3968
3969 struct control_gratious_arp {
3970         struct ctdb_context *ctdb;
3971         ctdb_sock_addr addr;
3972         const char *iface;
3973         int count;
3974 };
3975
3976 /*
3977   send a control_gratuitous arp
3978  */
3979 static void send_gratious_arp(struct tevent_context *ev,
3980                               struct tevent_timer *te,
3981                               struct timeval t, void *private_data)
3982 {
3983         int ret;
3984         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3985                                                         struct control_gratious_arp);
3986
3987         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3988         if (ret != 0) {
3989                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3990                                  arp->iface, strerror(errno)));
3991         }
3992
3993
3994         arp->count++;
3995         if (arp->count == CTDB_ARP_REPEAT) {
3996                 talloc_free(arp);
3997                 return;
3998         }
3999
4000         tevent_add_timer(arp->ctdb->ev, arp,
4001                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4002                          send_gratious_arp, arp);
4003 }
4004
4005
4006 /*
4007   send a gratious arp 
4008  */
4009 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4010 {
4011         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4012         struct control_gratious_arp *arp;
4013
4014         /* verify the size of indata */
4015         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4016                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4017                                  (unsigned)indata.dsize, 
4018                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4019                 return -1;
4020         }
4021         if (indata.dsize != 
4022                 ( offsetof(struct ctdb_addr_info_old, iface)
4023                 + gratious_arp->len ) ){
4024
4025                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4026                         "but should be %u bytes\n", 
4027                          (unsigned)indata.dsize, 
4028                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4029                 return -1;
4030         }
4031
4032
4033         arp = talloc(ctdb, struct control_gratious_arp);
4034         CTDB_NO_MEMORY(ctdb, arp);
4035
4036         arp->ctdb  = ctdb;
4037         arp->addr   = gratious_arp->addr;
4038         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4039         CTDB_NO_MEMORY(ctdb, arp->iface);
4040         arp->count = 0;
4041
4042         tevent_add_timer(arp->ctdb->ev, arp,
4043                          timeval_zero(), send_gratious_arp, arp);
4044
4045         return 0;
4046 }
4047
4048 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4049 {
4050         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4051         int ret;
4052
4053         /* verify the size of indata */
4054         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4055                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4056                 return -1;
4057         }
4058         if (indata.dsize != 
4059                 ( offsetof(struct ctdb_addr_info_old, iface)
4060                 + pub->len ) ){
4061
4062                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4063                         "but should be %u bytes\n", 
4064                          (unsigned)indata.dsize, 
4065                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4066                 return -1;
4067         }
4068
4069         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4070
4071         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4072
4073         if (ret != 0) {
4074                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4075                 return -1;
4076         }
4077
4078         return 0;
4079 }
4080
/* state carried from ctdb_control_del_public_address() to its callback */
struct delete_ip_callback_state {
	/* the original control request, answered by delete_ip_callback() */
	struct ctdb_req_control_old *c;
};
4084
4085 /*
4086   called when releaseip event finishes for del_public_address
4087  */
4088 static void delete_ip_callback(struct ctdb_context *ctdb,
4089                                int32_t status, TDB_DATA data,
4090                                const char *errormsg,
4091                                void *private_data)
4092 {
4093         struct delete_ip_callback_state *state =
4094                 talloc_get_type(private_data, struct delete_ip_callback_state);
4095
4096         /* If release failed then fail. */
4097         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4098         talloc_free(private_data);
4099 }
4100
4101 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4102                                         struct ctdb_req_control_old *c,
4103                                         TDB_DATA indata, bool *async_reply)
4104 {
4105         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4106         struct ctdb_vnn *vnn;
4107
4108         /* verify the size of indata */
4109         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4110                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4111                 return -1;
4112         }
4113         if (indata.dsize != 
4114                 ( offsetof(struct ctdb_addr_info_old, iface)
4115                 + pub->len ) ){
4116
4117                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4118                         "but should be %u bytes\n", 
4119                          (unsigned)indata.dsize, 
4120                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4121                 return -1;
4122         }
4123
4124         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4125
4126         /* walk over all public addresses until we find a match */
4127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4128                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4129                         if (vnn->pnn == ctdb->pnn) {
4130                                 struct delete_ip_callback_state *state;
4131                                 struct ctdb_public_ip *ip;
4132                                 TDB_DATA data;
4133                                 int ret;
4134
4135                                 vnn->delete_pending = true;
4136
4137                                 state = talloc(ctdb,
4138                                                struct delete_ip_callback_state);
4139                                 CTDB_NO_MEMORY(ctdb, state);
4140                                 state->c = c;
4141
4142                                 ip = talloc(state, struct ctdb_public_ip);
4143                                 if (ip == NULL) {
4144                                         DEBUG(DEBUG_ERR,
4145                                               (__location__ " Out of memory\n"));
4146                                         talloc_free(state);
4147                                         return -1;
4148                                 }
4149                                 ip->pnn = -1;
4150                                 ip->addr = pub->addr;
4151
4152                                 data.dsize = sizeof(struct ctdb_public_ip);
4153                                 data.dptr = (unsigned char *)ip;
4154
4155                                 ret = ctdb_daemon_send_control(ctdb,
4156                                                                ctdb_get_pnn(ctdb),
4157                                                                0,
4158                                                                CTDB_CONTROL_RELEASE_IP,
4159                                                                0, 0,
4160                                                                data,
4161                                                                delete_ip_callback,
4162                                                                state);
4163                                 if (ret == -1) {
4164                                         DEBUG(DEBUG_ERR,
4165                                               (__location__ "Unable to send "
4166                                                "CTDB_CONTROL_RELEASE_IP\n"));
4167                                         talloc_free(state);
4168                                         return -1;
4169                                 }
4170
4171                                 state->c = talloc_steal(state, c);
4172                                 *async_reply = true;
4173                         } else {
4174                                 /* This IP is not hosted on the
4175                                  * current node so just delete it
4176                                  * now. */
4177                                 do_delete_ip(ctdb, vnn);
4178                         }
4179
4180                         return 0;
4181                 }
4182         }
4183
4184         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4185                          ctdb_addr_to_str(&pub->addr)));
4186         return -1;
4187 }
4188
4189
/* state carried from ctdb_control_ipreallocated() to its callback */
struct ipreallocated_callback_state {
	/* the original control request, answered when the event finishes */
	struct ctdb_req_control_old *c;
};
4193
4194 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4195                                         int status, void *p)
4196 {
4197         struct ipreallocated_callback_state *state =
4198                 talloc_get_type(p, struct ipreallocated_callback_state);
4199
4200         if (status != 0) {
4201                 DEBUG(DEBUG_ERR,
4202                       (" \"ipreallocated\" event script failed (status %d)\n",
4203                        status));
4204                 if (status == -ETIME) {
4205                         ctdb_ban_self(ctdb);
4206                 }
4207         }
4208
4209         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4210         talloc_free(state);
4211 }
4212
4213 /* A control to run the ipreallocated event */
4214 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4215                                    struct ctdb_req_control_old *c,
4216                                    bool *async_reply)
4217 {
4218         int ret;
4219         struct ipreallocated_callback_state *state;
4220
4221         state = talloc(ctdb, struct ipreallocated_callback_state);
4222         CTDB_NO_MEMORY(ctdb, state);
4223
4224         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4225
4226         ret = ctdb_event_script_callback(ctdb, state,
4227                                          ctdb_ipreallocated_callback, state,
4228                                          CTDB_EVENT_IPREALLOCATED,
4229                                          "%s", "");
4230
4231         if (ret != 0) {
4232                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4233                 talloc_free(state);
4234                 return -1;
4235         }
4236
4237         /* tell the control that we will be reply asynchronously */
4238         state->c    = talloc_steal(state, c);
4239         *async_reply = true;
4240
4241         return 0;
4242 }
4243
4244
4245 /* This function is called from the recovery daemon to verify that a remote
4246    node has the expected ip allocation.
4247    This is verified against ctdb->ip_tree
4248 */
4249 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4250                                        struct ctdb_public_ip_list_old *ips,
4251                                        uint32_t pnn)
4252 {
4253         struct public_ip_list *tmp_ip;
4254         int i;
4255
4256         if (ctdb->ip_tree == NULL) {
4257                 /* don't know the expected allocation yet, assume remote node
4258                    is correct. */
4259                 return 0;
4260         }
4261
4262         if (ips == NULL) {
4263                 return 0;
4264         }
4265
4266         for (i=0; i<ips->num; i++) {
4267                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4268                 if (tmp_ip == NULL) {
4269                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4270                         return -1;
4271                 }
4272
4273                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4274                         continue;
4275                 }
4276
4277                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4278                         DEBUG(DEBUG_ERR,
4279                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4280                                pnn,
4281                                ctdb_addr_to_str(&ips->ips[i].addr),
4282                                ips->ips[i].pnn, tmp_ip->pnn));
4283                         return -1;
4284                 }
4285         }
4286
4287         return 0;
4288 }
4289
4290 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4291 {
4292         struct public_ip_list *tmp_ip;
4293
4294         /* IP tree is never built if DisableIPFailover is set */
4295         if (ctdb->tunable.disable_ip_failover != 0) {
4296                 return 0;
4297         }
4298
4299         if (ctdb->ip_tree == NULL) {
4300                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4301                 return -1;
4302         }
4303
4304         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4305         if (tmp_ip == NULL) {
4306                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4307                 return -1;
4308         }
4309
4310         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4311         tmp_ip->pnn = ip->pnn;
4312
4313         return 0;
4314 }
4315
4316 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4317 {
4318         TALLOC_FREE(ctdb->ip_tree);
4319 }
4320
/* state for one "reload public ips" run performed by a child process */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* control request to answer when the handle is destroyed */
	struct ctdb_req_control_old *c;
	/* status sent in that answer; -1 until the child has reported */
	int status;
	/* pipe: the child writes its result to fd[1], the parent reads
	   it from fd[0] */
	int fd[2];
	/* pid of the child process */
	pid_t child;
	/* read event on fd[0] */
	struct tevent_fd *fde;
};
4329
4330 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4331 {
4332         if (h == h->ctdb->reload_ips) {
4333                 h->ctdb->reload_ips = NULL;
4334         }
4335         if (h->c != NULL) {
4336                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4337                 h->c = NULL;
4338         }
4339         ctdb_kill(h->ctdb, h->child, SIGKILL);
4340         return 0;
4341 }
4342
4343 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4344                                          struct tevent_timer *te,
4345                                          struct timeval t, void *private_data)
4346 {
4347         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4348
4349         talloc_free(h);
4350 }
4351
4352 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4353                                          struct tevent_fd *fde,
4354                                          uint16_t flags, void *private_data)
4355 {
4356         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4357
4358         char res;
4359         int ret;
4360
4361         ret = sys_read(h->fd[0], &res, 1);
4362         if (ret < 1 || res != 0) {
4363                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4364                 res = 1;
4365         }
4366         h->status = res;
4367
4368         talloc_free(h);
4369 }
4370
4371 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4372 {
4373         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4374         struct ctdb_public_ip_list_old *ips;
4375         struct ctdb_vnn *vnn;
4376         struct client_async_data *async_data;
4377         struct timeval timeout;
4378         TDB_DATA data;
4379         struct ctdb_client_control_state *state;
4380         bool first_add;
4381         int i, ret;
4382
4383         CTDB_NO_MEMORY(ctdb, mem_ctx);
4384
4385         /* Read IPs from local node */
4386         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4387                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4388         if (ret != 0) {
4389                 DEBUG(DEBUG_ERR,
4390                       ("Unable to fetch public IPs from local node\n"));
4391                 talloc_free(mem_ctx);
4392                 return -1;
4393         }
4394
4395         /* Read IPs file - this is safe since this is a child process */
4396         ctdb->vnn = NULL;
4397         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4398                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4399                 talloc_free(mem_ctx);
4400                 return -1;
4401         }
4402
4403         async_data = talloc_zero(mem_ctx, struct client_async_data);
4404         CTDB_NO_MEMORY(ctdb, async_data);
4405
4406         /* Compare IPs between node and file for IPs to be deleted */
4407         for (i = 0; i < ips->num; i++) {
4408                 /* */
4409                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4410                         if (ctdb_same_ip(&vnn->public_address,
4411                                          &ips->ips[i].addr)) {
4412                                 /* IP is still in file */
4413                                 break;
4414                         }
4415                 }
4416
4417                 if (vnn == NULL) {
4418                         /* Delete IP ips->ips[i] */
4419                         struct ctdb_addr_info_old *pub;
4420
4421                         DEBUG(DEBUG_NOTICE,
4422                               ("IP %s no longer configured, deleting it\n",
4423                                ctdb_addr_to_str(&ips->ips[i].addr)));
4424
4425                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4426                         CTDB_NO_MEMORY(ctdb, pub);
4427
4428                         pub->addr  = ips->ips[i].addr;
4429                         pub->mask  = 0;
4430                         pub->len   = 0;
4431
4432                         timeout = TAKEOVER_TIMEOUT();
4433
4434                         data.dsize = offsetof(struct ctdb_addr_info_old,
4435                                               iface) + pub->len;
4436                         data.dptr = (uint8_t *)pub;
4437
4438                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4439                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4440                                                   0, data, async_data,
4441                                                   &timeout, NULL);
4442                         if (state == NULL) {
4443                                 DEBUG(DEBUG_ERR,
4444                                       (__location__
4445                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4446                                 goto failed;
4447                         }
4448
4449                         ctdb_client_async_add(async_data, state);
4450                 }
4451         }
4452
4453         /* Compare IPs between node and file for IPs to be added */
4454         first_add = true;
4455         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4456                 for (i = 0; i < ips->num; i++) {
4457                         if (ctdb_same_ip(&vnn->public_address,
4458                                          &ips->ips[i].addr)) {
4459                                 /* IP already on node */
4460                                 break;
4461                         }
4462                 }
4463                 if (i == ips->num) {
4464                         /* Add IP ips->ips[i] */
4465                         struct ctdb_addr_info_old *pub;
4466                         const char *ifaces = NULL;
4467                         uint32_t len;
4468                         int iface = 0;
4469
4470                         DEBUG(DEBUG_NOTICE,
4471                               ("New IP %s configured, adding it\n",
4472                                ctdb_addr_to_str(&vnn->public_address)));
4473                         if (first_add) {
4474                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4475
4476                                 data.dsize = sizeof(pnn);
4477                                 data.dptr  = (uint8_t *)&pnn;
4478
4479                                 ret = ctdb_client_send_message(
4480                                         ctdb,
4481                                         CTDB_BROADCAST_CONNECTED,
4482                                         CTDB_SRVID_REBALANCE_NODE,
4483                                         data);
4484                                 if (ret != 0) {
4485                                         DEBUG(DEBUG_WARNING,
4486                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4487                                 }
4488
4489                                 first_add = false;
4490                         }
4491
4492                         ifaces = vnn->ifaces[0];
4493                         iface = 1;
4494                         while (vnn->ifaces[iface] != NULL) {
4495                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4496                                                          vnn->ifaces[iface]);
4497                                 iface++;
4498                         }
4499
4500                         len   = strlen(ifaces) + 1;
4501                         pub = talloc_zero_size(mem_ctx,
4502                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4503                         CTDB_NO_MEMORY(ctdb, pub);
4504
4505                         pub->addr  = vnn->public_address;
4506                         pub->mask  = vnn->public_netmask_bits;
4507                         pub->len   = len;
4508                         memcpy(&pub->iface[0], ifaces, pub->len);
4509
4510                         timeout = TAKEOVER_TIMEOUT();
4511
4512                         data.dsize = offsetof(struct ctdb_addr_info_old,
4513                                               iface) + pub->len;
4514                         data.dptr = (uint8_t *)pub;
4515
4516                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4517                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4518                                                   0, data, async_data,
4519                                                   &timeout, NULL);
4520                         if (state == NULL) {
4521                                 DEBUG(DEBUG_ERR,
4522                                       (__location__
4523                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4524                                 goto failed;
4525                         }
4526
4527                         ctdb_client_async_add(async_data, state);
4528                 }
4529         }
4530
4531         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4532                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4533                 goto failed;
4534         }
4535
4536         talloc_free(mem_ctx);
4537         return 0;
4538
4539 failed:
4540         talloc_free(mem_ctx);
4541         return -1;
4542 }
4543
4544 /* This control is sent to force the node to re-read the public addresses file
4545    and drop any addresses we should nnot longer host, and add new addresses
4546    that we are now able to host
4547 */
4548 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4549 {
4550         struct ctdb_reloadips_handle *h;
4551         pid_t parent = getpid();
4552
4553         if (ctdb->reload_ips != NULL) {
4554                 talloc_free(ctdb->reload_ips);
4555                 ctdb->reload_ips = NULL;
4556         }
4557
4558         h = talloc(ctdb, struct ctdb_reloadips_handle);
4559         CTDB_NO_MEMORY(ctdb, h);
4560         h->ctdb     = ctdb;
4561         h->c        = NULL;
4562         h->status   = -1;
4563         
4564         if (pipe(h->fd) == -1) {
4565                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4566                 talloc_free(h);
4567                 return -1;
4568         }
4569
4570         h->child = ctdb_fork(ctdb);
4571         if (h->child == (pid_t)-1) {
4572                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4573                 close(h->fd[0]);
4574                 close(h->fd[1]);
4575                 talloc_free(h);
4576                 return -1;
4577         }
4578
4579         /* child process */
4580         if (h->child == 0) {
4581                 signed char res = 0;
4582
4583                 close(h->fd[0]);
4584                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4585
4586                 prctl_set_comment("ctdb_reloadips");
4587                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4588                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4589                         res = -1;
4590                 } else {
4591                         res = ctdb_reloadips_child(ctdb);
4592                         if (res != 0) {
4593                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4594                         }
4595                 }
4596
4597                 sys_write(h->fd[1], &res, 1);
4598                 /* make sure we die when our parent dies */
4599                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4600                         sleep(5);
4601                 }
4602                 _exit(0);
4603         }
4604
4605         h->c             = talloc_steal(h, c);
4606
4607         close(h->fd[1]);
4608         set_close_on_exec(h->fd[0]);
4609
4610         talloc_set_destructor(h, ctdb_reloadips_destructor);
4611
4612
4613         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4614                                ctdb_reloadips_child_handler, (void *)h);
4615         tevent_fd_set_auto_close(h->fde);
4616
4617         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4618                          ctdb_reloadips_timeout_event, h);
4619
4620         /* we reply later */
4621         *async_reply = true;
4622         return 0;
4623 }