055aac3ee3ef0dab8b7c641652473a226e6c58cd
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/* Timeout value built from the takeover_timeout tunable (passed as the
 * first argument of timeval_current_ofs(), presumably seconds - TODO
 * confirm).  Requires a variable named "ctdb" in scope where used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* A gratuitous ARP announcement is repeated CTDB_ARP_REPEAT times;
 * CTDB_ARP_INTERVAL is the first argument of the re-arm delay. */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
49
/* Flags used in IP allocation algorithms.
 * NOTE(review): the names suggest "must not take over IPs" and
 * "must not host IPs" respectively - confirm against the users of
 * this struct, which are not in this part of the file. */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
};
55
/* Selects which IP allocation algorithm is used (stored in
 * struct ipalloc_state.algorithm). */
enum ipalloc_algorithm {
        IPALLOC_DETERMINISTIC,
        IPALLOC_NONDETERMINISTIC,
        IPALLOC_LCP2,
};
61
/* State handed to the IP allocation code. */
struct ipalloc_state {
        /* presumably the number of nodes, i.e. the length of the
         * per-node arrays below - TODO confirm */
        uint32_t num;

        /* Arrays with data for each node */
        struct ctdb_public_ip_list_old **known_public_ips;
        struct ctdb_public_ip_list_old **available_public_ips;

        /* Which allocation algorithm to run */
        enum ipalloc_algorithm algorithm;
        uint32_t no_ip_failback;
};
72
/* A local network interface that can carry public addresses.
 * Entries are kept on the doubly linked list ctdb->ifaces. */
struct ctdb_interface {
        struct ctdb_interface *prev, *next;
        /* Interface name; talloc child of this struct */
        const char *name;
        /* Only interfaces with link_up set are picked for addresses */
        bool link_up;
        /* Number of VNNs currently assigned to this interface */
        uint32_t references;
};
79
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
81 {
82         if (vnn->iface) {
83                 return vnn->iface->name;
84         }
85
86         return "__none__";
87 }
88
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
90 {
91         struct ctdb_interface *i;
92
93         /* Verify that we don't have an entry for this ip yet */
94         for (i=ctdb->ifaces;i;i=i->next) {
95                 if (strcmp(i->name, iface) == 0) {
96                         return 0;
97                 }
98         }
99
100         /* create a new structure for this interface */
101         i = talloc_zero(ctdb, struct ctdb_interface);
102         CTDB_NO_MEMORY_FATAL(ctdb, i);
103         i->name = talloc_strdup(i, iface);
104         CTDB_NO_MEMORY(ctdb, i->name);
105
106         i->link_up = true;
107
108         DLIST_ADD(ctdb->ifaces, i);
109
110         return 0;
111 }
112
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
114                                         const char *name)
115 {
116         int n;
117
118         for (n = 0; vnn->ifaces[n] != NULL; n++) {
119                 if (strcmp(name, vnn->ifaces[n]) == 0) {
120                         return true;
121                 }
122         }
123
124         return false;
125 }
126
127 /* If any interfaces now have no possible IPs then delete them.  This
128  * implementation is naive (i.e. simple) rather than clever
129  * (i.e. complex).  Given that this is run on delip and that operation
130  * is rare, this doesn't need to be efficient - it needs to be
131  * foolproof.  One alternative is reference counting, where the logic
132  * is distributed and can, therefore, be broken in multiple places.
133  * Another alternative is to build a red-black tree of interfaces that
134  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135  * once) and then walking ctdb->ifaces once and deleting those not in
136  * the tree.  Let's go to one of those if the naive implementation
137  * causes problems...  :-)
138  */
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140                                         struct ctdb_vnn *vnn)
141 {
142         struct ctdb_interface *i, *next;
143
144         /* For each interface, check if there's an IP using it. */
145         for (i = ctdb->ifaces; i != NULL; i = next) {
146                 struct ctdb_vnn *tv;
147                 bool found;
148                 next = i->next;
149
150                 /* Only consider interfaces named in the given VNN. */
151                 if (!vnn_has_interface_with_name(vnn, i->name)) {
152                         continue;
153                 }
154
155                 /* Is the "single IP" on this interface? */
156                 if ((ctdb->single_ip_vnn != NULL) &&
157                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159                         /* Found, next interface please... */
160                         continue;
161                 }
162                 /* Search for a vnn with this interface. */
163                 found = false;
164                 for (tv=ctdb->vnn; tv; tv=tv->next) {
165                         if (vnn_has_interface_with_name(tv, i->name)) {
166                                 found = true;
167                                 break;
168                         }
169                 }
170
171                 if (!found) {
172                         /* None of the VNNs are using this interface. */
173                         DLIST_REMOVE(ctdb->ifaces, i);
174                         talloc_free(i);
175                 }
176         }
177 }
178
179
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
181                                               const char *iface)
182 {
183         struct ctdb_interface *i;
184
185         for (i=ctdb->ifaces;i;i=i->next) {
186                 if (strcmp(i->name, iface) == 0) {
187                         return i;
188                 }
189         }
190
191         return NULL;
192 }
193
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195                                                   struct ctdb_vnn *vnn)
196 {
197         int i;
198         struct ctdb_interface *cur = NULL;
199         struct ctdb_interface *best = NULL;
200
201         for (i=0; vnn->ifaces[i]; i++) {
202
203                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
204                 if (cur == NULL) {
205                         continue;
206                 }
207
208                 if (!cur->link_up) {
209                         continue;
210                 }
211
212                 if (best == NULL) {
213                         best = cur;
214                         continue;
215                 }
216
217                 if (cur->references < best->references) {
218                         best = cur;
219                         continue;
220                 }
221         }
222
223         return best;
224 }
225
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227                                      struct ctdb_vnn *vnn)
228 {
229         struct ctdb_interface *best = NULL;
230
231         if (vnn->iface) {
232                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                                    "still assigned to iface '%s'\n",
234                                    ctdb_addr_to_str(&vnn->public_address),
235                                    ctdb_vnn_iface_string(vnn)));
236                 return 0;
237         }
238
239         best = ctdb_vnn_best_iface(ctdb, vnn);
240         if (best == NULL) {
241                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242                                   "cannot assign to iface any iface\n",
243                                   ctdb_addr_to_str(&vnn->public_address)));
244                 return -1;
245         }
246
247         vnn->iface = best;
248         best->references++;
249         vnn->pnn = ctdb->pnn;
250
251         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252                            "now assigned to iface '%s' refs[%d]\n",
253                            ctdb_addr_to_str(&vnn->public_address),
254                            ctdb_vnn_iface_string(vnn),
255                            best->references));
256         return 0;
257 }
258
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260                                     struct ctdb_vnn *vnn)
261 {
262         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263                            "now unassigned (old iface '%s' refs[%d])\n",
264                            ctdb_addr_to_str(&vnn->public_address),
265                            ctdb_vnn_iface_string(vnn),
266                            vnn->iface?vnn->iface->references:0));
267         if (vnn->iface) {
268                 vnn->iface->references--;
269         }
270         vnn->iface = NULL;
271         if (vnn->pnn == ctdb->pnn) {
272                 vnn->pnn = -1;
273         }
274 }
275
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277                                struct ctdb_vnn *vnn)
278 {
279         int i;
280
281         /* Nodes that are not RUNNING can not host IPs */
282         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
283                 return false;
284         }
285
286         if (vnn->delete_pending) {
287                 return false;
288         }
289
290         if (vnn->iface && vnn->iface->link_up) {
291                 return true;
292         }
293
294         for (i=0; vnn->ifaces[i]; i++) {
295                 struct ctdb_interface *cur;
296
297                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
298                 if (cur == NULL) {
299                         continue;
300                 }
301
302                 if (cur->link_up) {
303                         return true;
304                 }
305         }
306
307         return false;
308 }
309
/* State for one round of gratuitous ARP / tickle ACK announcements,
 * passed as private data to ctdb_control_send_arp(). */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        /* Number of announcements sent so far */
        uint32_t count;
        /* Address being announced */
        ctdb_sock_addr addr;
        /* Connections to send tickle ACKs for; may be NULL */
        struct ctdb_tcp_array *tcparray;
        struct ctdb_vnn *vnn;
};
317
318
/*
  lists of tcp endpoints: one doubly linked list element per
  connection
 */
struct ctdb_tcp_list {
        struct ctdb_tcp_list *prev, *next;
        struct ctdb_connection connection;
};
326
/*
  list of clients to kill on IP release; elements are kept on
  ctdb->client_ip_list
 */
struct ctdb_client_ip {
        struct ctdb_client_ip *prev, *next;
        struct ctdb_context *ctdb;
        /* Address the client is registered with */
        ctdb_sock_addr addr;
        /* Id used to look the client up via reqid_find() */
        uint32_t client_id;
};
336
337
338 /*
339   send a gratuitous arp
340  */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342                                   struct tevent_timer *te,
343                                   struct timeval t, void *private_data)
344 {
345         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
346                                                         struct ctdb_takeover_arp);
347         int i, ret;
348         struct ctdb_tcp_array *tcparray;
349         const char *iface = ctdb_vnn_iface_string(arp->vnn);
350
351         ret = ctdb_sys_send_arp(&arp->addr, iface);
352         if (ret != 0) {
353                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354                                   iface, strerror(errno)));
355         }
356
357         tcparray = arp->tcparray;
358         if (tcparray) {
359                 for (i=0;i<tcparray->num;i++) {
360                         struct ctdb_connection *tcon;
361
362                         tcon = &tcparray->connections[i];
363                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
365                                 ctdb_addr_to_str(&tcon->src),
366                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
367                         ret = ctdb_sys_send_tcp(
368                                 &tcon->src,
369                                 &tcon->dst,
370                                 0, 0, 0);
371                         if (ret != 0) {
372                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373                                         ctdb_addr_to_str(&tcon->src)));
374                         }
375                 }
376         }
377
378         arp->count++;
379
380         if (arp->count == CTDB_ARP_REPEAT) {
381                 talloc_free(arp);
382                 return;
383         }
384
385         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387                          ctdb_control_send_arp, arp);
388 }
389
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391                                        struct ctdb_vnn *vnn)
392 {
393         struct ctdb_takeover_arp *arp;
394         struct ctdb_tcp_array *tcparray;
395
396         if (!vnn->takeover_ctx) {
397                 vnn->takeover_ctx = talloc_new(vnn);
398                 if (!vnn->takeover_ctx) {
399                         return -1;
400                 }
401         }
402
403         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
404         if (!arp) {
405                 return -1;
406         }
407
408         arp->ctdb = ctdb;
409         arp->addr = vnn->public_address;
410         arp->vnn  = vnn;
411
412         tcparray = vnn->tcp_array;
413         if (tcparray) {
414                 /* add all of the known tcp connections for this IP to the
415                    list of tcp connections to send tickle acks for */
416                 arp->tcparray = talloc_steal(arp, tcparray);
417
418                 vnn->tcp_array = NULL;
419                 vnn->tcp_update_needed = true;
420         }
421
422         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423                          timeval_zero(), ctdb_control_send_arp, arp);
424
425         return 0;
426 }
427
/* Private data for release_ip_callback(). */
struct takeover_callback_state {
        struct ctdb_req_control_old *c;
        ctdb_sock_addr *addr;
        struct ctdb_vnn *vnn;
};
433
/* Private data for ctdb_do_takeip_callback(): the control to reply to
 * and the VNN being taken over. */
struct ctdb_do_takeip_state {
        struct ctdb_req_control_old *c;
        struct ctdb_vnn *vnn;
};
438
439 /*
440   called when takeip event finishes
441  */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
443                                     void *private_data)
444 {
445         struct ctdb_do_takeip_state *state =
446                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
447         int32_t ret;
448         TDB_DATA data;
449
450         if (status != 0) {
451                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
452         
453                 if (status == -ETIME) {
454                         ctdb_ban_self(ctdb);
455                 }
456                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457                                  ctdb_addr_to_str(&state->vnn->public_address),
458                                  ctdb_vnn_iface_string(state->vnn)));
459                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
460
461                 node->flags |= NODE_FLAGS_UNHEALTHY;
462                 talloc_free(state);
463                 return;
464         }
465
466         if (ctdb->do_checkpublicip) {
467
468         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469         if (ret != 0) {
470                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471                 talloc_free(state);
472                 return;
473         }
474
475         }
476
477         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478         data.dsize = strlen((char *)data.dptr) + 1;
479         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
480
481         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
482
483
484         /* the control succeeded */
485         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
486         talloc_free(state);
487         return;
488 }
489
/*
  talloc destructor for ctdb_do_takeip_state: clears the VNN's
  update_in_flight flag so further takeover/update requests for this
  address are accepted again.  Returns 0.
 */
static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
{
        state->vnn->update_in_flight = false;
        return 0;
}
495
496 /*
497   take over an ip address
498  */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500                               struct ctdb_req_control_old *c,
501                               struct ctdb_vnn *vnn)
502 {
503         int ret;
504         struct ctdb_do_takeip_state *state;
505
506         if (vnn->update_in_flight) {
507                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508                                     "update for this IP already in flight\n",
509                                     ctdb_addr_to_str(&vnn->public_address),
510                                     vnn->public_netmask_bits));
511                 return -1;
512         }
513
514         ret = ctdb_vnn_assign_iface(ctdb, vnn);
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517                                  "assign a usable interface\n",
518                                  ctdb_addr_to_str(&vnn->public_address),
519                                  vnn->public_netmask_bits));
520                 return -1;
521         }
522
523         state = talloc(vnn, struct ctdb_do_takeip_state);
524         CTDB_NO_MEMORY(ctdb, state);
525
526         state->c = talloc_steal(ctdb, c);
527         state->vnn   = vnn;
528
529         vnn->update_in_flight = true;
530         talloc_set_destructor(state, ctdb_takeip_destructor);
531
532         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533                             ctdb_addr_to_str(&vnn->public_address),
534                             vnn->public_netmask_bits,
535                             ctdb_vnn_iface_string(vnn)));
536
537         ret = ctdb_event_script_callback(ctdb,
538                                          state,
539                                          ctdb_do_takeip_callback,
540                                          state,
541                                          CTDB_EVENT_TAKE_IP,
542                                          "%s %s %u",
543                                          ctdb_vnn_iface_string(vnn),
544                                          ctdb_addr_to_str(&vnn->public_address),
545                                          vnn->public_netmask_bits);
546
547         if (ret != 0) {
548                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549                         ctdb_addr_to_str(&vnn->public_address),
550                         ctdb_vnn_iface_string(vnn)));
551                 talloc_free(state);
552                 return -1;
553         }
554
555         return 0;
556 }
557
/* Private data for ctdb_do_updateip_callback(). */
struct ctdb_do_updateip_state {
        struct ctdb_req_control_old *c;
        /* Interface the address was on before the move; restored if
         * the update event fails */
        struct ctdb_interface *old;
        struct ctdb_vnn *vnn;
};
563
564 /*
565   called when updateip event finishes
566  */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
568                                       void *private_data)
569 {
570         struct ctdb_do_updateip_state *state =
571                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
572         int32_t ret;
573
574         if (status != 0) {
575                 if (status == -ETIME) {
576                         ctdb_ban_self(ctdb);
577                 }
578                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579                         ctdb_addr_to_str(&state->vnn->public_address),
580                         state->old->name,
581                         ctdb_vnn_iface_string(state->vnn)));
582
583                 /*
584                  * All we can do is reset the old interface
585                  * and let the next run fix it
586                  */
587                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588                 state->vnn->iface = state->old;
589                 state->vnn->iface->references++;
590
591                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
592                 talloc_free(state);
593                 return;
594         }
595
596         if (ctdb->do_checkpublicip) {
597
598         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
599         if (ret != 0) {
600                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
601                 talloc_free(state);
602                 return;
603         }
604
605         }
606
607         /* the control succeeded */
608         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
609         talloc_free(state);
610         return;
611 }
612
/*
  talloc destructor for ctdb_do_updateip_state: clears the VNN's
  update_in_flight flag so further takeover/update requests for this
  address are accepted again.  Returns 0.
 */
static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
{
        state->vnn->update_in_flight = false;
        return 0;
}
618
619 /*
620   update (move) an ip address
621  */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623                                 struct ctdb_req_control_old *c,
624                                 struct ctdb_vnn *vnn)
625 {
626         int ret;
627         struct ctdb_do_updateip_state *state;
628         struct ctdb_interface *old = vnn->iface;
629         const char *new_name;
630
631         if (vnn->update_in_flight) {
632                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633                                     "update for this IP already in flight\n",
634                                     ctdb_addr_to_str(&vnn->public_address),
635                                     vnn->public_netmask_bits));
636                 return -1;
637         }
638
639         ctdb_vnn_unassign_iface(ctdb, vnn);
640         ret = ctdb_vnn_assign_iface(ctdb, vnn);
641         if (ret != 0) {
642                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643                                  "assin a usable interface (old iface '%s')\n",
644                                  ctdb_addr_to_str(&vnn->public_address),
645                                  vnn->public_netmask_bits,
646                                  old->name));
647                 return -1;
648         }
649
650         new_name = ctdb_vnn_iface_string(vnn);
651         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652                 /* A benign update from one interface onto itself.
653                  * no need to run the eventscripts in this case, just return
654                  * success.
655                  */
656                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
657                 return 0;
658         }
659
660         state = talloc(vnn, struct ctdb_do_updateip_state);
661         CTDB_NO_MEMORY(ctdb, state);
662
663         state->c = talloc_steal(ctdb, c);
664         state->old = old;
665         state->vnn = vnn;
666
667         vnn->update_in_flight = true;
668         talloc_set_destructor(state, ctdb_updateip_destructor);
669
670         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671                             "interface %s to %s\n",
672                             ctdb_addr_to_str(&vnn->public_address),
673                             vnn->public_netmask_bits,
674                             old->name,
675                             new_name));
676
677         ret = ctdb_event_script_callback(ctdb,
678                                          state,
679                                          ctdb_do_updateip_callback,
680                                          state,
681                                          CTDB_EVENT_UPDATE_IP,
682                                          "%s %s %s %u",
683                                          state->old->name,
684                                          new_name,
685                                          ctdb_addr_to_str(&vnn->public_address),
686                                          vnn->public_netmask_bits);
687         if (ret != 0) {
688                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689                                  ctdb_addr_to_str(&vnn->public_address),
690                                  old->name, new_name));
691                 talloc_free(state);
692                 return -1;
693         }
694
695         return 0;
696 }
697
698 /*
699   Find the vnn of the node that has a public ip address
700   returns -1 if the address is not known as a public address
701  */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
703 {
704         struct ctdb_vnn *vnn;
705
706         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707                 if (ctdb_same_ip(&vnn->public_address, addr)) {
708                         return vnn;
709                 }
710         }
711
712         return NULL;
713 }
714
715 /*
716   take over an ip address
717  */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719                                  struct ctdb_req_control_old *c,
720                                  TDB_DATA indata,
721                                  bool *async_reply)
722 {
723         int ret;
724         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725         struct ctdb_vnn *vnn;
726         bool have_ip = false;
727         bool do_updateip = false;
728         bool do_takeip = false;
729         struct ctdb_interface *best_iface = NULL;
730
731         if (pip->pnn != ctdb->pnn) {
732                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733                                  "with pnn %d, but we're node %d\n",
734                                  ctdb_addr_to_str(&pip->addr),
735                                  pip->pnn, ctdb->pnn));
736                 return -1;
737         }
738
739         /* update out vnn list */
740         vnn = find_public_ip_vnn(ctdb, &pip->addr);
741         if (vnn == NULL) {
742                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743                         ctdb_addr_to_str(&pip->addr)));
744                 return 0;
745         }
746
747         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748                 have_ip = ctdb_sys_have_ip(&pip->addr);
749         }
750         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751         if (best_iface == NULL) {
752                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753                                  "a usable interface (old %s, have_ip %d)\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  vnn->public_netmask_bits,
756                                  ctdb_vnn_iface_string(vnn),
757                                  have_ip));
758                 return -1;
759         }
760
761         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
763                 have_ip = false;
764         }
765
766
767         if (vnn->iface == NULL && have_ip) {
768                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770                                  ctdb_addr_to_str(&vnn->public_address)));
771                 return 0;
772         }
773
774         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776                                   "and we have it on iface[%s], but it was assigned to node %d"
777                                   "and we are node %d, banning ourself\n",
778                                  ctdb_addr_to_str(&vnn->public_address),
779                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
780                 ctdb_ban_self(ctdb);
781                 return -1;
782         }
783
784         if (vnn->pnn == -1 && have_ip) {
785                 vnn->pnn = ctdb->pnn;
786                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787                                   "and we already have it on iface[%s], update local daemon\n",
788                                  ctdb_addr_to_str(&vnn->public_address),
789                                   ctdb_vnn_iface_string(vnn)));
790                 return 0;
791         }
792
793         if (vnn->iface) {
794                 if (vnn->iface != best_iface) {
795                         if (!vnn->iface->link_up) {
796                                 do_updateip = true;
797                         } else if (vnn->iface->references > (best_iface->references + 1)) {
798                                 /* only move when the rebalance gains something */
799                                         do_updateip = true;
800                         }
801                 }
802         }
803
804         if (!have_ip) {
805                 if (do_updateip) {
806                         ctdb_vnn_unassign_iface(ctdb, vnn);
807                         do_updateip = false;
808                 }
809                 do_takeip = true;
810         }
811
812         if (do_takeip) {
813                 ret = ctdb_do_takeip(ctdb, c, vnn);
814                 if (ret != 0) {
815                         return -1;
816                 }
817         } else if (do_updateip) {
818                 ret = ctdb_do_updateip(ctdb, c, vnn);
819                 if (ret != 0) {
820                         return -1;
821                 }
822         } else {
823                 /*
824                  * The interface is up and the kernel known the ip
825                  * => do nothing
826                  */
827                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828                         ctdb_addr_to_str(&pip->addr),
829                         vnn->public_netmask_bits,
830                         ctdb_vnn_iface_string(vnn)));
831                 return 0;
832         }
833
834         /* tell ctdb_control.c that we will be replying asynchronously */
835         *async_reply = true;
836
837         return 0;
838 }
839
840 /*
841   kill any clients that are registered with a IP that is being released
842  */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
844 {
845         struct ctdb_client_ip *ip;
846
847         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848                 ctdb_addr_to_str(addr)));
849
850         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851                 ctdb_sock_addr tmp_addr;
852
853                 tmp_addr = ip->addr;
854                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
855                         ip->client_id,
856                         ctdb_addr_to_str(&ip->addr)));
857
858                 if (ctdb_same_ip(&tmp_addr, addr)) {
859                         struct ctdb_client *client = reqid_find(ctdb->idr,
860                                                                 ip->client_id,
861                                                                 struct ctdb_client);
862                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
863                                 ip->client_id,
864                                 ctdb_addr_to_str(&ip->addr),
865                                 client->pid));
866
867                         if (client->pid != 0) {
868                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869                                         (unsigned)client->pid,
870                                         ctdb_addr_to_str(addr),
871                                         ip->client_id));
872                                 kill(client->pid, SIGKILL);
873                         }
874                 }
875         }
876 }
877
/*
  Remove a public address (vnn) from this node and free it.

  The vnn is unlinked from ctdb->vnn first, then its interface
  assignment is dropped and ctdb_remove_orphaned_ifaces() is given the
  chance to clean up after it, before the vnn itself is freed.  The
  caller must not use vnn after this returns.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        talloc_free(vnn);
}
885
886 /*
887   called when releaseip event finishes
888  */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
890                                 void *private_data)
891 {
892         struct takeover_callback_state *state = 
893                 talloc_get_type(private_data, struct takeover_callback_state);
894         TDB_DATA data;
895
896         if (status == -ETIME) {
897                 ctdb_ban_self(ctdb);
898         }
899
900         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901                 if  (ctdb_sys_have_ip(state->addr)) {
902                         DEBUG(DEBUG_ERR,
903                               ("IP %s still hosted during release IP callback, failing\n",
904                                ctdb_addr_to_str(state->addr)));
905                         ctdb_request_control_reply(ctdb, state->c,
906                                                    NULL, -1, NULL);
907                         talloc_free(state);
908                         return;
909                 }
910         }
911
912         /* send a message to all clients of this node telling them
913            that the cluster has been reconfigured and they should
914            release any sockets on this IP */
915         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917         data.dsize = strlen((char *)data.dptr)+1;
918
919         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
920
921         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
922
923         /* kill clients that have registered with this IP */
924         release_kill_clients(ctdb, state->addr);
925
926         ctdb_vnn_unassign_iface(ctdb, state->vnn);
927
928         /* Process the IP if it has been marked for deletion */
929         if (state->vnn->delete_pending) {
930                 do_delete_ip(ctdb, state->vnn);
931                 state->vnn = NULL;
932         }
933
934         /* the control succeeded */
935         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
936         talloc_free(state);
937 }
938
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
940 {
941         if (state->vnn != NULL) {
942                 state->vnn->update_in_flight = false;
943         }
944         return 0;
945 }
946
947 /*
948   release an ip address
949  */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
951                                 struct ctdb_req_control_old *c,
952                                 TDB_DATA indata, 
953                                 bool *async_reply)
954 {
955         int ret;
956         struct takeover_callback_state *state;
957         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958         struct ctdb_vnn *vnn;
959         char *iface;
960
961         /* update our vnn list */
962         vnn = find_public_ip_vnn(ctdb, &pip->addr);
963         if (vnn == NULL) {
964                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965                         ctdb_addr_to_str(&pip->addr)));
966                 return 0;
967         }
968         vnn->pnn = pip->pnn;
969
970         /* stop any previous arps */
971         talloc_free(vnn->takeover_ctx);
972         vnn->takeover_ctx = NULL;
973
974         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975          * lazy multicast to drop an IP from any node that isn't the
976          * intended new node.  The following causes makes ctdbd ignore
977          * a release for any address it doesn't host.
978          */
979         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980                 if (!ctdb_sys_have_ip(&pip->addr)) {
981                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982                                 ctdb_addr_to_str(&pip->addr),
983                                 vnn->public_netmask_bits,
984                                 ctdb_vnn_iface_string(vnn)));
985                         ctdb_vnn_unassign_iface(ctdb, vnn);
986                         return 0;
987                 }
988         } else {
989                 if (vnn->iface == NULL) {
990                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991                                            ctdb_addr_to_str(&pip->addr),
992                                            vnn->public_netmask_bits));
993                         return 0;
994                 }
995         }
996
997         /* There is a potential race between take_ip and us because we
998          * update the VNN via a callback that run when the
999          * eventscripts have been run.  Avoid the race by allowing one
1000          * update to be in flight at a time.
1001          */
1002         if (vnn->update_in_flight) {
1003                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004                                     "update for this IP already in flight\n",
1005                                     ctdb_addr_to_str(&vnn->public_address),
1006                                     vnn->public_netmask_bits));
1007                 return -1;
1008         }
1009
1010         iface = strdup(ctdb_vnn_iface_string(vnn));
1011
1012         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1013                 ctdb_addr_to_str(&pip->addr),
1014                 vnn->public_netmask_bits,
1015                 iface,
1016                 pip->pnn));
1017
1018         state = talloc(ctdb, struct takeover_callback_state);
1019         if (state == NULL) {
1020                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021                                __FILE__, __LINE__);
1022                 free(iface);
1023                 return -1;
1024         }
1025
1026         state->c = talloc_steal(state, c);
1027         state->addr = talloc(state, ctdb_sock_addr);       
1028         if (state->addr == NULL) {
1029                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030                                __FILE__, __LINE__);
1031                 free(iface);
1032                 talloc_free(state);
1033                 return -1;
1034         }
1035         *state->addr = pip->addr;
1036         state->vnn   = vnn;
1037
1038         vnn->update_in_flight = true;
1039         talloc_set_destructor(state, ctdb_releaseip_destructor);
1040
1041         ret = ctdb_event_script_callback(ctdb, 
1042                                          state, release_ip_callback, state,
1043                                          CTDB_EVENT_RELEASE_IP,
1044                                          "%s %s %u",
1045                                          iface,
1046                                          ctdb_addr_to_str(&pip->addr),
1047                                          vnn->public_netmask_bits);
1048         free(iface);
1049         if (ret != 0) {
1050                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051                         ctdb_addr_to_str(&pip->addr),
1052                         ctdb_vnn_iface_string(vnn)));
1053                 talloc_free(state);
1054                 return -1;
1055         }
1056
1057         /* tell the control that we will be reply asynchronously */
1058         *async_reply = true;
1059         return 0;
1060 }
1061
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063                                    ctdb_sock_addr *addr,
1064                                    unsigned mask, const char *ifaces,
1065                                    bool check_address)
1066 {
1067         struct ctdb_vnn      *vnn;
1068         uint32_t num = 0;
1069         char *tmp;
1070         const char *iface;
1071         int i;
1072         int ret;
1073
1074         tmp = strdup(ifaces);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 if (!ctdb_sys_check_iface_exists(iface)) {
1077                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1078                         free(tmp);
1079                         return -1;
1080                 }
1081         }
1082         free(tmp);
1083
1084         /* Verify that we don't have an entry for this ip yet */
1085         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1088                                 ctdb_addr_to_str(addr)));
1089                         return -1;
1090                 }               
1091         }
1092
1093         /* create a new vnn structure for this ip address */
1094         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097         tmp = talloc_strdup(vnn, ifaces);
1098         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1104                 num++;
1105         }
1106         talloc_free(tmp);
1107         vnn->ifaces[num] = NULL;
1108         vnn->public_address      = *addr;
1109         vnn->public_netmask_bits = mask;
1110         vnn->pnn                 = -1;
1111         if (check_address) {
1112                 if (ctdb_sys_have_ip(addr)) {
1113                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114                         vnn->pnn = ctdb->pnn;
1115                 }
1116         }
1117
1118         for (i=0; vnn->ifaces[i]; i++) {
1119                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1120                 if (ret != 0) {
1121                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122                                            "for public_address[%s]\n",
1123                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1124                         talloc_free(vnn);
1125                         return -1;
1126                 }
1127         }
1128
1129         DLIST_ADD(ctdb->vnn, vnn);
1130
1131         return 0;
1132 }
1133
1134 /*
1135   setup the public address lists from a file
1136 */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1138 {
1139         char **lines;
1140         int nlines;
1141         int i;
1142
1143         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144         if (lines == NULL) {
1145                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1146                 return -1;
1147         }
1148         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1149                 nlines--;
1150         }
1151
1152         for (i=0;i<nlines;i++) {
1153                 unsigned mask;
1154                 ctdb_sock_addr addr;
1155                 const char *addrstr;
1156                 const char *ifaces;
1157                 char *tok, *line;
1158
1159                 line = lines[i];
1160                 while ((*line == ' ') || (*line == '\t')) {
1161                         line++;
1162                 }
1163                 if (*line == '#') {
1164                         continue;
1165                 }
1166                 if (strcmp(line, "") == 0) {
1167                         continue;
1168                 }
1169                 tok = strtok(line, " \t");
1170                 addrstr = tok;
1171                 tok = strtok(NULL, " \t");
1172                 if (tok == NULL) {
1173                         if (NULL == ctdb->default_public_interface) {
1174                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1175                                          i+1));
1176                                 talloc_free(lines);
1177                                 return -1;
1178                         }
1179                         ifaces = ctdb->default_public_interface;
1180                 } else {
1181                         ifaces = tok;
1182                 }
1183
1184                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1186                         talloc_free(lines);
1187                         return -1;
1188                 }
1189                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1191                         talloc_free(lines);
1192                         return -1;
1193                 }
1194         }
1195
1196
1197         talloc_free(lines);
1198         return 0;
1199 }
1200
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1202                               const char *iface,
1203                               const char *ip)
1204 {
1205         struct ctdb_vnn *svnn;
1206         struct ctdb_interface *cur = NULL;
1207         bool ok;
1208         int ret;
1209
1210         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211         CTDB_NO_MEMORY(ctdb, svnn);
1212
1213         svnn->ifaces = talloc_array(svnn, const char *, 2);
1214         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217         svnn->ifaces[1] = NULL;
1218
1219         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1220         if (!ok) {
1221                 talloc_free(svnn);
1222                 return -1;
1223         }
1224
1225         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1226         if (ret != 0) {
1227                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228                                    "for single_ip[%s]\n",
1229                                    svnn->ifaces[0],
1230                                    ctdb_addr_to_str(&svnn->public_address)));
1231                 talloc_free(svnn);
1232                 return -1;
1233         }
1234
1235         /* assume the single public ip interface is initially "good" */
1236         cur = ctdb_find_iface(ctdb, iface);
1237         if (cur == NULL) {
1238                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1239                 return -1;
1240         }
1241         cur->link_up = true;
1242
1243         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1244         if (ret != 0) {
1245                 talloc_free(svnn);
1246                 return -1;
1247         }
1248
1249         ctdb->single_ip_vnn = svnn;
1250         return 0;
1251 }
1252
/* One public IP address in the merged, singly linked list that the
 * IP allocation code works on. */
struct public_ip_list {
        /* next entry in the list, NULL at the end */
        struct public_ip_list *next;
        /* node the address is assigned to; code in this file stores
         * and compares -1 here to mean "unassigned" */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1258
1259 /* Given a physical node, return the number of
1260    public addresses that is currently assigned to this node.
1261 */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1263 {
1264         int num=0;
1265
1266         for (;ips;ips=ips->next) {
1267                 if (ips->pnn == pnn) {
1268                         num++;
1269                 }
1270         }
1271         return num;
1272 }
1273
1274
1275 /* Can the given node host the given IP: is the public IP known to the
1276  * node and is NOIPHOST unset?
1277 */
1278 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1279                              int32_t pnn,
1280                              struct ctdb_ipflags ipflags,
1281                              struct public_ip_list *ip)
1282 {
1283         struct ctdb_public_ip_list_old *public_ips;
1284         int i;
1285
1286         if (ipflags.noiphost) {
1287                 return false;
1288         }
1289
1290         public_ips = ipalloc_state->available_public_ips[pnn];
1291
1292         if (public_ips == NULL) {
1293                 return false;
1294         }
1295
1296         for (i=0; i<public_ips->num; i++) {
1297                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1298                         /* yes, this node can serve this public ip */
1299                         return true;
1300                 }
1301         }
1302
1303         return false;
1304 }
1305
1306 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1307                                  int32_t pnn,
1308                                  struct ctdb_ipflags ipflags,
1309                                  struct public_ip_list *ip)
1310 {
1311         if (ipflags.noiptakeover) {
1312                 return false;
1313         }
1314
1315         return can_node_host_ip(ipalloc_state, pnn, ipflags, ip);
1316 }
1317
1318 /* search the node lists list for a node to takeover this ip.
1319    pick the node that currently are serving the least number of ips
1320    so that the ips get spread out evenly.
1321 */
1322 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1323                               struct ctdb_ipflags *ipflags,
1324                               struct public_ip_list *ip,
1325                               struct public_ip_list *all_ips)
1326 {
1327         int pnn, min=0, num;
1328         int i, numnodes;
1329
1330         numnodes = talloc_array_length(ipflags);
1331         pnn    = -1;
1332         for (i=0; i<numnodes; i++) {
1333                 /* verify that this node can serve this ip */
1334                 if (!can_node_takeover_ip(ipalloc_state, i, ipflags[i], ip)) {
1335                         /* no it couldnt   so skip to the next node */
1336                         continue;
1337                 }
1338
1339                 num = node_ip_coverage(i, all_ips);
1340                 /* was this the first node we checked ? */
1341                 if (pnn == -1) {
1342                         pnn = i;
1343                         min  = num;
1344                 } else {
1345                         if (num < min) {
1346                                 pnn = i;
1347                                 min  = num;
1348                         }
1349                 }
1350         }
1351         if (pnn == -1) {
1352                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1353                         ctdb_addr_to_str(&ip->addr)));
1354
1355                 return -1;
1356         }
1357
1358         ip->pnn = pnn;
1359         return 0;
1360 }
1361
1362 #define IP_KEYLEN       4
1363 static uint32_t *ip_key(ctdb_sock_addr *ip)
1364 {
1365         static uint32_t key[IP_KEYLEN];
1366
1367         bzero(key, sizeof(key));
1368
1369         switch (ip->sa.sa_family) {
1370         case AF_INET:
1371                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1372                 break;
1373         case AF_INET6: {
1374                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1375                 key[0]  = htonl(s6_a32[0]);
1376                 key[1]  = htonl(s6_a32[1]);
1377                 key[2]  = htonl(s6_a32[2]);
1378                 key[3]  = htonl(s6_a32[3]);
1379                 break;
1380         }
1381         default:
1382                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1383                 return key;
1384         }
1385
1386         return key;
1387 }
1388
1389 static void *add_ip_callback(void *parm, void *data)
1390 {
1391         struct public_ip_list *this_ip = parm;
1392         struct public_ip_list *prev_ip = data;
1393
1394         if (prev_ip == NULL) {
1395                 return parm;
1396         }
1397         if (this_ip->pnn == -1) {
1398                 this_ip->pnn = prev_ip->pnn;
1399         }
1400
1401         return parm;
1402 }
1403
1404 static int getips_count_callback(void *param, void *data)
1405 {
1406         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1407         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1408
1409         new_ip->next = *ip_list;
1410         *ip_list     = new_ip;
1411         return 0;
1412 }
1413
1414 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1415                                        struct ctdb_public_ip_list_old *ips,
1416                                        uint32_t pnn);
1417
1418 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1419                                          struct ipalloc_state *ipalloc_state,
1420                                          struct ctdb_node_map_old *nodemap)
1421 {
1422         int j;
1423         int ret;
1424
1425         if (ipalloc_state->num != nodemap->num) {
1426                 DEBUG(DEBUG_ERR,
1427                       (__location__
1428                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1429                        ipalloc_state->num, nodemap->num));
1430                 return -1;
1431         }
1432
1433         for (j=0; j<nodemap->num; j++) {
1434                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1435                         continue;
1436                 }
1437
1438                 /* Retrieve the list of known public IPs from the node */
1439                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1440                                         TAKEOVER_TIMEOUT(),
1441                                         j,
1442                                         ctdb->nodes,
1443                                         0,
1444                                         &ipalloc_state->known_public_ips[j]);
1445                 if (ret != 0) {
1446                         DEBUG(DEBUG_ERR,
1447                               ("Failed to read known public IPs from node: %u\n",
1448                                j));
1449                         return -1;
1450                 }
1451
1452                 if (ctdb->do_checkpublicip) {
1453                         verify_remote_ip_allocation(ctdb,
1454                                                     ipalloc_state->known_public_ips[j],
1455                                                     j);
1456                 }
1457
1458                 /* Retrieve the list of available public IPs from the node */
1459                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1460                                         TAKEOVER_TIMEOUT(),
1461                                         j,
1462                                         ctdb->nodes,
1463                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1464                                         &ipalloc_state->available_public_ips[j]);
1465                 if (ret != 0) {
1466                         DEBUG(DEBUG_ERR,
1467                               ("Failed to read available public IPs from node: %u\n",
1468                                j));
1469                         return -1;
1470                 }
1471         }
1472
1473         return 0;
1474 }
1475
1476 static struct public_ip_list *
1477 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1478 {
1479         int i, j;
1480         struct public_ip_list *ip_list;
1481         struct ctdb_public_ip_list_old *public_ips;
1482
1483         TALLOC_FREE(ctdb->ip_tree);
1484         ctdb->ip_tree = trbt_create(ctdb, 0);
1485
1486         for (i=0; i < ctdb->num_nodes; i++) {
1487                 public_ips = ipalloc_state->known_public_ips[i];
1488
1489                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1490                         continue;
1491                 }
1492
1493                 /* there were no public ips for this node */
1494                 if (public_ips == NULL) {
1495                         continue;
1496                 }
1497
1498                 for (j=0; j < public_ips->num; j++) {
1499                         struct public_ip_list *tmp_ip;
1500
1501                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1502                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503                         /* Do not use information about IP addresses hosted
1504                          * on other nodes, it may not be accurate */
1505                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1507                         } else {
1508                                 tmp_ip->pnn = -1;
1509                         }
1510                         tmp_ip->addr = public_ips->ips[j].addr;
1511                         tmp_ip->next = NULL;
1512
1513                         trbt_insertarray32_callback(ctdb->ip_tree,
1514                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1515                                 add_ip_callback,
1516                                 tmp_ip);
1517                 }
1518         }
1519
1520         ip_list = NULL;
1521         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1522
1523         return ip_list;
1524 }
1525
1526 /* 
1527  * This is the length of the longtest common prefix between the IPs.
1528  * It is calculated by XOR-ing the 2 IPs together and counting the
1529  * number of leading zeroes.  The implementation means that all
1530  * addresses end up being 128 bits long.
1531  *
1532  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534  * lots of nodes and IP addresses?
1535  */
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1537 {
1538         uint32_t ip1_k[IP_KEYLEN];
1539         uint32_t *t;
1540         int i;
1541         uint32_t x;
1542
1543         uint32_t distance = 0;
1544
1545         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1546         t = ip_key(ip2);
1547         for (i=0; i<IP_KEYLEN; i++) {
1548                 x = ip1_k[i] ^ t[i];
1549                 if (x == 0) {
1550                         distance += 32;
1551                 } else {
1552                         /* Count number of leading zeroes. 
1553                          * FIXME? This could be optimised...
1554                          */
1555                         while ((x & (1 << 31)) == 0) {
1556                                 x <<= 1;
1557                                 distance += 1;
1558                         }
1559                 }
1560         }
1561
1562         return distance;
1563 }
1564
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566    given node.  The ips argument is generally the all_ips variable
1567    used in the main part of the algorithm.
1568  */
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570                                   struct public_ip_list *ips,
1571                                   int pnn)
1572 {
1573         struct public_ip_list *t;
1574         uint32_t d;
1575
1576         uint32_t sum = 0;
1577
1578         for (t=ips; t != NULL; t=t->next) {
1579                 if (t->pnn != pnn) {
1580                         continue;
1581                 }
1582
1583                 /* Optimisation: We never calculate the distance
1584                  * between an address and itself.  This allows us to
1585                  * calculate the effect of removing an address from a
1586                  * node by simply calculating the distance between
1587                  * that address and all of the exitsing addresses.
1588                  * Moreover, we assume that we're only ever dealing
1589                  * with addresses from all_ips so we can identify an
1590                  * address via a pointer rather than doing a more
1591                  * expensive address comparison. */
1592                 if (&(t->addr) == ip) {
1593                         continue;
1594                 }
1595
1596                 d = ip_distance(ip, &(t->addr));
1597                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1598         }
1599
1600         return sum;
1601 }
1602
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
1604    to the given node.
1605  */
1606 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1607 {
1608         struct public_ip_list *t;
1609
1610         uint32_t imbalance = 0;
1611
1612         for (t=all_ips; t!=NULL; t=t->next) {
1613                 if (t->pnn != pnn) {
1614                         continue;
1615                 }
1616                 /* Pass the rest of the IPs rather than the whole
1617                    all_ips input list.
1618                 */
1619                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1620         }
1621
1622         return imbalance;
1623 }
1624
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626  * finding the best node for each.
1627  */
1628 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1629                                       struct ctdb_ipflags *ipflags,
1630                                       struct public_ip_list *all_ips)
1631 {
1632         struct public_ip_list *tmp_ip;
1633
1634         /* loop over all ip's and find a physical node to cover for
1635            each unassigned ip.
1636         */
1637         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638                 if (tmp_ip->pnn == -1) {
1639                         if (find_takeover_node(ipalloc_state, ipflags,
1640                                                tmp_ip, all_ips)) {
1641                                 DEBUG(DEBUG_WARNING,
1642                                       ("Failed to find node to cover ip %s\n",
1643                                        ctdb_addr_to_str(&tmp_ip->addr)));
1644                         }
1645                 }
1646         }
1647 }
1648
1649 /* Basic non-deterministic rebalancing algorithm.
1650  */
1651 static void basic_failback(struct ipalloc_state *ipalloc_state,
1652                            struct ctdb_ipflags *ipflags,
1653                            struct public_ip_list *all_ips,
1654                            int num_ips)
1655 {
1656         int i, numnodes;
1657         int maxnode, maxnum, minnode, minnum, num, retries;
1658         struct public_ip_list *tmp_ip;
1659
1660         numnodes = talloc_array_length(ipflags);
1661         retries = 0;
1662
1663 try_again:
1664         maxnum=0;
1665         minnum=0;
1666
1667         /* for each ip address, loop over all nodes that can serve
1668            this ip and make sure that the difference between the node
1669            serving the most and the node serving the least ip's are
1670            not greater than 1.
1671         */
1672         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1673                 if (tmp_ip->pnn == -1) {
1674                         continue;
1675                 }
1676
1677                 /* Get the highest and lowest number of ips's served by any 
1678                    valid node which can serve this ip.
1679                 */
1680                 maxnode = -1;
1681                 minnode = -1;
1682                 for (i=0; i<numnodes; i++) {
1683                         /* only check nodes that can actually serve this ip */
1684                         if (!can_node_takeover_ip(ipalloc_state, i,
1685                                                   ipflags[i], tmp_ip)) {
1686                                 /* no it couldnt   so skip to the next node */
1687                                 continue;
1688                         }
1689
1690                         num = node_ip_coverage(i, all_ips);
1691                         if (maxnode == -1) {
1692                                 maxnode = i;
1693                                 maxnum  = num;
1694                         } else {
1695                                 if (num > maxnum) {
1696                                         maxnode = i;
1697                                         maxnum  = num;
1698                                 }
1699                         }
1700                         if (minnode == -1) {
1701                                 minnode = i;
1702                                 minnum  = num;
1703                         } else {
1704                                 if (num < minnum) {
1705                                         minnode = i;
1706                                         minnum  = num;
1707                                 }
1708                         }
1709                 }
1710                 if (maxnode == -1) {
1711                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1712                                 ctdb_addr_to_str(&tmp_ip->addr)));
1713
1714                         continue;
1715                 }
1716
1717                 /* if the spread between the smallest and largest coverage by
1718                    a node is >=2 we steal one of the ips from the node with
1719                    most coverage to even things out a bit.
1720                    try to do this a limited number of times since we dont
1721                    want to spend too much time balancing the ip coverage.
1722                 */
1723                 if ( (maxnum > minnum+1)
1724                      && (retries < (num_ips + 5)) ){
1725                         struct public_ip_list *tmp;
1726
1727                         /* Reassign one of maxnode's VNNs */
1728                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1729                                 if (tmp->pnn == maxnode) {
1730                                         (void)find_takeover_node(ipalloc_state,
1731                                                                  ipflags,
1732                                                                  tmp,
1733                                                                  all_ips);
1734                                         retries++;
1735                                         goto try_again;;
1736                                 }
1737                         }
1738                 }
1739         }
1740 }
1741
1742 static void lcp2_init(TALLOC_CTX *tmp_ctx,
1743                       struct ctdb_ipflags *ipflags,
1744                       struct public_ip_list *all_ips,
1745                       uint32_t *force_rebalance_nodes,
1746                       uint32_t **lcp2_imbalances,
1747                       bool **rebalance_candidates)
1748 {
1749         int i, numnodes;
1750         struct public_ip_list *tmp_ip;
1751
1752         numnodes = talloc_array_length(ipflags);
1753
1754         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1755         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1756         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1757         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1758
1759         for (i=0; i<numnodes; i++) {
1760                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1761                 /* First step: assume all nodes are candidates */
1762                 (*rebalance_candidates)[i] = true;
1763         }
1764
1765         /* 2nd step: if a node has IPs assigned then it must have been
1766          * healthy before, so we remove it from consideration.  This
1767          * is overkill but is all we have because we don't maintain
1768          * state between takeover runs.  An alternative would be to
1769          * keep state and invalidate it every time the recovery master
1770          * changes.
1771          */
1772         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1773                 if (tmp_ip->pnn != -1) {
1774                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1775                 }
1776         }
1777
1778         /* 3rd step: if a node is forced to re-balance then
1779            we allow failback onto the node */
1780         if (force_rebalance_nodes == NULL) {
1781                 return;
1782         }
1783         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1784                 uint32_t pnn = force_rebalance_nodes[i];
1785                 if (pnn >= numnodes) {
1786                         DEBUG(DEBUG_ERR,
1787                               (__location__ "unknown node %u\n", pnn));
1788                         continue;
1789                 }
1790
1791                 DEBUG(DEBUG_NOTICE,
1792                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1793                 (*rebalance_candidates)[pnn] = true;
1794         }
1795 }
1796
1797 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1798  * the IP/node combination that will cost the least.
1799  */
1800 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1801                                      struct ctdb_ipflags *ipflags,
1802                                      struct public_ip_list *all_ips,
1803                                      uint32_t *lcp2_imbalances)
1804 {
1805         struct public_ip_list *tmp_ip;
1806         int dstnode, numnodes;
1807
1808         int minnode;
1809         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1810         struct public_ip_list *minip;
1811
1812         bool should_loop = true;
1813         bool have_unassigned = true;
1814
1815         numnodes = talloc_array_length(ipflags);
1816
1817         while (have_unassigned && should_loop) {
1818                 should_loop = false;
1819
1820                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1821                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1822
1823                 minnode = -1;
1824                 mindsum = 0;
1825                 minip = NULL;
1826
1827                 /* loop over each unassigned ip. */
1828                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1829                         if (tmp_ip->pnn != -1) {
1830                                 continue;
1831                         }
1832
1833                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1834                                 /* only check nodes that can actually takeover this ip */
1835                                 if (!can_node_takeover_ip(ipalloc_state,
1836                                                           dstnode,
1837                                                           ipflags[dstnode],
1838                                                           tmp_ip)) {
1839                                         /* no it couldnt   so skip to the next node */
1840                                         continue;
1841                                 }
1842
1843                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1844                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1845                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1846                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1847                                                    dstnode,
1848                                                    dstimbl - lcp2_imbalances[dstnode]));
1849
1850
1851                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1852                                         minnode = dstnode;
1853                                         minimbl = dstimbl;
1854                                         mindsum = dstdsum;
1855                                         minip = tmp_ip;
1856                                         should_loop = true;
1857                                 }
1858                         }
1859                 }
1860
1861                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1862
1863                 /* If we found one then assign it to the given node. */
1864                 if (minnode != -1) {
1865                         minip->pnn = minnode;
1866                         lcp2_imbalances[minnode] = minimbl;
1867                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1868                                           ctdb_addr_to_str(&(minip->addr)),
1869                                           minnode,
1870                                           mindsum));
1871                 }
1872
1873                 /* There might be a better way but at least this is clear. */
1874                 have_unassigned = false;
1875                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876                         if (tmp_ip->pnn == -1) {
1877                                 have_unassigned = true;
1878                         }
1879                 }
1880         }
1881
1882         /* We know if we have an unassigned addresses so we might as
1883          * well optimise.
1884          */
1885         if (have_unassigned) {
1886                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1887                         if (tmp_ip->pnn == -1) {
1888                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1889                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1890                         }
1891                 }
1892         }
1893 }
1894
1895 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1896  * to move IPs from, determines the best IP/destination node
1897  * combination to move from the source node.
1898  */
1899 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1900                                     struct ctdb_ipflags *ipflags,
1901                                     struct public_ip_list *all_ips,
1902                                     int srcnode,
1903                                     uint32_t *lcp2_imbalances,
1904                                     bool *rebalance_candidates)
1905 {
1906         int dstnode, mindstnode, numnodes;
1907         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1908         uint32_t minsrcimbl, mindstimbl;
1909         struct public_ip_list *minip;
1910         struct public_ip_list *tmp_ip;
1911
1912         /* Find an IP and destination node that best reduces imbalance. */
1913         srcimbl = 0;
1914         minip = NULL;
1915         minsrcimbl = 0;
1916         mindstnode = -1;
1917         mindstimbl = 0;
1918
1919         numnodes = talloc_array_length(ipflags);
1920
1921         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1922         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1923                            srcnode, lcp2_imbalances[srcnode]));
1924
1925         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1926                 /* Only consider addresses on srcnode. */
1927                 if (tmp_ip->pnn != srcnode) {
1928                         continue;
1929                 }
1930
1931                 /* What is this IP address costing the source node? */
1932                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1933                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1934
1935                 /* Consider this IP address would cost each potential
1936                  * destination node.  Destination nodes are limited to
1937                  * those that are newly healthy, since we don't want
1938                  * to do gratuitous failover of IPs just to make minor
1939                  * balance improvements.
1940                  */
1941                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1942                         if (!rebalance_candidates[dstnode]) {
1943                                 continue;
1944                         }
1945
1946                         /* only check nodes that can actually takeover this ip */
1947                         if (!can_node_takeover_ip(ipalloc_state, dstnode,
1948                                                   ipflags[dstnode], tmp_ip)) {
1949                                 /* no it couldnt   so skip to the next node */
1950                                 continue;
1951                         }
1952
1953                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1954                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1955                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1956                                            srcnode, -srcdsum,
1957                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1958                                            dstnode, dstdsum));
1959
1960                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1961                             (dstdsum < srcdsum) &&                      \
1962                             ((mindstnode == -1) ||                              \
1963                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1964
1965                                 minip = tmp_ip;
1966                                 minsrcimbl = srcimbl;
1967                                 mindstnode = dstnode;
1968                                 mindstimbl = dstimbl;
1969                         }
1970                 }
1971         }
1972         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1973
1974         if (mindstnode != -1) {
1975                 /* We found a move that makes things better... */
1976                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1977                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1978                                   ctdb_addr_to_str(&(minip->addr)),
1979                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1980
1981
1982                 lcp2_imbalances[srcnode] = minsrcimbl;
1983                 lcp2_imbalances[mindstnode] = mindstimbl;
1984                 minip->pnn = mindstnode;
1985
1986                 return true;
1987         }
1988
1989         return false;
1990         
1991 }
1992
/* A node number paired with that node's LCP2 imbalance, so that nodes
 * can be sorted by imbalance without losing track of which is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison function for struct lcp2_imbalance_pnn.
 *
 * Orders by DECREASING imbalance: returns -1 when a is more
 * imbalanced than b, 1 when it is less, 0 when they are equal.  The
 * pnn field plays no part in the ordering.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2011
2012 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2013  * node with the highest LCP2 imbalance, and then determines the best
2014  * IP/destination node combination to move from the source node.
2015  */
2016 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2017                           struct ctdb_ipflags *ipflags,
2018                           struct public_ip_list *all_ips,
2019                           uint32_t *lcp2_imbalances,
2020                           bool *rebalance_candidates)
2021 {
2022         int i, numnodes;
2023         struct lcp2_imbalance_pnn * lips;
2024         bool again;
2025
2026         numnodes = talloc_array_length(ipflags);
2027
2028 try_again:
2029         /* Put the imbalances and nodes into an array, sort them and
2030          * iterate through candidates.  Usually the 1st one will be
2031          * used, so this doesn't cost much...
2032          */
2033         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2034         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2035         lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2036         for (i=0; i<numnodes; i++) {
2037                 lips[i].imbalance = lcp2_imbalances[i];
2038                 lips[i].pnn = i;
2039                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2040         }
2041         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2042               lcp2_cmp_imbalance_pnn);
2043
2044         again = false;
2045         for (i=0; i<numnodes; i++) {
2046                 /* This means that all nodes had 0 or 1 addresses, so
2047                  * can't be imbalanced.
2048                  */
2049                 if (lips[i].imbalance == 0) {
2050                         break;
2051                 }
2052
2053                 if (lcp2_failback_candidate(ipalloc_state,
2054                                             ipflags,
2055                                             all_ips,
2056                                             lips[i].pnn,
2057                                             lcp2_imbalances,
2058                                             rebalance_candidates)) {
2059                         again = true;
2060                         break;
2061                 }
2062         }
2063
2064         talloc_free(lips);
2065         if (again) {
2066                 goto try_again;
2067         }
2068 }
2069
2070 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
2071                                     struct ctdb_ipflags *ipflags,
2072                                     struct public_ip_list *all_ips)
2073 {
2074         struct public_ip_list *tmp_ip;
2075
2076         /* verify that the assigned nodes can serve that public ip
2077            and set it to -1 if not
2078         */
2079         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2080                 if (tmp_ip->pnn == -1) {
2081                         continue;
2082                 }
2083                 if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
2084                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2085                         /* this node can not serve this ip. */
2086                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2087                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2088                                            tmp_ip->pnn));
2089                         tmp_ip->pnn = -1;
2090                 }
2091         }
2092 }
2093
2094 static void ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
2095                                        struct ctdb_ipflags *ipflags,
2096                                        struct public_ip_list *all_ips)
2097 {
2098         struct public_ip_list *tmp_ip;
2099         int i, numnodes;
2100
2101         numnodes = talloc_array_length(ipflags);
2102
2103         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2104        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2105         *  always be allocated the same way for a specific set of
2106         *  available/unavailable nodes.
2107         */
2108
2109         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2110                 tmp_ip->pnn = i % numnodes;
2111         }
2112
2113         /* IP failback doesn't make sense with deterministic
2114          * IPs, since the modulo step above implicitly fails
2115          * back IPs to their "home" node.
2116          */
2117         if (1 == ipalloc_state->no_ip_failback) {
2118                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2119         }
2120
2121         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2122
2123         basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2124
2125         /* No failback here! */
2126 }
2127
2128 static void ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
2129                                           struct ctdb_ipflags *ipflags,
2130                                           struct public_ip_list *all_ips)
2131 {
2132         /* This should be pushed down into basic_failback. */
2133         struct public_ip_list *tmp_ip;
2134         int num_ips = 0;
2135         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2136                 num_ips++;
2137         }
2138
2139         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2140
2141         basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2142
2143         /* If we don't want IPs to fail back then don't rebalance IPs. */
2144         if (1 == ipalloc_state->no_ip_failback) {
2145                 return;
2146         }
2147
2148         /* Now, try to make sure the ip adresses are evenly distributed
2149            across the nodes.
2150         */
2151         basic_failback(ipalloc_state, ipflags, all_ips, num_ips);
2152 }
2153
2154 static void ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
2155                           struct ctdb_ipflags *ipflags,
2156                           struct public_ip_list *all_ips,
2157                           uint32_t *force_rebalance_nodes)
2158 {
2159         uint32_t *lcp2_imbalances;
2160         bool *rebalance_candidates;
2161         int numnodes, num_rebalance_candidates, i;
2162
2163         TALLOC_CTX *tmp_ctx = talloc_new(ipalloc_state);
2164
2165         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2166
2167         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2168                   &lcp2_imbalances, &rebalance_candidates);
2169
2170         lcp2_allocate_unassigned(ipalloc_state, ipflags, all_ips, lcp2_imbalances);
2171
2172         /* If we don't want IPs to fail back then don't rebalance IPs. */
2173         if (1 == ipalloc_state->no_ip_failback) {
2174                 goto finished;
2175         }
2176
2177         /* It is only worth continuing if we have suitable target
2178          * nodes to transfer IPs to.  This check is much cheaper than
2179          * continuing on...
2180          */
2181         numnodes = talloc_array_length(ipflags);
2182         num_rebalance_candidates = 0;
2183         for (i=0; i<numnodes; i++) {
2184                 if (rebalance_candidates[i]) {
2185                         num_rebalance_candidates++;
2186                 }
2187         }
2188         if (num_rebalance_candidates == 0) {
2189                 goto finished;
2190         }
2191
2192         /* Now, try to make sure the ip adresses are evenly distributed
2193            across the nodes.
2194         */
2195         lcp2_failback(ipalloc_state, ipflags, all_ips,
2196                       lcp2_imbalances, rebalance_candidates);
2197
2198 finished:
2199         talloc_free(tmp_ctx);
2200 }
2201
2202 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2203 {
2204         int i;
2205
2206         for (i=0;i<nodemap->num;i++) {
2207                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2208                         /* Found one completely healthy node */
2209                         return false;
2210                 }
2211         }
2212
2213         return true;
2214 }
2215
2216 /* The calculation part of the IP allocation algorithm. */
2217 static void ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
2218                                    struct ctdb_ipflags *ipflags,
2219                                    struct public_ip_list *all_ips,
2220                                    uint32_t *force_rebalance_nodes)
2221 {
2222         switch (ipalloc_state->algorithm) {
2223         case IPALLOC_LCP2:
2224                 ip_alloc_lcp2(ipalloc_state, ipflags, all_ips,
2225                               force_rebalance_nodes);
2226                 break;
2227         case IPALLOC_DETERMINISTIC:
2228                 ip_alloc_deterministic_ips(ipalloc_state, ipflags, all_ips);
2229                 break;
2230         case IPALLOC_NONDETERMINISTIC:
2231                 ip_alloc_nondeterministic_ips(ipalloc_state, ipflags, all_ips);
2232                break;
2233         }
2234
2235         /* at this point ->pnn is the node which will own each IP
2236            or -1 if there is no node that can cover this ip
2237         */
2238
2239         return;
2240 }
2241
/* State shared between get_tunable_from_nodes() and the callbacks it
 * registers for the per-node GET_TUNABLE replies.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages. */
        const char *tunable;
        /* talloc array of results, indexed by node number. */
        uint32_t *out;
        /* Set by the callbacks when the overall fetch must be
         * treated as failed.
         */
        bool fatal;
};
2247
2248 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2249                                  int32_t res, TDB_DATA outdata,
2250                                  void *callback)
2251 {
2252         struct get_tunable_callback_data *cd =
2253                 (struct get_tunable_callback_data *)callback;
2254         int size;
2255
2256         if (res != 0) {
2257                 /* Already handled in fail callback */
2258                 return;
2259         }
2260
2261         if (outdata.dsize != sizeof(uint32_t)) {
2262                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2263                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2264                                  (int)outdata.dsize));
2265                 cd->fatal = true;
2266                 return;
2267         }
2268
2269         size = talloc_array_length(cd->out);
2270         if (pnn >= size) {
2271                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2272                                  cd->tunable, pnn, size));
2273                 return;
2274         }
2275
2276                 
2277         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2278 }
2279
2280 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2281                                        int32_t res, TDB_DATA outdata,
2282                                        void *callback)
2283 {
2284         struct get_tunable_callback_data *cd =
2285                 (struct get_tunable_callback_data *)callback;
2286
2287         switch (res) {
2288         case -ETIME:
2289                 DEBUG(DEBUG_ERR,
2290                       ("Timed out getting tunable \"%s\" from node %d\n",
2291                        cd->tunable, pnn));
2292                 cd->fatal = true;
2293                 break;
2294         case -EINVAL:
2295         case -1:
2296                 DEBUG(DEBUG_WARNING,
2297                       ("Tunable \"%s\" not implemented on node %d\n",
2298                        cd->tunable, pnn));
2299                 break;
2300         default:
2301                 DEBUG(DEBUG_ERR,
2302                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2303                        cd->tunable, pnn));
2304                 cd->fatal = true;
2305         }
2306 }
2307
2308 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2309                                         TALLOC_CTX *tmp_ctx,
2310                                         struct ctdb_node_map_old *nodemap,
2311                                         const char *tunable,
2312                                         uint32_t default_value)
2313 {
2314         TDB_DATA data;
2315         struct ctdb_control_get_tunable *t;
2316         uint32_t *nodes;
2317         uint32_t *tvals;
2318         struct get_tunable_callback_data callback_data;
2319         int i;
2320
2321         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2322         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2323         for (i=0; i<nodemap->num; i++) {
2324                 tvals[i] = default_value;
2325         }
2326                 
2327         callback_data.out = tvals;
2328         callback_data.tunable = tunable;
2329         callback_data.fatal = false;
2330
2331         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2332         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2333         t = (struct ctdb_control_get_tunable *)data.dptr;
2334         t->length = strlen(tunable)+1;
2335         memcpy(t->name, tunable, t->length);
2336         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2337         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2338                                       nodes, 0, TAKEOVER_TIMEOUT(),
2339                                       false, data,
2340                                       get_tunable_callback,
2341                                       get_tunable_fail_callback,
2342                                       &callback_data) != 0) {
2343                 if (callback_data.fatal) {
2344                         talloc_free(tvals);
2345                         tvals = NULL;
2346                 }
2347         }
2348         talloc_free(nodes);
2349         talloc_free(data.dptr);
2350
2351         return tvals;
2352 }
2353
2354 /* Set internal flags for IP allocation:
2355  *   Clear ip flags
2356  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2357  *   Set NOIPHOST ip flag for each INACTIVE node
2358  *   if all nodes are disabled:
2359  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2360  *   else
2361  *     Set NOIPHOST ip flags for disabled nodes
2362  */
2363 static struct ctdb_ipflags *
2364 set_ipflags_internal(TALLOC_CTX *tmp_ctx,
2365                      struct ctdb_node_map_old *nodemap,
2366                      uint32_t *tval_noiptakeover,
2367                      uint32_t *tval_noiphostonalldisabled)
2368 {
2369         int i;
2370         struct ctdb_ipflags *ipflags;
2371
2372         /* Clear IP flags - implicit due to talloc_zero */
2373         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2374         if (ipflags == NULL) {
2375                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
2376                 return NULL;
2377         }
2378
2379         for (i=0;i<nodemap->num;i++) {
2380                 /* Can not take IPs on node with NoIPTakeover set */
2381                 if (tval_noiptakeover[i] != 0) {
2382                         ipflags[i].noiptakeover = true;
2383                 }
2384
2385                 /* Can not host IPs on INACTIVE node */
2386                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2387                         ipflags[i].noiphost = true;
2388                 }
2389         }
2390
2391         if (all_nodes_are_disabled(nodemap)) {
2392                 /* If all nodes are disabled, can not host IPs on node
2393                  * with NoIPHostOnAllDisabled set
2394                  */
2395                 for (i=0;i<nodemap->num;i++) {
2396                         if (tval_noiphostonalldisabled[i] != 0) {
2397                                 ipflags[i].noiphost = true;
2398                         }
2399                 }
2400         } else {
2401                 /* If some nodes are not disabled, then can not host
2402                  * IPs on DISABLED node
2403                  */
2404                 for (i=0;i<nodemap->num;i++) {
2405                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2406                                 ipflags[i].noiphost = true;
2407                         }
2408                 }
2409         }
2410
2411         return ipflags;
2412 }
2413
2414 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2415                                         TALLOC_CTX *tmp_ctx,
2416                                         struct ctdb_node_map_old *nodemap)
2417 {
2418         uint32_t *tval_noiptakeover;
2419         uint32_t *tval_noiphostonalldisabled;
2420         struct ctdb_ipflags *ipflags;
2421
2422
2423         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2424                                                    "NoIPTakeover", 0);
2425         if (tval_noiptakeover == NULL) {
2426                 return NULL;
2427         }
2428
2429         tval_noiphostonalldisabled =
2430                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2431                                        "NoIPHostOnAllDisabled", 0);
2432         if (tval_noiphostonalldisabled == NULL) {
2433                 /* Caller frees tmp_ctx */
2434                 return NULL;
2435         }
2436
2437         ipflags = set_ipflags_internal(tmp_ctx, nodemap,
2438                                        tval_noiptakeover,
2439                                        tval_noiphostonalldisabled);
2440
2441         talloc_free(tval_noiptakeover);
2442         talloc_free(tval_noiphostonalldisabled);
2443
2444         return ipflags;
2445 }
2446
2447 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2448                                                  TALLOC_CTX *mem_ctx)
2449 {
2450         struct ipalloc_state *ipalloc_state =
2451                 talloc_zero(mem_ctx, struct ipalloc_state);
2452         if (ipalloc_state == NULL) {
2453                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2454                 return NULL;
2455         }
2456
2457         ipalloc_state->num = ctdb->num_nodes;
2458         ipalloc_state->known_public_ips =
2459                 talloc_zero_array(ipalloc_state,
2460                                   struct ctdb_public_ip_list_old *,
2461                                   ipalloc_state->num);
2462         if (ipalloc_state->known_public_ips == NULL) {
2463                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2464                 talloc_free(ipalloc_state);
2465                 return NULL;
2466         }
2467         ipalloc_state->available_public_ips =
2468                 talloc_zero_array(ipalloc_state,
2469                                   struct ctdb_public_ip_list_old *,
2470                                   ipalloc_state->num);
2471         if (ipalloc_state->available_public_ips == NULL) {
2472                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2473                 talloc_free(ipalloc_state);
2474                 return NULL;
2475         }
2476
2477         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2478                 ipalloc_state->algorithm = IPALLOC_LCP2;
2479         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2480                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2481         } else {
2482                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2483         }
2484
2485         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2486
2487         return ipalloc_state;
2488 }
2489
2490 struct iprealloc_callback_data {
2491         bool *retry_nodes;
2492         int retry_count;
2493         client_async_callback fail_callback;
2494         void *fail_callback_data;
2495         struct ctdb_node_map_old *nodemap;
2496 };
2497
2498 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2499                                         int32_t res, TDB_DATA outdata,
2500                                         void *callback)
2501 {
2502         int numnodes;
2503         struct iprealloc_callback_data *cd =
2504                 (struct iprealloc_callback_data *)callback;
2505
2506         numnodes = talloc_array_length(cd->retry_nodes);
2507         if (pnn > numnodes) {
2508                 DEBUG(DEBUG_ERR,
2509                       ("ipreallocated failure from node %d, "
2510                        "but only %d nodes in nodemap\n",
2511                        pnn, numnodes));
2512                 return;
2513         }
2514
2515         /* Can't run the "ipreallocated" event on a INACTIVE node */
2516         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2517                 DEBUG(DEBUG_WARNING,
2518                       ("ipreallocated failed on inactive node %d, ignoring\n",
2519                        pnn));
2520                 return;
2521         }
2522
2523         switch (res) {
2524         case -ETIME:
2525                 /* If the control timed out then that's a real error,
2526                  * so call the real fail callback
2527                  */
2528                 if (cd->fail_callback) {
2529                         cd->fail_callback(ctdb, pnn, res, outdata,
2530                                           cd->fail_callback_data);
2531                 } else {
2532                         DEBUG(DEBUG_WARNING,
2533                               ("iprealloc timed out but no callback registered\n"));
2534                 }
2535                 break;
2536         default:
2537                 /* If not a timeout then either the ipreallocated
2538                  * eventscript (or some setup) failed.  This might
2539                  * have failed because the IPREALLOCATED control isn't
2540                  * implemented - right now there is no way of knowing
2541                  * because the error codes are all folded down to -1.
2542                  * Consider retrying using EVENTSCRIPT control...
2543                  */
2544                 DEBUG(DEBUG_WARNING,
2545                       ("ipreallocated failure from node %d, flagging retry\n",
2546                        pnn));
2547                 cd->retry_nodes[pnn] = true;
2548                 cd->retry_count++;
2549         }
2550 }
2551
2552 struct takeover_callback_data {
2553         bool *node_failed;
2554         client_async_callback fail_callback;
2555         void *fail_callback_data;
2556         struct ctdb_node_map_old *nodemap;
2557 };
2558
2559 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2560                                        uint32_t node_pnn, int32_t res,
2561                                        TDB_DATA outdata, void *callback_data)
2562 {
2563         struct takeover_callback_data *cd =
2564                 talloc_get_type_abort(callback_data,
2565                                       struct takeover_callback_data);
2566         int i;
2567
2568         for (i = 0; i < cd->nodemap->num; i++) {
2569                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2570                         break;
2571                 }
2572         }
2573
2574         if (i == cd->nodemap->num) {
2575                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2576                 return;
2577         }
2578
2579         if (!cd->node_failed[i]) {
2580                 cd->node_failed[i] = true;
2581                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2582                                   cd->fail_callback_data);
2583         }
2584 }
2585
2586 /*
2587   make any IP alias changes for public addresses that are necessary 
2588  */
2589 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2590                       uint32_t *force_rebalance_nodes,
2591                       client_async_callback fail_callback, void *callback_data)
2592 {
2593         int i, j, ret;
2594         struct ctdb_public_ip ip;
2595         uint32_t *nodes;
2596         struct public_ip_list *all_ips, *tmp_ip;
2597         TDB_DATA data;
2598         struct timeval timeout;
2599         struct client_async_data *async_data;
2600         struct ctdb_client_control_state *state;
2601         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2602         struct ctdb_ipflags *ipflags;
2603         struct ipalloc_state *ipalloc_state;
2604         struct takeover_callback_data *takeover_data;
2605         struct iprealloc_callback_data iprealloc_data;
2606         bool *retry_data;
2607         bool can_host_ips;
2608
2609         /*
2610          * ip failover is completely disabled, just send out the 
2611          * ipreallocated event.
2612          */
2613         if (ctdb->tunable.disable_ip_failover != 0) {
2614                 goto ipreallocated;
2615         }
2616
2617         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2618         if (ipalloc_state == NULL) {
2619                 talloc_free(tmp_ctx);
2620                 return -1;
2621         }
2622
2623         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2624         if (ipflags == NULL) {
2625                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2626                 talloc_free(tmp_ctx);
2627                 return -1;
2628         }
2629
2630         /* Fetch known/available public IPs from each active node */
2631         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2632         if (ret != 0) {
2633                 talloc_free(tmp_ctx);
2634                 return -1;
2635         }
2636
2637         /* Short-circuit IP allocation if no node has available IPs */
2638         can_host_ips = false;
2639         for (i=0; i < ipalloc_state->num; i++) {
2640                 if (ipalloc_state->available_public_ips[i] != NULL) {
2641                         can_host_ips = true;
2642                 }
2643         }
2644         if (!can_host_ips) {
2645                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2646                 return 0;
2647         }
2648
2649         /* since nodes only know about those public addresses that
2650            can be served by that particular node, no single node has
2651            a full list of all public addresses that exist in the cluster.
2652            Walk over all node structures and create a merged list of
2653            all public addresses that exist in the cluster.
2654
2655            keep the tree of ips around as ctdb->ip_tree
2656         */
2657         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2658
2659         /* Do the IP reassignment calculations */
2660         ctdb_takeover_run_core(ipalloc_state, ipflags,
2661                                all_ips, force_rebalance_nodes);
2662
2663         /* Now tell all nodes to release any public IPs should not
2664          * host.  This will be a NOOP on nodes that don't currently
2665          * hold the given IP.
2666          */
2667         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2668         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2669
2670         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2671                                                        bool, nodemap->num);
2672         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2673         takeover_data->fail_callback = fail_callback;
2674         takeover_data->fail_callback_data = callback_data;
2675         takeover_data->nodemap = nodemap;
2676
2677         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2678         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2679
2680         async_data->fail_callback = takeover_run_fail_callback;
2681         async_data->callback_data = takeover_data;
2682
2683         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2684
2685         /* Send a RELEASE_IP to all nodes that should not be hosting
2686          * each IP.  For each IP, all but one of these will be
2687          * redundant.  However, the redundant ones are used to tell
2688          * nodes which node should be hosting the IP so that commands
2689          * like "ctdb ip" can display a particular nodes idea of who
2690          * is hosting what. */
2691         for (i=0;i<nodemap->num;i++) {
2692                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2693                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2694                         continue;
2695                 }
2696
2697                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2698                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2699                                 /* This node should be serving this
2700                                    vnn so don't tell it to release the ip
2701                                 */
2702                                 continue;
2703                         }
2704                         ip.pnn  = tmp_ip->pnn;
2705                         ip.addr = tmp_ip->addr;
2706
2707                         timeout = TAKEOVER_TIMEOUT();
2708                         data.dsize = sizeof(ip);
2709                         data.dptr  = (uint8_t *)&ip;
2710                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2711                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2712                                                   data, async_data,
2713                                                   &timeout, NULL);
2714                         if (state == NULL) {
2715                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2716                                 talloc_free(tmp_ctx);
2717                                 return -1;
2718                         }
2719
2720                         ctdb_client_async_add(async_data, state);
2721                 }
2722         }
2723         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2724                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2725                 talloc_free(tmp_ctx);
2726                 return -1;
2727         }
2728         talloc_free(async_data);
2729
2730
2731         /* For each IP, send a TAKOVER_IP to the node that should be
2732          * hosting it.  Many of these will often be redundant (since
2733          * the allocation won't have changed) but they can be useful
2734          * to recover from inconsistencies. */
2735         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2736         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2737
2738         async_data->fail_callback = fail_callback;
2739         async_data->callback_data = callback_data;
2740
2741         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2742                 if (tmp_ip->pnn == -1) {
2743                         /* this IP won't be taken over */
2744                         continue;
2745                 }
2746
2747                 ip.pnn  = tmp_ip->pnn;
2748                 ip.addr = tmp_ip->addr;
2749
2750                 timeout = TAKEOVER_TIMEOUT();
2751                 data.dsize = sizeof(ip);
2752                 data.dptr  = (uint8_t *)&ip;
2753                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2754                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2755                                           data, async_data, &timeout, NULL);
2756                 if (state == NULL) {
2757                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2758                         talloc_free(tmp_ctx);
2759                         return -1;
2760                 }
2761
2762                 ctdb_client_async_add(async_data, state);
2763         }
2764         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2765                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2766                 talloc_free(tmp_ctx);
2767                 return -1;
2768         }
2769
2770 ipreallocated:
2771         /*
2772          * Tell all nodes to run eventscripts to process the
2773          * "ipreallocated" event.  This can do a lot of things,
2774          * including restarting services to reconfigure them if public
2775          * IPs have moved.  Once upon a time this event only used to
2776          * update natgw.
2777          */
2778         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2779         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2780         iprealloc_data.retry_nodes = retry_data;
2781         iprealloc_data.retry_count = 0;
2782         iprealloc_data.fail_callback = fail_callback;
2783         iprealloc_data.fail_callback_data = callback_data;
2784         iprealloc_data.nodemap = nodemap;
2785
2786         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2787         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2788                                         nodes, 0, TAKEOVER_TIMEOUT(),
2789                                         false, tdb_null,
2790                                         NULL, iprealloc_fail_callback,
2791                                         &iprealloc_data);
2792         if (ret != 0) {
2793                 /* If the control failed then we should retry to any
2794                  * nodes flagged by iprealloc_fail_callback using the
2795                  * EVENTSCRIPT control.  This is a best-effort at
2796                  * backward compatiblity when running a mixed cluster
2797                  * where some nodes have not yet been upgraded to
2798                  * support the IPREALLOCATED control.
2799                  */
2800                 DEBUG(DEBUG_WARNING,
2801                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2802
2803                 nodes = talloc_array(tmp_ctx, uint32_t,
2804                                      iprealloc_data.retry_count);
2805                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2806
2807                 j = 0;
2808                 for (i=0; i<nodemap->num; i++) {
2809                         if (iprealloc_data.retry_nodes[i]) {
2810                                 nodes[j] = i;
2811                                 j++;
2812                         }
2813                 }
2814
2815                 data.dptr  = discard_const("ipreallocated");
2816                 data.dsize = strlen((char *)data.dptr) + 1; 
2817                 ret = ctdb_client_async_control(ctdb,
2818                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2819                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2820                                                 false, data,
2821                                                 NULL, fail_callback,
2822                                                 callback_data);
2823                 if (ret != 0) {
2824                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2825                 }
2826         }
2827
2828         talloc_free(tmp_ctx);
2829         return ret;
2830 }
2831
2832
2833 /*
2834   destroy a ctdb_client_ip structure
2835  */
2836 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2837 {
2838         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2839                 ctdb_addr_to_str(&ip->addr),
2840                 ntohs(ip->addr.ip.sin_port),
2841                 ip->client_id));
2842
2843         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2844         return 0;
2845 }
2846
2847 /*
2848   called by a client to inform us of a TCP connection that it is managing
2849   that should tickled with an ACK when IP takeover is done
2850  */
2851 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2852                                 TDB_DATA indata)
2853 {
2854         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2855         struct ctdb_connection *tcp_sock = NULL;
2856         struct ctdb_tcp_list *tcp;
2857         struct ctdb_connection t;
2858         int ret;
2859         TDB_DATA data;
2860         struct ctdb_client_ip *ip;
2861         struct ctdb_vnn *vnn;
2862         ctdb_sock_addr addr;
2863
2864         /* If we don't have public IPs, tickles are useless */
2865         if (ctdb->vnn == NULL) {
2866                 return 0;
2867         }
2868
2869         tcp_sock = (struct ctdb_connection *)indata.dptr;
2870
2871         addr = tcp_sock->src;
2872         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2873         addr = tcp_sock->dst;
2874         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2875
2876         ZERO_STRUCT(addr);
2877         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2878         vnn = find_public_ip_vnn(ctdb, &addr);
2879         if (vnn == NULL) {
2880                 switch (addr.sa.sa_family) {
2881                 case AF_INET:
2882                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2883                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2884                                         ctdb_addr_to_str(&addr)));
2885                         }
2886                         break;
2887                 case AF_INET6:
2888                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2889                                 ctdb_addr_to_str(&addr)));
2890                         break;
2891                 default:
2892                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2893                 }
2894
2895                 return 0;
2896         }
2897
2898         if (vnn->pnn != ctdb->pnn) {
2899                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2900                         ctdb_addr_to_str(&addr),
2901                         client_id, client->pid));
2902                 /* failing this call will tell smbd to die */
2903                 return -1;
2904         }
2905
2906         ip = talloc(client, struct ctdb_client_ip);
2907         CTDB_NO_MEMORY(ctdb, ip);
2908
2909         ip->ctdb      = ctdb;
2910         ip->addr      = addr;
2911         ip->client_id = client_id;
2912         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2913         DLIST_ADD(ctdb->client_ip_list, ip);
2914
2915         tcp = talloc(client, struct ctdb_tcp_list);
2916         CTDB_NO_MEMORY(ctdb, tcp);
2917
2918         tcp->connection.src = tcp_sock->src;
2919         tcp->connection.dst = tcp_sock->dst;
2920
2921         DLIST_ADD(client->tcp_list, tcp);
2922
2923         t.src = tcp_sock->src;
2924         t.dst = tcp_sock->dst;
2925
2926         data.dptr = (uint8_t *)&t;
2927         data.dsize = sizeof(t);
2928
2929         switch (addr.sa.sa_family) {
2930         case AF_INET:
2931                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2932                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2933                         ctdb_addr_to_str(&tcp_sock->src),
2934                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2935                 break;
2936         case AF_INET6:
2937                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2938                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2939                         ctdb_addr_to_str(&tcp_sock->src),
2940                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2941                 break;
2942         default:
2943                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2944         }
2945
2946
2947         /* tell all nodes about this tcp connection */
2948         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2949                                        CTDB_CONTROL_TCP_ADD,
2950                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2951         if (ret != 0) {
2952                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2953                 return -1;
2954         }
2955
2956         return 0;
2957 }
2958
2959 /*
2960   find a tcp address on a list
2961  */
2962 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2963                                            struct ctdb_connection *tcp)
2964 {
2965         int i;
2966
2967         if (array == NULL) {
2968                 return NULL;
2969         }
2970
2971         for (i=0;i<array->num;i++) {
2972                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2973                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2974                         return &array->connections[i];
2975                 }
2976         }
2977         return NULL;
2978 }
2979
2980
2981
2982 /*
2983   called by a daemon to inform us of a TCP connection that one of its
2984   clients managing that should tickled with an ACK when IP takeover is
2985   done
2986  */
2987 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2988 {
2989         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2990         struct ctdb_tcp_array *tcparray;
2991         struct ctdb_connection tcp;
2992         struct ctdb_vnn *vnn;
2993
2994         /* If we don't have public IPs, tickles are useless */
2995         if (ctdb->vnn == NULL) {
2996                 return 0;
2997         }
2998
2999         vnn = find_public_ip_vnn(ctdb, &p->dst);
3000         if (vnn == NULL) {
3001                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3002                         ctdb_addr_to_str(&p->dst)));
3003
3004                 return -1;
3005         }
3006
3007
3008         tcparray = vnn->tcp_array;
3009
3010         /* If this is the first tickle */
3011         if (tcparray == NULL) {
3012                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3013                 CTDB_NO_MEMORY(ctdb, tcparray);
3014                 vnn->tcp_array = tcparray;
3015
3016                 tcparray->num = 0;
3017                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3018                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3019
3020                 tcparray->connections[tcparray->num].src = p->src;
3021                 tcparray->connections[tcparray->num].dst = p->dst;
3022                 tcparray->num++;
3023
3024                 if (tcp_update_needed) {
3025                         vnn->tcp_update_needed = true;
3026                 }
3027                 return 0;
3028         }
3029
3030
3031         /* Do we already have this tickle ?*/
3032         tcp.src = p->src;
3033         tcp.dst = p->dst;
3034         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3035                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3036                         ctdb_addr_to_str(&tcp.dst),
3037                         ntohs(tcp.dst.ip.sin_port),
3038                         vnn->pnn));
3039                 return 0;
3040         }
3041
3042         /* A new tickle, we must add it to the array */
3043         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3044                                         struct ctdb_connection,
3045                                         tcparray->num+1);
3046         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3047
3048         tcparray->connections[tcparray->num].src = p->src;
3049         tcparray->connections[tcparray->num].dst = p->dst;
3050         tcparray->num++;
3051
3052         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3053                 ctdb_addr_to_str(&tcp.dst),
3054                 ntohs(tcp.dst.ip.sin_port),
3055                 vnn->pnn));
3056
3057         if (tcp_update_needed) {
3058                 vnn->tcp_update_needed = true;
3059         }
3060
3061         return 0;
3062 }
3063
3064
3065 /*
3066   called by a daemon to inform us of a TCP connection that one of its
3067   clients managing that should tickled with an ACK when IP takeover is
3068   done
3069  */
3070 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3071 {
3072         struct ctdb_connection *tcpp;
3073         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3074
3075         if (vnn == NULL) {
3076                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3077                         ctdb_addr_to_str(&conn->dst)));
3078                 return;
3079         }
3080
3081         /* if the array is empty we cant remove it
3082            and we don't need to do anything
3083          */
3084         if (vnn->tcp_array == NULL) {
3085                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3086                         ctdb_addr_to_str(&conn->dst),
3087                         ntohs(conn->dst.ip.sin_port)));
3088                 return;
3089         }
3090
3091
3092         /* See if we know this connection
3093            if we don't know this connection  then we dont need to do anything
3094          */
3095         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3096         if (tcpp == NULL) {
3097                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3098                         ctdb_addr_to_str(&conn->dst),
3099                         ntohs(conn->dst.ip.sin_port)));
3100                 return;
3101         }
3102
3103
3104         /* We need to remove this entry from the array.
3105            Instead of allocating a new array and copying data to it
3106            we cheat and just copy the last entry in the existing array
3107            to the entry that is to be removed and just shring the 
3108            ->num field
3109          */
3110         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3111         vnn->tcp_array->num--;
3112
3113         /* If we deleted the last entry we also need to remove the entire array
3114          */
3115         if (vnn->tcp_array->num == 0) {
3116                 talloc_free(vnn->tcp_array);
3117                 vnn->tcp_array = NULL;
3118         }               
3119
3120         vnn->tcp_update_needed = true;
3121
3122         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3123                 ctdb_addr_to_str(&conn->src),
3124                 ntohs(conn->src.ip.sin_port)));
3125 }
3126
3127
3128 /*
3129   called by a daemon to inform us of a TCP connection that one of its
3130   clients used are no longer needed in the tickle database
3131  */
3132 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3133 {
3134         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3135
3136         /* If we don't have public IPs, tickles are useless */
3137         if (ctdb->vnn == NULL) {
3138                 return 0;
3139         }
3140
3141         ctdb_remove_connection(ctdb, conn);
3142
3143         return 0;
3144 }
3145
3146
3147 /*
3148   Called when another daemon starts - causes all tickles for all
3149   public addresses we are serving to be sent to the new node on the
3150   next check.  This actually causes the next scheduled call to
3151   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3152   doesn't require careful error handling.
3153  */
3154 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3155 {
3156         struct ctdb_vnn *vnn;
3157
3158         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3159                            (unsigned long) pnn));
3160
3161         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3162                 vnn->tcp_update_needed = true;
3163         }
3164
3165         return 0;
3166 }
3167
3168
3169 /*
3170   called when a client structure goes away - hook to remove
3171   elements from the tcp_list in all daemons
3172  */
3173 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3174 {
3175         while (client->tcp_list) {
3176                 struct ctdb_tcp_list *tcp = client->tcp_list;
3177                 DLIST_REMOVE(client->tcp_list, tcp);
3178                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3179         }
3180 }
3181
3182
3183 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3184 {
3185         struct ctdb_vnn *vnn;
3186         int count = 0;
3187
3188         if (ctdb->tunable.disable_ip_failover == 1) {
3189                 return;
3190         }
3191
3192         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3193                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3194                         ctdb_vnn_unassign_iface(ctdb, vnn);
3195                         continue;
3196                 }
3197                 if (!vnn->iface) {
3198                         continue;
3199                 }
3200
3201                 /* Don't allow multiple releases at once.  Some code,
3202                  * particularly ctdb_tickle_sentenced_connections() is
3203                  * not re-entrant */
3204                 if (vnn->update_in_flight) {
3205                         DEBUG(DEBUG_WARNING,
3206                               (__location__
3207                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3208                                     ctdb_addr_to_str(&vnn->public_address),
3209                                     vnn->public_netmask_bits,
3210                                     ctdb_vnn_iface_string(vnn)));
3211                         continue;
3212                 }
3213                 vnn->update_in_flight = true;
3214
3215                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3216                                     ctdb_addr_to_str(&vnn->public_address),
3217                                     vnn->public_netmask_bits,
3218                                     ctdb_vnn_iface_string(vnn)));
3219
3220                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3221                                   ctdb_vnn_iface_string(vnn),
3222                                   ctdb_addr_to_str(&vnn->public_address),
3223                                   vnn->public_netmask_bits);
3224                 release_kill_clients(ctdb, &vnn->public_address);
3225                 ctdb_vnn_unassign_iface(ctdb, vnn);
3226                 vnn->update_in_flight = false;
3227                 count++;
3228         }
3229
3230         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3231 }
3232
3233
3234 /*
3235   get list of public IPs
3236  */
3237 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3238                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3239 {
3240         int i, num, len;
3241         struct ctdb_public_ip_list_old *ips;
3242         struct ctdb_vnn *vnn;
3243         bool only_available = false;
3244
3245         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3246                 only_available = true;
3247         }
3248
3249         /* count how many public ip structures we have */
3250         num = 0;
3251         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3252                 num++;
3253         }
3254
3255         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3256                 num*sizeof(struct ctdb_public_ip);
3257         ips = talloc_zero_size(outdata, len);
3258         CTDB_NO_MEMORY(ctdb, ips);
3259
3260         i = 0;
3261         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3262                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3263                         continue;
3264                 }
3265                 ips->ips[i].pnn  = vnn->pnn;
3266                 ips->ips[i].addr = vnn->public_address;
3267                 i++;
3268         }
3269         ips->num = i;
3270         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3271                 i*sizeof(struct ctdb_public_ip);
3272
3273         outdata->dsize = len;
3274         outdata->dptr  = (uint8_t *)ips;
3275
3276         return 0;
3277 }
3278
3279
3280 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3281                                         struct ctdb_req_control_old *c,
3282                                         TDB_DATA indata,
3283                                         TDB_DATA *outdata)
3284 {
3285         int i, num, len;
3286         ctdb_sock_addr *addr;
3287         struct ctdb_public_ip_info_old *info;
3288         struct ctdb_vnn *vnn;
3289
3290         addr = (ctdb_sock_addr *)indata.dptr;
3291
3292         vnn = find_public_ip_vnn(ctdb, addr);
3293         if (vnn == NULL) {
3294                 /* if it is not a public ip   it could be our 'single ip' */
3295                 if (ctdb->single_ip_vnn) {
3296                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3297                                 vnn = ctdb->single_ip_vnn;
3298                         }
3299                 }
3300         }
3301         if (vnn == NULL) {
3302                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3303                                  "'%s'not a public address\n",
3304                                  ctdb_addr_to_str(addr)));
3305                 return -1;
3306         }
3307
3308         /* count how many public ip structures we have */
3309         num = 0;
3310         for (;vnn->ifaces[num];) {
3311                 num++;
3312         }
3313
3314         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3315                 num*sizeof(struct ctdb_iface);
3316         info = talloc_zero_size(outdata, len);
3317         CTDB_NO_MEMORY(ctdb, info);
3318
3319         info->ip.addr = vnn->public_address;
3320         info->ip.pnn = vnn->pnn;
3321         info->active_idx = 0xFFFFFFFF;
3322
3323         for (i=0; vnn->ifaces[i]; i++) {
3324                 struct ctdb_interface *cur;
3325
3326                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3327                 if (cur == NULL) {
3328                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3329                                            vnn->ifaces[i]));
3330                         return -1;
3331                 }
3332                 if (vnn->iface == cur) {
3333                         info->active_idx = i;
3334                 }
3335                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3336                 info->ifaces[i].link_state = cur->link_up;
3337                 info->ifaces[i].references = cur->references;
3338         }
3339         info->num = i;
3340         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3341                 i*sizeof(struct ctdb_iface);
3342
3343         outdata->dsize = len;
3344         outdata->dptr  = (uint8_t *)info;
3345
3346         return 0;
3347 }
3348
3349 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3350                                 struct ctdb_req_control_old *c,
3351                                 TDB_DATA *outdata)
3352 {
3353         int i, num, len;
3354         struct ctdb_iface_list_old *ifaces;
3355         struct ctdb_interface *cur;
3356
3357         /* count how many public ip structures we have */
3358         num = 0;
3359         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3360                 num++;
3361         }
3362
3363         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3364                 num*sizeof(struct ctdb_iface);
3365         ifaces = talloc_zero_size(outdata, len);
3366         CTDB_NO_MEMORY(ctdb, ifaces);
3367
3368         i = 0;
3369         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3370                 strcpy(ifaces->ifaces[i].name, cur->name);
3371                 ifaces->ifaces[i].link_state = cur->link_up;
3372                 ifaces->ifaces[i].references = cur->references;
3373                 i++;
3374         }
3375         ifaces->num = i;
3376         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3377                 i*sizeof(struct ctdb_iface);
3378
3379         outdata->dsize = len;
3380         outdata->dptr  = (uint8_t *)ifaces;
3381
3382         return 0;
3383 }
3384
3385 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3386                                     struct ctdb_req_control_old *c,
3387                                     TDB_DATA indata)
3388 {
3389         struct ctdb_iface *info;
3390         struct ctdb_interface *iface;
3391         bool link_up = false;
3392
3393         info = (struct ctdb_iface *)indata.dptr;
3394
3395         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3396                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3397                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3398                                   len, len, info->name));
3399                 return -1;
3400         }
3401
3402         switch (info->link_state) {
3403         case 0:
3404                 link_up = false;
3405                 break;
3406         case 1:
3407                 link_up = true;
3408                 break;
3409         default:
3410                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3411                                   (unsigned int)info->link_state));
3412                 return -1;
3413         }
3414
3415         if (info->references != 0) {
3416                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3417                                   (unsigned int)info->references));
3418                 return -1;
3419         }
3420
3421         iface = ctdb_find_iface(ctdb, info->name);
3422         if (iface == NULL) {
3423                 return -1;
3424         }
3425
3426         if (link_up == iface->link_up) {
3427                 return 0;
3428         }
3429
3430         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3431               ("iface[%s] has changed it's link status %s => %s\n",
3432                iface->name,
3433                iface->link_up?"up":"down",
3434                link_up?"up":"down"));
3435
3436         iface->link_up = link_up;
3437         return 0;
3438 }
3439
3440
3441 /* 
3442    structure containing the listening socket and the list of tcp connections
3443    that the ctdb daemon is to kill
3444 */
3445 struct ctdb_kill_tcp {
3446         struct ctdb_vnn *vnn;
3447         struct ctdb_context *ctdb;
3448         int capture_fd;
3449         struct tevent_fd *fde;
3450         trbt_tree_t *connections;
3451         void *private_data;
3452 };
3453
3454 /*
3455   a tcp connection that is to be killed
3456  */
3457 struct ctdb_killtcp_con {
3458         ctdb_sock_addr src_addr;
3459         ctdb_sock_addr dst_addr;
3460         int count;
3461         struct ctdb_kill_tcp *killtcp;
3462 };
3463
3464 /* this function is used to create a key to represent this socketpair
3465    in the killtcp tree.
3466    this key is used to insert and lookup matching socketpairs that are
3467    to be tickled and RST
3468 */
3469 #define KILLTCP_KEYLEN  10
3470 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3471 {
3472         static uint32_t key[KILLTCP_KEYLEN];
3473
3474         bzero(key, sizeof(key));
3475
3476         if (src->sa.sa_family != dst->sa.sa_family) {
3477                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3478                 return key;
3479         }
3480         
3481         switch (src->sa.sa_family) {
3482         case AF_INET:
3483                 key[0]  = dst->ip.sin_addr.s_addr;
3484                 key[1]  = src->ip.sin_addr.s_addr;
3485                 key[2]  = dst->ip.sin_port;
3486                 key[3]  = src->ip.sin_port;
3487                 break;
3488         case AF_INET6: {
3489                 uint32_t *dst6_addr32 =
3490                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3491                 uint32_t *src6_addr32 =
3492                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3493                 key[0]  = dst6_addr32[3];
3494                 key[1]  = src6_addr32[3];
3495                 key[2]  = dst6_addr32[2];
3496                 key[3]  = src6_addr32[2];
3497                 key[4]  = dst6_addr32[1];
3498                 key[5]  = src6_addr32[1];
3499                 key[6]  = dst6_addr32[0];
3500                 key[7]  = src6_addr32[0];
3501                 key[8]  = dst->ip6.sin6_port;
3502                 key[9]  = src->ip6.sin6_port;
3503                 break;
3504         }
3505         default:
3506                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3507                 return key;
3508         }
3509
3510         return key;
3511 }
3512
3513 /*
3514   called when we get a read event on the raw socket
3515  */
3516 static void capture_tcp_handler(struct tevent_context *ev,
3517                                 struct tevent_fd *fde,
3518                                 uint16_t flags, void *private_data)
3519 {
3520         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3521         struct ctdb_killtcp_con *con;
3522         ctdb_sock_addr src, dst;
3523         uint32_t ack_seq, seq;
3524
3525         if (!(flags & TEVENT_FD_READ)) {
3526                 return;
3527         }
3528
3529         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3530                                 killtcp->private_data,
3531                                 &src, &dst,
3532                                 &ack_seq, &seq) != 0) {
3533                 /* probably a non-tcp ACK packet */
3534                 return;
3535         }
3536
3537         /* check if we have this guy in our list of connections
3538            to kill
3539         */
3540         con = trbt_lookuparray32(killtcp->connections, 
3541                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3542         if (con == NULL) {
3543                 /* no this was some other packet we can just ignore */
3544                 return;
3545         }
3546
3547         /* This one has been tickled !
3548            now reset him and remove him from the list.
3549          */
3550         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3551                 ntohs(con->dst_addr.ip.sin_port),
3552                 ctdb_addr_to_str(&con->src_addr),
3553                 ntohs(con->src_addr.ip.sin_port)));
3554
3555         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3556         talloc_free(con);
3557 }
3558
3559
3560 /* when traversing the list of all tcp connections to send tickle acks to
3561    (so that we can capture the ack coming back and kill the connection
3562     by a RST)
3563    this callback is called for each connection we are currently trying to kill
3564 */
3565 static int tickle_connection_traverse(void *param, void *data)
3566 {
3567         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3568
3569         /* have tried too many times, just give up */
3570         if (con->count >= 5) {
3571                 /* can't delete in traverse: reparent to delete_cons */
3572                 talloc_steal(param, con);
3573                 return 0;
3574         }
3575
3576         /* othervise, try tickling it again */
3577         con->count++;
3578         ctdb_sys_send_tcp(
3579                 (ctdb_sock_addr *)&con->dst_addr,
3580                 (ctdb_sock_addr *)&con->src_addr,
3581                 0, 0, 0);
3582         return 0;
3583 }
3584
3585
3586 /* 
3587    called every second until all sentenced connections have been reset
3588  */
3589 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3590                                               struct tevent_timer *te,
3591                                               struct timeval t, void *private_data)
3592 {
3593         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3594         void *delete_cons = talloc_new(NULL);
3595
3596         /* loop over all connections sending tickle ACKs */
3597         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3598
3599         /* now we've finished traverse, it's safe to do deletion. */
3600         talloc_free(delete_cons);
3601
3602         /* If there are no more connections to kill we can remove the
3603            entire killtcp structure
3604          */
3605         if ( (killtcp->connections == NULL) || 
3606              (killtcp->connections->root == NULL) ) {
3607                 talloc_free(killtcp);
3608                 return;
3609         }
3610
3611         /* try tickling them again in a seconds time
3612          */
3613         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3614                          timeval_current_ofs(1, 0),
3615                          ctdb_tickle_sentenced_connections, killtcp);
3616 }
3617
3618 /*
3619   destroy the killtcp structure
3620  */
3621 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3622 {
3623         struct ctdb_vnn *tmpvnn;
3624
3625         /* verify that this vnn is still active */
3626         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3627                 if (tmpvnn == killtcp->vnn) {
3628                         break;
3629                 }
3630         }
3631
3632         if (tmpvnn == NULL) {
3633                 return 0;
3634         }
3635
3636         if (killtcp->vnn->killtcp != killtcp) {
3637                 return 0;
3638         }
3639
3640         killtcp->vnn->killtcp = NULL;
3641
3642         return 0;
3643 }
3644
3645
/* Insert callback for the killtcp tree: the new connection structure
   (parm) unconditionally replaces whatever was stored before (data).

   The old structure is deliberately not freed here.
   NOTE(review): the original comment states that the old structure has
   been talloc_stolen by the same tree node and is deleted along with
   the new data - that happens in the tree code, not visible here.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
3657
3658 /*
3659   add a tcp socket to the list of connections we want to RST
3660  */
3661 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3662                                        ctdb_sock_addr *s,
3663                                        ctdb_sock_addr *d)
3664 {
3665         ctdb_sock_addr src, dst;
3666         struct ctdb_kill_tcp *killtcp;
3667         struct ctdb_killtcp_con *con;
3668         struct ctdb_vnn *vnn;
3669
3670         ctdb_canonicalize_ip(s, &src);
3671         ctdb_canonicalize_ip(d, &dst);
3672
3673         vnn = find_public_ip_vnn(ctdb, &dst);
3674         if (vnn == NULL) {
3675                 vnn = find_public_ip_vnn(ctdb, &src);
3676         }
3677         if (vnn == NULL) {
3678                 /* if it is not a public ip   it could be our 'single ip' */
3679                 if (ctdb->single_ip_vnn) {
3680                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3681                                 vnn = ctdb->single_ip_vnn;
3682                         }
3683                 }
3684         }
3685         if (vnn == NULL) {
3686                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3687                 return -1;
3688         }
3689
3690         killtcp = vnn->killtcp;
3691         
3692         /* If this is the first connection to kill we must allocate
3693            a new structure
3694          */
3695         if (killtcp == NULL) {
3696                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3697                 CTDB_NO_MEMORY(ctdb, killtcp);
3698
3699                 killtcp->vnn         = vnn;
3700                 killtcp->ctdb        = ctdb;
3701                 killtcp->capture_fd  = -1;
3702                 killtcp->connections = trbt_create(killtcp, 0);
3703
3704                 vnn->killtcp         = killtcp;
3705                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3706         }
3707
3708
3709
3710         /* create a structure that describes this connection we want to
3711            RST and store it in killtcp->connections
3712         */
3713         con = talloc(killtcp, struct ctdb_killtcp_con);
3714         CTDB_NO_MEMORY(ctdb, con);
3715         con->src_addr = src;
3716         con->dst_addr = dst;
3717         con->count    = 0;
3718         con->killtcp  = killtcp;
3719
3720
3721         trbt_insertarray32_callback(killtcp->connections,
3722                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3723                         add_killtcp_callback, con);
3724
3725         /* 
3726            If we don't have a socket to listen on yet we must create it
3727          */
3728         if (killtcp->capture_fd == -1) {
3729                 const char *iface = ctdb_vnn_iface_string(vnn);
3730                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3731                 if (killtcp->capture_fd == -1) {
3732                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3733                                           "socket on iface '%s' for killtcp (%s)\n",
3734                                           iface, strerror(errno)));
3735                         goto failed;
3736                 }
3737         }
3738
3739
3740         if (killtcp->fde == NULL) {
3741                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3742                                              killtcp->capture_fd,
3743                                              TEVENT_FD_READ,
3744                                              capture_tcp_handler, killtcp);
3745                 tevent_fd_set_auto_close(killtcp->fde);
3746
3747                 /* We also need to set up some events to tickle all these connections
3748                    until they are all reset
3749                 */
3750                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3751                                  ctdb_tickle_sentenced_connections, killtcp);
3752         }
3753
3754         /* tickle him once now */
3755         ctdb_sys_send_tcp(
3756                 &con->dst_addr,
3757                 &con->src_addr,
3758                 0, 0, 0);
3759
3760         return 0;
3761
3762 failed:
3763         talloc_free(vnn->killtcp);
3764         vnn->killtcp = NULL;
3765         return -1;
3766 }
3767
3768 /*
3769   kill a TCP connection.
3770  */
3771 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3772 {
3773         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3774
3775         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3776 }
3777
3778 /*
3779   called by a daemon to inform us of the entire list of TCP tickles for
3780   a particular public address.
3781   this control should only be sent by the node that is currently serving
3782   that public address.
3783  */
3784 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3785 {
3786         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3787         struct ctdb_tcp_array *tcparray;
3788         struct ctdb_vnn *vnn;
3789
3790         /* We must at least have tickles.num or else we cant verify the size
3791            of the received data blob
3792          */
3793         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3794                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3795                 return -1;
3796         }
3797
3798         /* verify that the size of data matches what we expect */
3799         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3800                          + sizeof(struct ctdb_connection) * list->num) {
3801                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3802                 return -1;
3803         }
3804
3805         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3806                            ctdb_addr_to_str(&list->addr)));
3807
3808         vnn = find_public_ip_vnn(ctdb, &list->addr);
3809         if (vnn == NULL) {
3810                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3811                         ctdb_addr_to_str(&list->addr)));
3812
3813                 return 1;
3814         }
3815
3816         /* remove any old ticklelist we might have */
3817         talloc_free(vnn->tcp_array);
3818         vnn->tcp_array = NULL;
3819
3820         tcparray = talloc(vnn, struct ctdb_tcp_array);
3821         CTDB_NO_MEMORY(ctdb, tcparray);
3822
3823         tcparray->num = list->num;
3824
3825         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3826         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3827
3828         memcpy(tcparray->connections, &list->connections[0],
3829                sizeof(struct ctdb_connection)*tcparray->num);
3830
3831         /* We now have a new fresh tickle list array for this vnn */
3832         vnn->tcp_array = tcparray;
3833
3834         return 0;
3835 }
3836
3837 /*
3838   called to return the full list of tickles for the puclic address associated 
3839   with the provided vnn
3840  */
3841 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3842 {
3843         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3844         struct ctdb_tickle_list_old *list;
3845         struct ctdb_tcp_array *tcparray;
3846         int num;
3847         struct ctdb_vnn *vnn;
3848
3849         vnn = find_public_ip_vnn(ctdb, addr);
3850         if (vnn == NULL) {
3851                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3852                         ctdb_addr_to_str(addr)));
3853
3854                 return 1;
3855         }
3856
3857         tcparray = vnn->tcp_array;
3858         if (tcparray) {
3859                 num = tcparray->num;
3860         } else {
3861                 num = 0;
3862         }
3863
3864         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3865                         + sizeof(struct ctdb_connection) * num;
3866
3867         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3868         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3869         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3870
3871         list->addr = *addr;
3872         list->num = num;
3873         if (num) {
3874                 memcpy(&list->connections[0], tcparray->connections,
3875                         sizeof(struct ctdb_connection) * num);
3876         }
3877
3878         return 0;
3879 }
3880
3881
3882 /*
3883   set the list of all tcp tickles for a public address
3884  */
3885 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3886                                             ctdb_sock_addr *addr,
3887                                             struct ctdb_tcp_array *tcparray)
3888 {
3889         int ret, num;
3890         TDB_DATA data;
3891         struct ctdb_tickle_list_old *list;
3892
3893         if (tcparray) {
3894                 num = tcparray->num;
3895         } else {
3896                 num = 0;
3897         }
3898
3899         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3900                         sizeof(struct ctdb_connection) * num;
3901         data.dptr = talloc_size(ctdb, data.dsize);
3902         CTDB_NO_MEMORY(ctdb, data.dptr);
3903
3904         list = (struct ctdb_tickle_list_old *)data.dptr;
3905         list->addr = *addr;
3906         list->num = num;
3907         if (tcparray) {
3908                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3909         }
3910
3911         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3912                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3913                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3914         if (ret != 0) {
3915                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3916                 return -1;
3917         }
3918
3919         talloc_free(data.dptr);
3920
3921         return ret;
3922 }
3923
3924
3925 /*
3926   perform tickle updates if required
3927  */
3928 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3929                                     struct tevent_timer *te,
3930                                     struct timeval t, void *private_data)
3931 {
3932         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3933         int ret;
3934         struct ctdb_vnn *vnn;
3935
3936         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3937                 /* we only send out updates for public addresses that 
3938                    we have taken over
3939                  */
3940                 if (ctdb->pnn != vnn->pnn) {
3941                         continue;
3942                 }
3943                 /* We only send out the updates if we need to */
3944                 if (!vnn->tcp_update_needed) {
3945                         continue;
3946                 }
3947                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3948                                                        &vnn->public_address,
3949                                                        vnn->tcp_array);
3950                 if (ret != 0) {
3951                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3952                                 ctdb_addr_to_str(&vnn->public_address)));
3953                 } else {
3954                         DEBUG(DEBUG_INFO,
3955                               ("Sent tickle update for public address %s\n",
3956                                ctdb_addr_to_str(&vnn->public_address)));
3957                         vnn->tcp_update_needed = false;
3958                 }
3959         }
3960
3961         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3962                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3963                          ctdb_update_tcp_tickles, ctdb);
3964 }
3965
3966 /*
3967   start periodic update of tcp tickles
3968  */
3969 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3970 {
3971         ctdb->tickle_update_context = talloc_new(ctdb);
3972
3973         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3974                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3975                          ctdb_update_tcp_tickles, ctdb);
3976 }
3977
3978
3979
3980
3981 struct control_gratious_arp {
3982         struct ctdb_context *ctdb;
3983         ctdb_sock_addr addr;
3984         const char *iface;
3985         int count;
3986 };
3987
3988 /*
3989   send a control_gratuitous arp
3990  */
3991 static void send_gratious_arp(struct tevent_context *ev,
3992                               struct tevent_timer *te,
3993                               struct timeval t, void *private_data)
3994 {
3995         int ret;
3996         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3997                                                         struct control_gratious_arp);
3998
3999         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4000         if (ret != 0) {
4001                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4002                                  arp->iface, strerror(errno)));
4003         }
4004
4005
4006         arp->count++;
4007         if (arp->count == CTDB_ARP_REPEAT) {
4008                 talloc_free(arp);
4009                 return;
4010         }
4011
4012         tevent_add_timer(arp->ctdb->ev, arp,
4013                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4014                          send_gratious_arp, arp);
4015 }
4016
4017
4018 /*
4019   send a gratious arp 
4020  */
4021 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4022 {
4023         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4024         struct control_gratious_arp *arp;
4025
4026         /* verify the size of indata */
4027         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4028                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4029                                  (unsigned)indata.dsize, 
4030                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4031                 return -1;
4032         }
4033         if (indata.dsize != 
4034                 ( offsetof(struct ctdb_addr_info_old, iface)
4035                 + gratious_arp->len ) ){
4036
4037                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4038                         "but should be %u bytes\n", 
4039                          (unsigned)indata.dsize, 
4040                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4041                 return -1;
4042         }
4043
4044
4045         arp = talloc(ctdb, struct control_gratious_arp);
4046         CTDB_NO_MEMORY(ctdb, arp);
4047
4048         arp->ctdb  = ctdb;
4049         arp->addr   = gratious_arp->addr;
4050         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4051         CTDB_NO_MEMORY(ctdb, arp->iface);
4052         arp->count = 0;
4053
4054         tevent_add_timer(arp->ctdb->ev, arp,
4055                          timeval_zero(), send_gratious_arp, arp);
4056
4057         return 0;
4058 }
4059
4060 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4061 {
4062         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4063         int ret;
4064
4065         /* verify the size of indata */
4066         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4067                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4068                 return -1;
4069         }
4070         if (indata.dsize != 
4071                 ( offsetof(struct ctdb_addr_info_old, iface)
4072                 + pub->len ) ){
4073
4074                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4075                         "but should be %u bytes\n", 
4076                          (unsigned)indata.dsize, 
4077                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4078                 return -1;
4079         }
4080
4081         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4082
4083         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4084
4085         if (ret != 0) {
4086                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4087                 return -1;
4088         }
4089
4090         return 0;
4091 }
4092
/* Private data handed to delete_ip_callback() */
struct delete_ip_callback_state {
	/* the delete request that is answered once the release finishes */
	struct ctdb_req_control_old *c;
};
4096
4097 /*
4098   called when releaseip event finishes for del_public_address
4099  */
4100 static void delete_ip_callback(struct ctdb_context *ctdb,
4101                                int32_t status, TDB_DATA data,
4102                                const char *errormsg,
4103                                void *private_data)
4104 {
4105         struct delete_ip_callback_state *state =
4106                 talloc_get_type(private_data, struct delete_ip_callback_state);
4107
4108         /* If release failed then fail. */
4109         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4110         talloc_free(private_data);
4111 }
4112
4113 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4114                                         struct ctdb_req_control_old *c,
4115                                         TDB_DATA indata, bool *async_reply)
4116 {
4117         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4118         struct ctdb_vnn *vnn;
4119
4120         /* verify the size of indata */
4121         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4122                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4123                 return -1;
4124         }
4125         if (indata.dsize != 
4126                 ( offsetof(struct ctdb_addr_info_old, iface)
4127                 + pub->len ) ){
4128
4129                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4130                         "but should be %u bytes\n", 
4131                          (unsigned)indata.dsize, 
4132                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4133                 return -1;
4134         }
4135
4136         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4137
4138         /* walk over all public addresses until we find a match */
4139         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4140                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4141                         if (vnn->pnn == ctdb->pnn) {
4142                                 struct delete_ip_callback_state *state;
4143                                 struct ctdb_public_ip *ip;
4144                                 TDB_DATA data;
4145                                 int ret;
4146
4147                                 vnn->delete_pending = true;
4148
4149                                 state = talloc(ctdb,
4150                                                struct delete_ip_callback_state);
4151                                 CTDB_NO_MEMORY(ctdb, state);
4152                                 state->c = c;
4153
4154                                 ip = talloc(state, struct ctdb_public_ip);
4155                                 if (ip == NULL) {
4156                                         DEBUG(DEBUG_ERR,
4157                                               (__location__ " Out of memory\n"));
4158                                         talloc_free(state);
4159                                         return -1;
4160                                 }
4161                                 ip->pnn = -1;
4162                                 ip->addr = pub->addr;
4163
4164                                 data.dsize = sizeof(struct ctdb_public_ip);
4165                                 data.dptr = (unsigned char *)ip;
4166
4167                                 ret = ctdb_daemon_send_control(ctdb,
4168                                                                ctdb_get_pnn(ctdb),
4169                                                                0,
4170                                                                CTDB_CONTROL_RELEASE_IP,
4171                                                                0, 0,
4172                                                                data,
4173                                                                delete_ip_callback,
4174                                                                state);
4175                                 if (ret == -1) {
4176                                         DEBUG(DEBUG_ERR,
4177                                               (__location__ "Unable to send "
4178                                                "CTDB_CONTROL_RELEASE_IP\n"));
4179                                         talloc_free(state);
4180                                         return -1;
4181                                 }
4182
4183                                 state->c = talloc_steal(state, c);
4184                                 *async_reply = true;
4185                         } else {
4186                                 /* This IP is not hosted on the
4187                                  * current node so just delete it
4188                                  * now. */
4189                                 do_delete_ip(ctdb, vnn);
4190                         }
4191
4192                         return 0;
4193                 }
4194         }
4195
4196         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4197                          ctdb_addr_to_str(&pub->addr)));
4198         return -1;
4199 }
4200
4201
/* Private data handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* the control that is answered once the event script has finished */
	struct ctdb_req_control_old *c;
};
4205
4206 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4207                                         int status, void *p)
4208 {
4209         struct ipreallocated_callback_state *state =
4210                 talloc_get_type(p, struct ipreallocated_callback_state);
4211
4212         if (status != 0) {
4213                 DEBUG(DEBUG_ERR,
4214                       (" \"ipreallocated\" event script failed (status %d)\n",
4215                        status));
4216                 if (status == -ETIME) {
4217                         ctdb_ban_self(ctdb);
4218                 }
4219         }
4220
4221         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4222         talloc_free(state);
4223 }
4224
4225 /* A control to run the ipreallocated event */
4226 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4227                                    struct ctdb_req_control_old *c,
4228                                    bool *async_reply)
4229 {
4230         int ret;
4231         struct ipreallocated_callback_state *state;
4232
4233         state = talloc(ctdb, struct ipreallocated_callback_state);
4234         CTDB_NO_MEMORY(ctdb, state);
4235
4236         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4237
4238         ret = ctdb_event_script_callback(ctdb, state,
4239                                          ctdb_ipreallocated_callback, state,
4240                                          CTDB_EVENT_IPREALLOCATED,
4241                                          "%s", "");
4242
4243         if (ret != 0) {
4244                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4245                 talloc_free(state);
4246                 return -1;
4247         }
4248
4249         /* tell the control that we will be reply asynchronously */
4250         state->c    = talloc_steal(state, c);
4251         *async_reply = true;
4252
4253         return 0;
4254 }
4255
4256
4257 /* This function is called from the recovery daemon to verify that a remote
4258    node has the expected ip allocation.
4259    This is verified against ctdb->ip_tree
4260 */
4261 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4262                                        struct ctdb_public_ip_list_old *ips,
4263                                        uint32_t pnn)
4264 {
4265         struct public_ip_list *tmp_ip;
4266         int i;
4267
4268         if (ctdb->ip_tree == NULL) {
4269                 /* don't know the expected allocation yet, assume remote node
4270                    is correct. */
4271                 return 0;
4272         }
4273
4274         if (ips == NULL) {
4275                 return 0;
4276         }
4277
4278         for (i=0; i<ips->num; i++) {
4279                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4280                 if (tmp_ip == NULL) {
4281                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4282                         return -1;
4283                 }
4284
4285                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4286                         continue;
4287                 }
4288
4289                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4290                         DEBUG(DEBUG_ERR,
4291                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4292                                pnn,
4293                                ctdb_addr_to_str(&ips->ips[i].addr),
4294                                ips->ips[i].pnn, tmp_ip->pnn));
4295                         return -1;
4296                 }
4297         }
4298
4299         return 0;
4300 }
4301
4302 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4303 {
4304         struct public_ip_list *tmp_ip;
4305
4306         /* IP tree is never built if DisableIPFailover is set */
4307         if (ctdb->tunable.disable_ip_failover != 0) {
4308                 return 0;
4309         }
4310
4311         if (ctdb->ip_tree == NULL) {
4312                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4313                 return -1;
4314         }
4315
4316         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4317         if (tmp_ip == NULL) {
4318                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4319                 return -1;
4320         }
4321
4322         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4323         tmp_ip->pnn = ip->pnn;
4324
4325         return 0;
4326 }
4327
4328 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4329 {
4330         TALLOC_FREE(ctdb->ip_tree);
4331 }
4332
/*
  State of one in-progress "reload public IPs" operation, which is
  carried out by a forked child process.
 */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;

	/* control to reply to; NULL when there is no reply left to send */
	struct ctdb_req_control_old *c;

	/* status sent in the reply; starts as -1 and is updated when the
	   child reports its result */
	int status;

	/* pipe: the child writes its result to fd[1], the parent reads
	   it from fd[0] */
	int fd[2];

	/* pid of the forked child */
	pid_t child;

	/* read event on fd[0] */
	struct tevent_fd *fde;
};
4341
4342 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4343 {
4344         if (h == h->ctdb->reload_ips) {
4345                 h->ctdb->reload_ips = NULL;
4346         }
4347         if (h->c != NULL) {
4348                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4349                 h->c = NULL;
4350         }
4351         ctdb_kill(h->ctdb, h->child, SIGKILL);
4352         return 0;
4353 }
4354
4355 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4356                                          struct tevent_timer *te,
4357                                          struct timeval t, void *private_data)
4358 {
4359         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4360
4361         talloc_free(h);
4362 }
4363
4364 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4365                                          struct tevent_fd *fde,
4366                                          uint16_t flags, void *private_data)
4367 {
4368         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4369
4370         char res;
4371         int ret;
4372
4373         ret = sys_read(h->fd[0], &res, 1);
4374         if (ret < 1 || res != 0) {
4375                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4376                 res = 1;
4377         }
4378         h->status = res;
4379
4380         talloc_free(h);
4381 }
4382
4383 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4384 {
4385         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4386         struct ctdb_public_ip_list_old *ips;
4387         struct ctdb_vnn *vnn;
4388         struct client_async_data *async_data;
4389         struct timeval timeout;
4390         TDB_DATA data;
4391         struct ctdb_client_control_state *state;
4392         bool first_add;
4393         int i, ret;
4394
4395         CTDB_NO_MEMORY(ctdb, mem_ctx);
4396
4397         /* Read IPs from local node */
4398         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4399                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4400         if (ret != 0) {
4401                 DEBUG(DEBUG_ERR,
4402                       ("Unable to fetch public IPs from local node\n"));
4403                 talloc_free(mem_ctx);
4404                 return -1;
4405         }
4406
4407         /* Read IPs file - this is safe since this is a child process */
4408         ctdb->vnn = NULL;
4409         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4410                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4411                 talloc_free(mem_ctx);
4412                 return -1;
4413         }
4414
4415         async_data = talloc_zero(mem_ctx, struct client_async_data);
4416         CTDB_NO_MEMORY(ctdb, async_data);
4417
4418         /* Compare IPs between node and file for IPs to be deleted */
4419         for (i = 0; i < ips->num; i++) {
4420                 /* */
4421                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4422                         if (ctdb_same_ip(&vnn->public_address,
4423                                          &ips->ips[i].addr)) {
4424                                 /* IP is still in file */
4425                                 break;
4426                         }
4427                 }
4428
4429                 if (vnn == NULL) {
4430                         /* Delete IP ips->ips[i] */
4431                         struct ctdb_addr_info_old *pub;
4432
4433                         DEBUG(DEBUG_NOTICE,
4434                               ("IP %s no longer configured, deleting it\n",
4435                                ctdb_addr_to_str(&ips->ips[i].addr)));
4436
4437                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4438                         CTDB_NO_MEMORY(ctdb, pub);
4439
4440                         pub->addr  = ips->ips[i].addr;
4441                         pub->mask  = 0;
4442                         pub->len   = 0;
4443
4444                         timeout = TAKEOVER_TIMEOUT();
4445
4446                         data.dsize = offsetof(struct ctdb_addr_info_old,
4447                                               iface) + pub->len;
4448                         data.dptr = (uint8_t *)pub;
4449
4450                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4451                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4452                                                   0, data, async_data,
4453                                                   &timeout, NULL);
4454                         if (state == NULL) {
4455                                 DEBUG(DEBUG_ERR,
4456                                       (__location__
4457                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4458                                 goto failed;
4459                         }
4460
4461                         ctdb_client_async_add(async_data, state);
4462                 }
4463         }
4464
4465         /* Compare IPs between node and file for IPs to be added */
4466         first_add = true;
4467         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4468                 for (i = 0; i < ips->num; i++) {
4469                         if (ctdb_same_ip(&vnn->public_address,
4470                                          &ips->ips[i].addr)) {
4471                                 /* IP already on node */
4472                                 break;
4473                         }
4474                 }
4475                 if (i == ips->num) {
4476                         /* Add IP ips->ips[i] */
4477                         struct ctdb_addr_info_old *pub;
4478                         const char *ifaces = NULL;
4479                         uint32_t len;
4480                         int iface = 0;
4481
4482                         DEBUG(DEBUG_NOTICE,
4483                               ("New IP %s configured, adding it\n",
4484                                ctdb_addr_to_str(&vnn->public_address)));
4485                         if (first_add) {
4486                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4487
4488                                 data.dsize = sizeof(pnn);
4489                                 data.dptr  = (uint8_t *)&pnn;
4490
4491                                 ret = ctdb_client_send_message(
4492                                         ctdb,
4493                                         CTDB_BROADCAST_CONNECTED,
4494                                         CTDB_SRVID_REBALANCE_NODE,
4495                                         data);
4496                                 if (ret != 0) {
4497                                         DEBUG(DEBUG_WARNING,
4498                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4499                                 }
4500
4501                                 first_add = false;
4502                         }
4503
4504                         ifaces = vnn->ifaces[0];
4505                         iface = 1;
4506                         while (vnn->ifaces[iface] != NULL) {
4507                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4508                                                          vnn->ifaces[iface]);
4509                                 iface++;
4510                         }
4511
4512                         len   = strlen(ifaces) + 1;
4513                         pub = talloc_zero_size(mem_ctx,
4514                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4515                         CTDB_NO_MEMORY(ctdb, pub);
4516
4517                         pub->addr  = vnn->public_address;
4518                         pub->mask  = vnn->public_netmask_bits;
4519                         pub->len   = len;
4520                         memcpy(&pub->iface[0], ifaces, pub->len);
4521
4522                         timeout = TAKEOVER_TIMEOUT();
4523
4524                         data.dsize = offsetof(struct ctdb_addr_info_old,
4525                                               iface) + pub->len;
4526                         data.dptr = (uint8_t *)pub;
4527
4528                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4529                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4530                                                   0, data, async_data,
4531                                                   &timeout, NULL);
4532                         if (state == NULL) {
4533                                 DEBUG(DEBUG_ERR,
4534                                       (__location__
4535                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4536                                 goto failed;
4537                         }
4538
4539                         ctdb_client_async_add(async_data, state);
4540                 }
4541         }
4542
4543         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4544                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4545                 goto failed;
4546         }
4547
4548         talloc_free(mem_ctx);
4549         return 0;
4550
4551 failed:
4552         talloc_free(mem_ctx);
4553         return -1;
4554 }
4555
4556 /* This control is sent to force the node to re-read the public addresses file
4557    and drop any addresses we should nnot longer host, and add new addresses
4558    that we are now able to host
4559 */
4560 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4561 {
4562         struct ctdb_reloadips_handle *h;
4563         pid_t parent = getpid();
4564
4565         if (ctdb->reload_ips != NULL) {
4566                 talloc_free(ctdb->reload_ips);
4567                 ctdb->reload_ips = NULL;
4568         }
4569
4570         h = talloc(ctdb, struct ctdb_reloadips_handle);
4571         CTDB_NO_MEMORY(ctdb, h);
4572         h->ctdb     = ctdb;
4573         h->c        = NULL;
4574         h->status   = -1;
4575         
4576         if (pipe(h->fd) == -1) {
4577                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4578                 talloc_free(h);
4579                 return -1;
4580         }
4581
4582         h->child = ctdb_fork(ctdb);
4583         if (h->child == (pid_t)-1) {
4584                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4585                 close(h->fd[0]);
4586                 close(h->fd[1]);
4587                 talloc_free(h);
4588                 return -1;
4589         }
4590
4591         /* child process */
4592         if (h->child == 0) {
4593                 signed char res = 0;
4594
4595                 close(h->fd[0]);
4596                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4597
4598                 prctl_set_comment("ctdb_reloadips");
4599                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4600                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4601                         res = -1;
4602                 } else {
4603                         res = ctdb_reloadips_child(ctdb);
4604                         if (res != 0) {
4605                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4606                         }
4607                 }
4608
4609                 sys_write(h->fd[1], &res, 1);
4610                 /* make sure we die when our parent dies */
4611                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4612                         sleep(5);
4613                 }
4614                 _exit(0);
4615         }
4616
4617         h->c             = talloc_steal(h, c);
4618
4619         close(h->fd[1]);
4620         set_close_on_exec(h->fd[0]);
4621
4622         talloc_set_destructor(h, ctdb_reloadips_destructor);
4623
4624
4625         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4626                                ctdb_reloadips_child_handler, (void *)h);
4627         tevent_fd_set_auto_close(h->fde);
4628
4629         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4630                          ctdb_reloadips_timeout_event, h);
4631
4632         /* we reply later */
4633         *async_reply = true;
4634         return 0;
4635 }