Fix various spelling errors
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33
34 #include "ctdb_private.h"
35 #include "ctdb_client.h"
36 #include "ctdb_logging.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42
43
/*
 * Timeout value built from the takeover_timeout tunable.  The macro
 * expands to an expression that references a variable named "ctdb",
 * so it can only be used where such a variable is in scope.
 * NOTE(review): timeval_current_ofs() presumably returns "now + offset"
 * (seconds, microseconds) - confirm against lib/util.
 */
44 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
45
/*
 * Gratuitous ARP schedule used by ctdb_control_send_arp():
 * CTDB_ARP_INTERVAL is the seconds part of the delay between rounds,
 * CTDB_ARP_REPEAT is the number of rounds sent before the ARP state
 * is freed.
 */
46 #define CTDB_ARP_INTERVAL 1
47 #define CTDB_ARP_REPEAT   3
48
49 /* Flags used in IP allocation algorithms. */
/*
 * Per-node flags consumed by the IP allocation algorithms.  None of the
 * fields are referenced in this part of the file, so the notes below are
 * assumptions based on the field names only - TODO confirm.
 */
50 struct ctdb_ipflags {
51         bool noiptakeover; /* presumably: node must not take over addresses */
52         bool noiphost; /* presumably: node must not host any addresses */
53         enum ctdb_runstate runstate; /* presumably: run state of the node */
54 };
55
/*
 * One local network interface known to the daemon.  Entries live on the
 * doubly linked list ctdb->ifaces.
 */
56 struct ctdb_interface {
57         struct ctdb_interface *prev, *next; /* DLIST linkage */
58         const char *name; /* interface name, allocated as a talloc child of this entry */
59         bool link_up; /* set to true on creation; only interfaces with link up are chosen or counted as available */
60         uint32_t references; /* number of VNNs currently assigned to this interface */
61 };
62
63 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
64 {
65         if (vnn->iface) {
66                 return vnn->iface->name;
67         }
68
69         return "__none__";
70 }
71
72 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
73 {
74         struct ctdb_interface *i;
75
76         /* Verify that we don't have an entry for this ip yet */
77         for (i=ctdb->ifaces;i;i=i->next) {
78                 if (strcmp(i->name, iface) == 0) {
79                         return 0;
80                 }
81         }
82
83         /* create a new structure for this interface */
84         i = talloc_zero(ctdb, struct ctdb_interface);
85         CTDB_NO_MEMORY_FATAL(ctdb, i);
86         i->name = talloc_strdup(i, iface);
87         CTDB_NO_MEMORY(ctdb, i->name);
88
89         i->link_up = true;
90
91         DLIST_ADD(ctdb->ifaces, i);
92
93         return 0;
94 }
95
96 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
97                                         const char *name)
98 {
99         int n;
100
101         for (n = 0; vnn->ifaces[n] != NULL; n++) {
102                 if (strcmp(name, vnn->ifaces[n]) == 0) {
103                         return true;
104                 }
105         }
106
107         return false;
108 }
109
110 /* If any interfaces now have no possible IPs then delete them.  This
111  * implementation is naive (i.e. simple) rather than clever
112  * (i.e. complex).  Given that this is run on delip and that operation
113  * is rare, this doesn't need to be efficient - it needs to be
114  * foolproof.  One alternative is reference counting, where the logic
115  * is distributed and can, therefore, be broken in multiple places.
116  * Another alternative is to build a red-black tree of interfaces that
117  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
118  * once) and then walking ctdb->ifaces once and deleting those not in
119  * the tree.  Let's go to one of those if the naive implementation
120  * causes problems...  :-)
121  */
122 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
123                                         struct ctdb_vnn *vnn)
124 {
125         struct ctdb_interface *i, *next;
126
127         /* For each interface, check if there's an IP using it. */
128         for (i = ctdb->ifaces; i != NULL; i = next) {
129                 struct ctdb_vnn *tv;
130                 bool found;
131                 next = i->next;
132
133                 /* Only consider interfaces named in the given VNN. */
134                 if (!vnn_has_interface_with_name(vnn, i->name)) {
135                         continue;
136                 }
137
138                 /* Is the "single IP" on this interface? */
139                 if ((ctdb->single_ip_vnn != NULL) &&
140                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
141                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
142                         /* Found, next interface please... */
143                         continue;
144                 }
145                 /* Search for a vnn with this interface. */
146                 found = false;
147                 for (tv=ctdb->vnn; tv; tv=tv->next) {
148                         if (vnn_has_interface_with_name(tv, i->name)) {
149                                 found = true;
150                                 break;
151                         }
152                 }
153
154                 if (!found) {
155                         /* None of the VNNs are using this interface. */
156                         DLIST_REMOVE(ctdb->ifaces, i);
157                         talloc_free(i);
158                 }
159         }
160 }
161
162
163 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
164                                               const char *iface)
165 {
166         struct ctdb_interface *i;
167
168         for (i=ctdb->ifaces;i;i=i->next) {
169                 if (strcmp(i->name, iface) == 0) {
170                         return i;
171                 }
172         }
173
174         return NULL;
175 }
176
177 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178                                                   struct ctdb_vnn *vnn)
179 {
180         int i;
181         struct ctdb_interface *cur = NULL;
182         struct ctdb_interface *best = NULL;
183
184         for (i=0; vnn->ifaces[i]; i++) {
185
186                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
187                 if (cur == NULL) {
188                         continue;
189                 }
190
191                 if (!cur->link_up) {
192                         continue;
193                 }
194
195                 if (best == NULL) {
196                         best = cur;
197                         continue;
198                 }
199
200                 if (cur->references < best->references) {
201                         best = cur;
202                         continue;
203                 }
204         }
205
206         return best;
207 }
208
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210                                      struct ctdb_vnn *vnn)
211 {
212         struct ctdb_interface *best = NULL;
213
214         if (vnn->iface) {
215                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216                                    "still assigned to iface '%s'\n",
217                                    ctdb_addr_to_str(&vnn->public_address),
218                                    ctdb_vnn_iface_string(vnn)));
219                 return 0;
220         }
221
222         best = ctdb_vnn_best_iface(ctdb, vnn);
223         if (best == NULL) {
224                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225                                   "cannot assign to iface any iface\n",
226                                   ctdb_addr_to_str(&vnn->public_address)));
227                 return -1;
228         }
229
230         vnn->iface = best;
231         best->references++;
232         vnn->pnn = ctdb->pnn;
233
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now assigned to iface '%s' refs[%d]\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            best->references));
239         return 0;
240 }
241
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243                                     struct ctdb_vnn *vnn)
244 {
245         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246                            "now unassigned (old iface '%s' refs[%d])\n",
247                            ctdb_addr_to_str(&vnn->public_address),
248                            ctdb_vnn_iface_string(vnn),
249                            vnn->iface?vnn->iface->references:0));
250         if (vnn->iface) {
251                 vnn->iface->references--;
252         }
253         vnn->iface = NULL;
254         if (vnn->pnn == ctdb->pnn) {
255                 vnn->pnn = -1;
256         }
257 }
258
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260                                struct ctdb_vnn *vnn)
261 {
262         int i;
263
264         if (vnn->delete_pending) {
265                 return false;
266         }
267
268         if (vnn->iface && vnn->iface->link_up) {
269                 return true;
270         }
271
272         for (i=0; vnn->ifaces[i]; i++) {
273                 struct ctdb_interface *cur;
274
275                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
276                 if (cur == NULL) {
277                         continue;
278                 }
279
280                 if (cur->link_up) {
281                         return true;
282                 }
283         }
284
285         return false;
286 }
287
/*
 * State for one series of gratuitous ARPs (and TCP tickle ACKs) sent
 * after an address has been taken over.  Created by
 * ctdb_announce_vnn_iface() and consumed by ctdb_control_send_arp().
 */
288 struct ctdb_takeover_arp {
289         struct ctdb_context *ctdb;
290         uint32_t count; /* rounds sent so far; freed when it reaches CTDB_ARP_REPEAT */
291         ctdb_sock_addr addr; /* copy of the VNN's public address */
292         struct ctdb_tcp_array *tcparray; /* connections to tickle; stolen from vnn->tcp_array, may be NULL */
293         struct ctdb_vnn *vnn; /* VNN being announced; supplies the interface name and timer context */
294 };
295
296
297 /*
298   lists of tcp endpoints
299  */
300 struct ctdb_tcp_list {
301         struct ctdb_tcp_list *prev, *next; /* DLIST linkage */
302         struct ctdb_connection connection; /* one TCP endpoint pair */
303 };
304
305 /*
306   list of clients to kill on IP release
307  */
308 struct ctdb_client_ip {
309         struct ctdb_client_ip *prev, *next; /* DLIST linkage on ctdb->client_ip_list */
310         struct ctdb_context *ctdb;
311         ctdb_sock_addr addr; /* address the client is registered with */
312         uint32_t client_id; /* reqid used to look up the struct ctdb_client */
313 };
314
315
316 /*
317   send a gratuitous arp
318  */
319 static void ctdb_control_send_arp(struct tevent_context *ev,
320                                   struct tevent_timer *te,
321                                   struct timeval t, void *private_data)
322 {
323         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
324                                                         struct ctdb_takeover_arp);
325         int i, ret;
326         struct ctdb_tcp_array *tcparray;
327         const char *iface = ctdb_vnn_iface_string(arp->vnn);
328
329         ret = ctdb_sys_send_arp(&arp->addr, iface);
330         if (ret != 0) {
331                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
332                                   iface, strerror(errno)));
333         }
334
335         tcparray = arp->tcparray;
336         if (tcparray) {
337                 for (i=0;i<tcparray->num;i++) {
338                         struct ctdb_connection *tcon;
339
340                         tcon = &tcparray->connections[i];
341                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
342                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
343                                 ctdb_addr_to_str(&tcon->src),
344                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
345                         ret = ctdb_sys_send_tcp(
346                                 &tcon->src,
347                                 &tcon->dst,
348                                 0, 0, 0);
349                         if (ret != 0) {
350                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
351                                         ctdb_addr_to_str(&tcon->src)));
352                         }
353                 }
354         }
355
356         arp->count++;
357
358         if (arp->count == CTDB_ARP_REPEAT) {
359                 talloc_free(arp);
360                 return;
361         }
362
363         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
364                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
365                          ctdb_control_send_arp, arp);
366 }
367
368 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
369                                        struct ctdb_vnn *vnn)
370 {
371         struct ctdb_takeover_arp *arp;
372         struct ctdb_tcp_array *tcparray;
373
374         if (!vnn->takeover_ctx) {
375                 vnn->takeover_ctx = talloc_new(vnn);
376                 if (!vnn->takeover_ctx) {
377                         return -1;
378                 }
379         }
380
381         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
382         if (!arp) {
383                 return -1;
384         }
385
386         arp->ctdb = ctdb;
387         arp->addr = vnn->public_address;
388         arp->vnn  = vnn;
389
390         tcparray = vnn->tcp_array;
391         if (tcparray) {
392                 /* add all of the known tcp connections for this IP to the
393                    list of tcp connections to send tickle acks for */
394                 arp->tcparray = talloc_steal(arp, tcparray);
395
396                 vnn->tcp_array = NULL;
397                 vnn->tcp_update_needed = true;
398         }
399
400         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
401                          timeval_zero(), ctdb_control_send_arp, arp);
402
403         return 0;
404 }
405
/*
 * State handed to release_ip_callback() when a releaseip event
 * finishes.
 */
406 struct takeover_callback_state {
407         struct ctdb_req_control_old *c; /* control request to reply to */
408         ctdb_sock_addr *addr; /* address being released */
409         struct ctdb_vnn *vnn;
410 };
411
/*
 * State carried from ctdb_do_takeip() to ctdb_do_takeip_callback().
 */
412 struct ctdb_do_takeip_state {
413         struct ctdb_req_control_old *c; /* control request to reply to when the event finishes */
414         struct ctdb_vnn *vnn; /* address being taken over */
415 };
416
417 /*
418   called when takeip event finishes
419  */
420 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
421                                     void *private_data)
422 {
423         struct ctdb_do_takeip_state *state =
424                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
425         int32_t ret;
426         TDB_DATA data;
427
428         if (status != 0) {
429                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
430         
431                 if (status == -ETIME) {
432                         ctdb_ban_self(ctdb);
433                 }
434                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
435                                  ctdb_addr_to_str(&state->vnn->public_address),
436                                  ctdb_vnn_iface_string(state->vnn)));
437                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
438
439                 node->flags |= NODE_FLAGS_UNHEALTHY;
440                 talloc_free(state);
441                 return;
442         }
443
444         if (ctdb->do_checkpublicip) {
445
446         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
447         if (ret != 0) {
448                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
449                 talloc_free(state);
450                 return;
451         }
452
453         }
454
455         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
456         data.dsize = strlen((char *)data.dptr) + 1;
457         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
458
459         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
460
461
462         /* the control succeeded */
463         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
464         talloc_free(state);
465         return;
466 }
467
/*
  talloc destructor installed on the takeip state by ctdb_do_takeip():
  clears the VNN's update_in_flight flag whenever the state is freed,
  so a later takeover or update of this address is accepted again.
  Always returns 0.
 */
468 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
469 {
470         state->vnn->update_in_flight = false;
471         return 0;
472 }
473
474 /*
475   take over an ip address
476  */
477 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
478                               struct ctdb_req_control_old *c,
479                               struct ctdb_vnn *vnn)
480 {
481         int ret;
482         struct ctdb_do_takeip_state *state;
483
484         if (vnn->update_in_flight) {
485                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
486                                     "update for this IP already in flight\n",
487                                     ctdb_addr_to_str(&vnn->public_address),
488                                     vnn->public_netmask_bits));
489                 return -1;
490         }
491
492         ret = ctdb_vnn_assign_iface(ctdb, vnn);
493         if (ret != 0) {
494                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
495                                  "assign a usable interface\n",
496                                  ctdb_addr_to_str(&vnn->public_address),
497                                  vnn->public_netmask_bits));
498                 return -1;
499         }
500
501         state = talloc(vnn, struct ctdb_do_takeip_state);
502         CTDB_NO_MEMORY(ctdb, state);
503
504         state->c = talloc_steal(ctdb, c);
505         state->vnn   = vnn;
506
507         vnn->update_in_flight = true;
508         talloc_set_destructor(state, ctdb_takeip_destructor);
509
510         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
511                             ctdb_addr_to_str(&vnn->public_address),
512                             vnn->public_netmask_bits,
513                             ctdb_vnn_iface_string(vnn)));
514
515         ret = ctdb_event_script_callback(ctdb,
516                                          state,
517                                          ctdb_do_takeip_callback,
518                                          state,
519                                          CTDB_EVENT_TAKE_IP,
520                                          "%s %s %u",
521                                          ctdb_vnn_iface_string(vnn),
522                                          ctdb_addr_to_str(&vnn->public_address),
523                                          vnn->public_netmask_bits);
524
525         if (ret != 0) {
526                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
527                         ctdb_addr_to_str(&vnn->public_address),
528                         ctdb_vnn_iface_string(vnn)));
529                 talloc_free(state);
530                 return -1;
531         }
532
533         return 0;
534 }
535
/*
 * State carried from ctdb_do_updateip() to ctdb_do_updateip_callback().
 */
536 struct ctdb_do_updateip_state {
537         struct ctdb_req_control_old *c; /* control request to reply to when the event finishes */
538         struct ctdb_interface *old; /* interface the address was on before the move; restored on failure */
539         struct ctdb_vnn *vnn; /* address being moved */
540 };
541
542 /*
543   called when updateip event finishes
544  */
545 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
546                                       void *private_data)
547 {
548         struct ctdb_do_updateip_state *state =
549                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
550         int32_t ret;
551
552         if (status != 0) {
553                 if (status == -ETIME) {
554                         ctdb_ban_self(ctdb);
555                 }
556                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
557                         ctdb_addr_to_str(&state->vnn->public_address),
558                         state->old->name,
559                         ctdb_vnn_iface_string(state->vnn)));
560
561                 /*
562                  * All we can do is reset the old interface
563                  * and let the next run fix it
564                  */
565                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
566                 state->vnn->iface = state->old;
567                 state->vnn->iface->references++;
568
569                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
570                 talloc_free(state);
571                 return;
572         }
573
574         if (ctdb->do_checkpublicip) {
575
576         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
577         if (ret != 0) {
578                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
579                 talloc_free(state);
580                 return;
581         }
582
583         }
584
585         /* the control succeeded */
586         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
587         talloc_free(state);
588         return;
589 }
590
/*
  talloc destructor installed on the updateip state by
  ctdb_do_updateip(): clears the VNN's update_in_flight flag whenever
  the state is freed, so a later takeover or update of this address is
  accepted again.  Always returns 0.
 */
591 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
592 {
593         state->vnn->update_in_flight = false;
594         return 0;
595 }
596
597 /*
598   update (move) an ip address
599  */
600 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
601                                 struct ctdb_req_control_old *c,
602                                 struct ctdb_vnn *vnn)
603 {
604         int ret;
605         struct ctdb_do_updateip_state *state;
606         struct ctdb_interface *old = vnn->iface;
607         const char *new_name;
608
609         if (vnn->update_in_flight) {
610                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
611                                     "update for this IP already in flight\n",
612                                     ctdb_addr_to_str(&vnn->public_address),
613                                     vnn->public_netmask_bits));
614                 return -1;
615         }
616
617         ctdb_vnn_unassign_iface(ctdb, vnn);
618         ret = ctdb_vnn_assign_iface(ctdb, vnn);
619         if (ret != 0) {
620                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
621                                  "assin a usable interface (old iface '%s')\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  vnn->public_netmask_bits,
624                                  old->name));
625                 return -1;
626         }
627
628         new_name = ctdb_vnn_iface_string(vnn);
629         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
630                 /* A benign update from one interface onto itself.
631                  * no need to run the eventscripts in this case, just return
632                  * success.
633                  */
634                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
635                 return 0;
636         }
637
638         state = talloc(vnn, struct ctdb_do_updateip_state);
639         CTDB_NO_MEMORY(ctdb, state);
640
641         state->c = talloc_steal(ctdb, c);
642         state->old = old;
643         state->vnn = vnn;
644
645         vnn->update_in_flight = true;
646         talloc_set_destructor(state, ctdb_updateip_destructor);
647
648         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
649                             "interface %s to %s\n",
650                             ctdb_addr_to_str(&vnn->public_address),
651                             vnn->public_netmask_bits,
652                             old->name,
653                             new_name));
654
655         ret = ctdb_event_script_callback(ctdb,
656                                          state,
657                                          ctdb_do_updateip_callback,
658                                          state,
659                                          CTDB_EVENT_UPDATE_IP,
660                                          "%s %s %s %u",
661                                          state->old->name,
662                                          new_name,
663                                          ctdb_addr_to_str(&vnn->public_address),
664                                          vnn->public_netmask_bits);
665         if (ret != 0) {
666                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
667                                  ctdb_addr_to_str(&vnn->public_address),
668                                  old->name, new_name));
669                 talloc_free(state);
670                 return -1;
671         }
672
673         return 0;
674 }
675
676 /*
677   Find the vnn of the node that has a public ip address
678   returns -1 if the address is not known as a public address
679  */
680 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
681 {
682         struct ctdb_vnn *vnn;
683
684         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
685                 if (ctdb_same_ip(&vnn->public_address, addr)) {
686                         return vnn;
687                 }
688         }
689
690         return NULL;
691 }
692
693 /*
694   take over an ip address
695  */
696 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
697                                  struct ctdb_req_control_old *c,
698                                  TDB_DATA indata,
699                                  bool *async_reply)
700 {
701         int ret;
702         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
703         struct ctdb_vnn *vnn;
704         bool have_ip = false;
705         bool do_updateip = false;
706         bool do_takeip = false;
707         struct ctdb_interface *best_iface = NULL;
708
709         if (pip->pnn != ctdb->pnn) {
710                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
711                                  "with pnn %d, but we're node %d\n",
712                                  ctdb_addr_to_str(&pip->addr),
713                                  pip->pnn, ctdb->pnn));
714                 return -1;
715         }
716
717         /* update out vnn list */
718         vnn = find_public_ip_vnn(ctdb, &pip->addr);
719         if (vnn == NULL) {
720                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
721                         ctdb_addr_to_str(&pip->addr)));
722                 return 0;
723         }
724
725         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
726                 have_ip = ctdb_sys_have_ip(&pip->addr);
727         }
728         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
729         if (best_iface == NULL) {
730                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
731                                  "a usable interface (old %s, have_ip %d)\n",
732                                  ctdb_addr_to_str(&vnn->public_address),
733                                  vnn->public_netmask_bits,
734                                  ctdb_vnn_iface_string(vnn),
735                                  have_ip));
736                 return -1;
737         }
738
739         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
740                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
741                 have_ip = false;
742         }
743
744
745         if (vnn->iface == NULL && have_ip) {
746                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
747                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
748                                  ctdb_addr_to_str(&vnn->public_address)));
749                 return 0;
750         }
751
752         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
753                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
754                                   "and we have it on iface[%s], but it was assigned to node %d"
755                                   "and we are node %d, banning ourself\n",
756                                  ctdb_addr_to_str(&vnn->public_address),
757                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
758                 ctdb_ban_self(ctdb);
759                 return -1;
760         }
761
762         if (vnn->pnn == -1 && have_ip) {
763                 vnn->pnn = ctdb->pnn;
764                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
765                                   "and we already have it on iface[%s], update local daemon\n",
766                                  ctdb_addr_to_str(&vnn->public_address),
767                                   ctdb_vnn_iface_string(vnn)));
768                 return 0;
769         }
770
771         if (vnn->iface) {
772                 if (vnn->iface != best_iface) {
773                         if (!vnn->iface->link_up) {
774                                 do_updateip = true;
775                         } else if (vnn->iface->references > (best_iface->references + 1)) {
776                                 /* only move when the rebalance gains something */
777                                         do_updateip = true;
778                         }
779                 }
780         }
781
782         if (!have_ip) {
783                 if (do_updateip) {
784                         ctdb_vnn_unassign_iface(ctdb, vnn);
785                         do_updateip = false;
786                 }
787                 do_takeip = true;
788         }
789
790         if (do_takeip) {
791                 ret = ctdb_do_takeip(ctdb, c, vnn);
792                 if (ret != 0) {
793                         return -1;
794                 }
795         } else if (do_updateip) {
796                 ret = ctdb_do_updateip(ctdb, c, vnn);
797                 if (ret != 0) {
798                         return -1;
799                 }
800         } else {
801                 /*
802                  * The interface is up and the kernel known the ip
803                  * => do nothing
804                  */
805                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
806                         ctdb_addr_to_str(&pip->addr),
807                         vnn->public_netmask_bits,
808                         ctdb_vnn_iface_string(vnn)));
809                 return 0;
810         }
811
812         /* tell ctdb_control.c that we will be replying asynchronously */
813         *async_reply = true;
814
815         return 0;
816 }
817
818 /*
819   kill any clients that are registered with an IP that is being released
820  */
821 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
822 {
823         struct ctdb_client_ip *ip;
824
825         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
826                 ctdb_addr_to_str(addr)));
827
828         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
829                 ctdb_sock_addr tmp_addr;
830
831                 tmp_addr = ip->addr;
832                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
833                         ip->client_id,
834                         ctdb_addr_to_str(&ip->addr)));
835
836                 if (ctdb_same_ip(&tmp_addr, addr)) {
837                         struct ctdb_client *client = reqid_find(ctdb->idr,
838                                                                 ip->client_id,
839                                                                 struct ctdb_client);
840                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
841                                 ip->client_id,
842                                 ctdb_addr_to_str(&ip->addr),
843                                 client->pid));
844
845                         if (client->pid != 0) {
846                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
847                                         (unsigned)client->pid,
848                                         ctdb_addr_to_str(addr),
849                                         ip->client_id));
850                                 kill(client->pid, SIGKILL);
851                         }
852                 }
853         }
854 }
855
856 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
857 {
858         DLIST_REMOVE(ctdb->vnn, vnn);
859         ctdb_vnn_unassign_iface(ctdb, vnn);
860         ctdb_remove_orphaned_ifaces(ctdb, vnn);
861         talloc_free(vnn);
862 }
863
864 /*
865   called when releaseip event finishes
866  */
867 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
868                                 void *private_data)
869 {
870         struct takeover_callback_state *state = 
871                 talloc_get_type(private_data, struct takeover_callback_state);
872         TDB_DATA data;
873
874         if (status == -ETIME) {
875                 ctdb_ban_self(ctdb);
876         }
877
878         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
879                 if  (ctdb_sys_have_ip(state->addr)) {
880                         DEBUG(DEBUG_ERR,
881                               ("IP %s still hosted during release IP callback, failing\n",
882                                ctdb_addr_to_str(state->addr)));
883                         ctdb_request_control_reply(ctdb, state->c,
884                                                    NULL, -1, NULL);
885                         talloc_free(state);
886                         return;
887                 }
888         }
889
890         /* send a message to all clients of this node telling them
891            that the cluster has been reconfigured and they should
892            release any sockets on this IP */
893         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
894         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
895         data.dsize = strlen((char *)data.dptr)+1;
896
897         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
898
899         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
900
901         /* kill clients that have registered with this IP */
902         release_kill_clients(ctdb, state->addr);
903
904         ctdb_vnn_unassign_iface(ctdb, state->vnn);
905
906         /* Process the IP if it has been marked for deletion */
907         if (state->vnn->delete_pending) {
908                 do_delete_ip(ctdb, state->vnn);
909                 state->vnn = NULL;
910         }
911
912         /* the control succeeded */
913         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
914         talloc_free(state);
915 }
916
917 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
918 {
919         if (state->vnn != NULL) {
920                 state->vnn->update_in_flight = false;
921         }
922         return 0;
923 }
924
925 /*
926   release an ip address
927  */
928 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
929                                 struct ctdb_req_control_old *c,
930                                 TDB_DATA indata, 
931                                 bool *async_reply)
932 {
933         int ret;
934         struct takeover_callback_state *state;
935         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
936         struct ctdb_vnn *vnn;
937         char *iface;
938
939         /* update our vnn list */
940         vnn = find_public_ip_vnn(ctdb, &pip->addr);
941         if (vnn == NULL) {
942                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
943                         ctdb_addr_to_str(&pip->addr)));
944                 return 0;
945         }
946         vnn->pnn = pip->pnn;
947
948         /* stop any previous arps */
949         talloc_free(vnn->takeover_ctx);
950         vnn->takeover_ctx = NULL;
951
952         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
953          * lazy multicast to drop an IP from any node that isn't the
954          * intended new node.  The following causes makes ctdbd ignore
955          * a release for any address it doesn't host.
956          */
957         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
958                 if (!ctdb_sys_have_ip(&pip->addr)) {
959                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
960                                 ctdb_addr_to_str(&pip->addr),
961                                 vnn->public_netmask_bits,
962                                 ctdb_vnn_iface_string(vnn)));
963                         ctdb_vnn_unassign_iface(ctdb, vnn);
964                         return 0;
965                 }
966         } else {
967                 if (vnn->iface == NULL) {
968                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
969                                            ctdb_addr_to_str(&pip->addr),
970                                            vnn->public_netmask_bits));
971                         return 0;
972                 }
973         }
974
975         /* There is a potential race between take_ip and us because we
976          * update the VNN via a callback that run when the
977          * eventscripts have been run.  Avoid the race by allowing one
978          * update to be in flight at a time.
979          */
980         if (vnn->update_in_flight) {
981                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
982                                     "update for this IP already in flight\n",
983                                     ctdb_addr_to_str(&vnn->public_address),
984                                     vnn->public_netmask_bits));
985                 return -1;
986         }
987
988         iface = strdup(ctdb_vnn_iface_string(vnn));
989
990         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
991                 ctdb_addr_to_str(&pip->addr),
992                 vnn->public_netmask_bits,
993                 iface,
994                 pip->pnn));
995
996         state = talloc(ctdb, struct takeover_callback_state);
997         if (state == NULL) {
998                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
999                                __FILE__, __LINE__);
1000                 free(iface);
1001                 return -1;
1002         }
1003
1004         state->c = talloc_steal(state, c);
1005         state->addr = talloc(state, ctdb_sock_addr);       
1006         if (state->addr == NULL) {
1007                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1008                                __FILE__, __LINE__);
1009                 free(iface);
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013         *state->addr = pip->addr;
1014         state->vnn   = vnn;
1015
1016         vnn->update_in_flight = true;
1017         talloc_set_destructor(state, ctdb_releaseip_destructor);
1018
1019         ret = ctdb_event_script_callback(ctdb, 
1020                                          state, release_ip_callback, state,
1021                                          CTDB_EVENT_RELEASE_IP,
1022                                          "%s %s %u",
1023                                          iface,
1024                                          ctdb_addr_to_str(&pip->addr),
1025                                          vnn->public_netmask_bits);
1026         free(iface);
1027         if (ret != 0) {
1028                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1029                         ctdb_addr_to_str(&pip->addr),
1030                         ctdb_vnn_iface_string(vnn)));
1031                 talloc_free(state);
1032                 return -1;
1033         }
1034
1035         /* tell the control that we will be reply asynchronously */
1036         *async_reply = true;
1037         return 0;
1038 }
1039
1040 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1041                                    ctdb_sock_addr *addr,
1042                                    unsigned mask, const char *ifaces,
1043                                    bool check_address)
1044 {
1045         struct ctdb_vnn      *vnn;
1046         uint32_t num = 0;
1047         char *tmp;
1048         const char *iface;
1049         int i;
1050         int ret;
1051
1052         tmp = strdup(ifaces);
1053         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1054                 if (!ctdb_sys_check_iface_exists(iface)) {
1055                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1056                         free(tmp);
1057                         return -1;
1058                 }
1059         }
1060         free(tmp);
1061
1062         /* Verify that we don't have an entry for this ip yet */
1063         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1064                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1065                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1066                                 ctdb_addr_to_str(addr)));
1067                         return -1;
1068                 }               
1069         }
1070
1071         /* create a new vnn structure for this ip address */
1072         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1073         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1074         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1075         tmp = talloc_strdup(vnn, ifaces);
1076         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1077         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1078                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1080                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1081                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1082                 num++;
1083         }
1084         talloc_free(tmp);
1085         vnn->ifaces[num] = NULL;
1086         vnn->public_address      = *addr;
1087         vnn->public_netmask_bits = mask;
1088         vnn->pnn                 = -1;
1089         if (check_address) {
1090                 if (ctdb_sys_have_ip(addr)) {
1091                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1092                         vnn->pnn = ctdb->pnn;
1093                 }
1094         }
1095
1096         for (i=0; vnn->ifaces[i]; i++) {
1097                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1098                 if (ret != 0) {
1099                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1100                                            "for public_address[%s]\n",
1101                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1102                         talloc_free(vnn);
1103                         return -1;
1104                 }
1105         }
1106
1107         DLIST_ADD(ctdb->vnn, vnn);
1108
1109         return 0;
1110 }
1111
1112 /*
1113   setup the public address lists from a file
1114 */
1115 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1116 {
1117         char **lines;
1118         int nlines;
1119         int i;
1120
1121         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1122         if (lines == NULL) {
1123                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1124                 return -1;
1125         }
1126         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1127                 nlines--;
1128         }
1129
1130         for (i=0;i<nlines;i++) {
1131                 unsigned mask;
1132                 ctdb_sock_addr addr;
1133                 const char *addrstr;
1134                 const char *ifaces;
1135                 char *tok, *line;
1136
1137                 line = lines[i];
1138                 while ((*line == ' ') || (*line == '\t')) {
1139                         line++;
1140                 }
1141                 if (*line == '#') {
1142                         continue;
1143                 }
1144                 if (strcmp(line, "") == 0) {
1145                         continue;
1146                 }
1147                 tok = strtok(line, " \t");
1148                 addrstr = tok;
1149                 tok = strtok(NULL, " \t");
1150                 if (tok == NULL) {
1151                         if (NULL == ctdb->default_public_interface) {
1152                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1153                                          i+1));
1154                                 talloc_free(lines);
1155                                 return -1;
1156                         }
1157                         ifaces = ctdb->default_public_interface;
1158                 } else {
1159                         ifaces = tok;
1160                 }
1161
1162                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1163                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1164                         talloc_free(lines);
1165                         return -1;
1166                 }
1167                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1168                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1169                         talloc_free(lines);
1170                         return -1;
1171                 }
1172         }
1173
1174
1175         talloc_free(lines);
1176         return 0;
1177 }
1178
1179 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1180                               const char *iface,
1181                               const char *ip)
1182 {
1183         struct ctdb_vnn *svnn;
1184         struct ctdb_interface *cur = NULL;
1185         bool ok;
1186         int ret;
1187
1188         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1189         CTDB_NO_MEMORY(ctdb, svnn);
1190
1191         svnn->ifaces = talloc_array(svnn, const char *, 2);
1192         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1193         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1194         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1195         svnn->ifaces[1] = NULL;
1196
1197         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1198         if (!ok) {
1199                 talloc_free(svnn);
1200                 return -1;
1201         }
1202
1203         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1204         if (ret != 0) {
1205                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1206                                    "for single_ip[%s]\n",
1207                                    svnn->ifaces[0],
1208                                    ctdb_addr_to_str(&svnn->public_address)));
1209                 talloc_free(svnn);
1210                 return -1;
1211         }
1212
1213         /* assume the single public ip interface is initially "good" */
1214         cur = ctdb_find_iface(ctdb, iface);
1215         if (cur == NULL) {
1216                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1217                 return -1;
1218         }
1219         cur->link_up = true;
1220
1221         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1222         if (ret != 0) {
1223                 talloc_free(svnn);
1224                 return -1;
1225         }
1226
1227         ctdb->single_ip_vnn = svnn;
1228         return 0;
1229 }
1230
1231 struct public_ip_list {
1232         struct public_ip_list *next;
1233         uint32_t pnn;
1234         ctdb_sock_addr addr;
1235 };
1236
1237 /* Given a physical node, return the number of
1238    public addresses that is currently assigned to this node.
1239 */
1240 static int node_ip_coverage(struct ctdb_context *ctdb, int32_t pnn,
1241                             struct public_ip_list *ips)
1242 {
1243         int num=0;
1244
1245         for (;ips;ips=ips->next) {
1246                 if (ips->pnn == pnn) {
1247                         num++;
1248                 }
1249         }
1250         return num;
1251 }
1252
1253
1254 /* Can the given node host the given IP: is the public IP known to the
1255  * node and is NOIPHOST unset?
1256 */
1257 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1258                              struct ctdb_ipflags ipflags,
1259                              struct public_ip_list *ip)
1260 {
1261         struct ctdb_public_ip_list_old *public_ips;
1262         int i;
1263
1264         if (ipflags.noiphost) {
1265                 return false;
1266         }
1267
1268         public_ips = ctdb->nodes[pnn]->available_public_ips;
1269
1270         if (public_ips == NULL) {
1271                 return false;
1272         }
1273
1274         for (i=0; i<public_ips->num; i++) {
1275                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1276                         /* yes, this node can serve this public ip */
1277                         return true;
1278                 }
1279         }
1280
1281         return false;
1282 }
1283
1284 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1285                                  struct ctdb_ipflags ipflags,
1286                                  struct public_ip_list *ip)
1287 {
1288         if (ipflags.noiptakeover) {
1289                 return false;
1290         }
1291
1292         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1293 }
1294
1295 /* search the node lists list for a node to takeover this ip.
1296    pick the node that currently are serving the least number of ips
1297    so that the ips get spread out evenly.
1298 */
1299 static int find_takeover_node(struct ctdb_context *ctdb,
1300                               struct ctdb_ipflags *ipflags,
1301                               struct public_ip_list *ip,
1302                               struct public_ip_list *all_ips)
1303 {
1304         int pnn, min=0, num;
1305         int i, numnodes;
1306
1307         numnodes = talloc_array_length(ipflags);
1308         pnn    = -1;
1309         for (i=0; i<numnodes; i++) {
1310                 /* verify that this node can serve this ip */
1311                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1312                         /* no it couldnt   so skip to the next node */
1313                         continue;
1314                 }
1315
1316                 num = node_ip_coverage(ctdb, i, all_ips);
1317                 /* was this the first node we checked ? */
1318                 if (pnn == -1) {
1319                         pnn = i;
1320                         min  = num;
1321                 } else {
1322                         if (num < min) {
1323                                 pnn = i;
1324                                 min  = num;
1325                         }
1326                 }
1327         }       
1328         if (pnn == -1) {
1329                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1330                         ctdb_addr_to_str(&ip->addr)));
1331
1332                 return -1;
1333         }
1334
1335         ip->pnn = pnn;
1336         return 0;
1337 }
1338
1339 #define IP_KEYLEN       4
1340 static uint32_t *ip_key(ctdb_sock_addr *ip)
1341 {
1342         static uint32_t key[IP_KEYLEN];
1343
1344         bzero(key, sizeof(key));
1345
1346         switch (ip->sa.sa_family) {
1347         case AF_INET:
1348                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1349                 break;
1350         case AF_INET6: {
1351                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1352                 key[0]  = htonl(s6_a32[0]);
1353                 key[1]  = htonl(s6_a32[1]);
1354                 key[2]  = htonl(s6_a32[2]);
1355                 key[3]  = htonl(s6_a32[3]);
1356                 break;
1357         }
1358         default:
1359                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1360                 return key;
1361         }
1362
1363         return key;
1364 }
1365
1366 static void *add_ip_callback(void *parm, void *data)
1367 {
1368         struct public_ip_list *this_ip = parm;
1369         struct public_ip_list *prev_ip = data;
1370
1371         if (prev_ip == NULL) {
1372                 return parm;
1373         }
1374         if (this_ip->pnn == -1) {
1375                 this_ip->pnn = prev_ip->pnn;
1376         }
1377
1378         return parm;
1379 }
1380
1381 static int getips_count_callback(void *param, void *data)
1382 {
1383         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1384         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1385
1386         new_ip->next = *ip_list;
1387         *ip_list     = new_ip;
1388         return 0;
1389 }
1390
1391 static struct public_ip_list *
1392 create_merged_ip_list(struct ctdb_context *ctdb)
1393 {
1394         int i, j;
1395         struct public_ip_list *ip_list;
1396         struct ctdb_public_ip_list_old *public_ips;
1397
1398         if (ctdb->ip_tree != NULL) {
1399                 talloc_free(ctdb->ip_tree);
1400                 ctdb->ip_tree = NULL;
1401         }
1402         ctdb->ip_tree = trbt_create(ctdb, 0);
1403
1404         for (i=0;i<ctdb->num_nodes;i++) {
1405                 public_ips = ctdb->nodes[i]->known_public_ips;
1406
1407                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1408                         continue;
1409                 }
1410
1411                 /* there were no public ips for this node */
1412                 if (public_ips == NULL) {
1413                         continue;
1414                 }               
1415
1416                 for (j=0;j<public_ips->num;j++) {
1417                         struct public_ip_list *tmp_ip;
1418
1419                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1420                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1421                         /* Do not use information about IP addresses hosted
1422                          * on other nodes, it may not be accurate */
1423                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1424                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1425                         } else {
1426                                 tmp_ip->pnn = -1;
1427                         }
1428                         tmp_ip->addr = public_ips->ips[j].addr;
1429                         tmp_ip->next = NULL;
1430
1431                         trbt_insertarray32_callback(ctdb->ip_tree,
1432                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1433                                 add_ip_callback,
1434                                 tmp_ip);
1435                 }
1436         }
1437
1438         ip_list = NULL;
1439         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1440
1441         return ip_list;
1442 }
1443
1444 /* 
1445  * This is the length of the longtest common prefix between the IPs.
1446  * It is calculated by XOR-ing the 2 IPs together and counting the
1447  * number of leading zeroes.  The implementation means that all
1448  * addresses end up being 128 bits long.
1449  *
1450  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1451  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1452  * lots of nodes and IP addresses?
1453  */
1454 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1455 {
1456         uint32_t ip1_k[IP_KEYLEN];
1457         uint32_t *t;
1458         int i;
1459         uint32_t x;
1460
1461         uint32_t distance = 0;
1462
1463         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1464         t = ip_key(ip2);
1465         for (i=0; i<IP_KEYLEN; i++) {
1466                 x = ip1_k[i] ^ t[i];
1467                 if (x == 0) {
1468                         distance += 32;
1469                 } else {
1470                         /* Count number of leading zeroes. 
1471                          * FIXME? This could be optimised...
1472                          */
1473                         while ((x & (1 << 31)) == 0) {
1474                                 x <<= 1;
1475                                 distance += 1;
1476                         }
1477                 }
1478         }
1479
1480         return distance;
1481 }
1482
1483 /* Calculate the IP distance for the given IP relative to IPs on the
1484    given node.  The ips argument is generally the all_ips variable
1485    used in the main part of the algorithm.
1486  */
1487 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1488                                   struct public_ip_list *ips,
1489                                   int pnn)
1490 {
1491         struct public_ip_list *t;
1492         uint32_t d;
1493
1494         uint32_t sum = 0;
1495
1496         for (t=ips; t != NULL; t=t->next) {
1497                 if (t->pnn != pnn) {
1498                         continue;
1499                 }
1500
1501                 /* Optimisation: We never calculate the distance
1502                  * between an address and itself.  This allows us to
1503                  * calculate the effect of removing an address from a
1504                  * node by simply calculating the distance between
1505                  * that address and all of the exitsing addresses.
1506                  * Moreover, we assume that we're only ever dealing
1507                  * with addresses from all_ips so we can identify an
1508                  * address via a pointer rather than doing a more
1509                  * expensive address comparison. */
1510                 if (&(t->addr) == ip) {
1511                         continue;
1512                 }
1513
1514                 d = ip_distance(ip, &(t->addr));
1515                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1516         }
1517
1518         return sum;
1519 }
1520
1521 /* Return the LCP2 imbalance metric for addresses currently assigned
1522    to the given node.
1523  */
1524 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1525 {
1526         struct public_ip_list *t;
1527
1528         uint32_t imbalance = 0;
1529
1530         for (t=all_ips; t!=NULL; t=t->next) {
1531                 if (t->pnn != pnn) {
1532                         continue;
1533                 }
1534                 /* Pass the rest of the IPs rather than the whole
1535                    all_ips input list.
1536                 */
1537                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1538         }
1539
1540         return imbalance;
1541 }
1542
1543 /* Allocate any unassigned IPs just by looping through the IPs and
1544  * finding the best node for each.
1545  */
1546 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1547                                       struct ctdb_ipflags *ipflags,
1548                                       struct public_ip_list *all_ips)
1549 {
1550         struct public_ip_list *tmp_ip;
1551
1552         /* loop over all ip's and find a physical node to cover for 
1553            each unassigned ip.
1554         */
1555         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1556                 if (tmp_ip->pnn == -1) {
1557                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1558                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1559                                         ctdb_addr_to_str(&tmp_ip->addr)));
1560                         }
1561                 }
1562         }
1563 }
1564
1565 /* Basic non-deterministic rebalancing algorithm.
1566  */
1567 static void basic_failback(struct ctdb_context *ctdb,
1568                            struct ctdb_ipflags *ipflags,
1569                            struct public_ip_list *all_ips,
1570                            int num_ips)
1571 {
1572         int i, numnodes;
1573         int maxnode, maxnum, minnode, minnum, num, retries;
1574         struct public_ip_list *tmp_ip;
1575
1576         numnodes = talloc_array_length(ipflags);
1577         retries = 0;
1578
1579 try_again:
1580         maxnum=0;
1581         minnum=0;
1582
1583         /* for each ip address, loop over all nodes that can serve
1584            this ip and make sure that the difference between the node
1585            serving the most and the node serving the least ip's are
1586            not greater than 1.
1587         */
1588         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1589                 if (tmp_ip->pnn == -1) {
1590                         continue;
1591                 }
1592
1593                 /* Get the highest and lowest number of ips's served by any 
1594                    valid node which can serve this ip.
1595                 */
1596                 maxnode = -1;
1597                 minnode = -1;
1598                 for (i=0; i<numnodes; i++) {
1599                         /* only check nodes that can actually serve this ip */
1600                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1601                                 /* no it couldnt   so skip to the next node */
1602                                 continue;
1603                         }
1604
1605                         num = node_ip_coverage(ctdb, i, all_ips);
1606                         if (maxnode == -1) {
1607                                 maxnode = i;
1608                                 maxnum  = num;
1609                         } else {
1610                                 if (num > maxnum) {
1611                                         maxnode = i;
1612                                         maxnum  = num;
1613                                 }
1614                         }
1615                         if (minnode == -1) {
1616                                 minnode = i;
1617                                 minnum  = num;
1618                         } else {
1619                                 if (num < minnum) {
1620                                         minnode = i;
1621                                         minnum  = num;
1622                                 }
1623                         }
1624                 }
1625                 if (maxnode == -1) {
1626                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1627                                 ctdb_addr_to_str(&tmp_ip->addr)));
1628
1629                         continue;
1630                 }
1631
1632                 /* if the spread between the smallest and largest coverage by
1633                    a node is >=2 we steal one of the ips from the node with
1634                    most coverage to even things out a bit.
1635                    try to do this a limited number of times since we dont
1636                    want to spend too much time balancing the ip coverage.
1637                 */
1638                 if ( (maxnum > minnum+1)
1639                      && (retries < (num_ips + 5)) ){
1640                         struct public_ip_list *tmp;
1641
1642                         /* Reassign one of maxnode's VNNs */
1643                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1644                                 if (tmp->pnn == maxnode) {
1645                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1646                                         retries++;
1647                                         goto try_again;;
1648                                 }
1649                         }
1650                 }
1651         }
1652 }
1653
1654 static void lcp2_init(struct ctdb_context *tmp_ctx,
1655                       struct ctdb_ipflags *ipflags,
1656                       struct public_ip_list *all_ips,
1657                       uint32_t *force_rebalance_nodes,
1658                       uint32_t **lcp2_imbalances,
1659                       bool **rebalance_candidates)
1660 {
1661         int i, numnodes;
1662         struct public_ip_list *tmp_ip;
1663
1664         numnodes = talloc_array_length(ipflags);
1665
1666         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1667         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1668         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1669         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1670
1671         for (i=0; i<numnodes; i++) {
1672                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1673                 /* First step: assume all nodes are candidates */
1674                 (*rebalance_candidates)[i] = true;
1675         }
1676
1677         /* 2nd step: if a node has IPs assigned then it must have been
1678          * healthy before, so we remove it from consideration.  This
1679          * is overkill but is all we have because we don't maintain
1680          * state between takeover runs.  An alternative would be to
1681          * keep state and invalidate it every time the recovery master
1682          * changes.
1683          */
1684         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1685                 if (tmp_ip->pnn != -1) {
1686                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1687                 }
1688         }
1689
1690         /* 3rd step: if a node is forced to re-balance then
1691            we allow failback onto the node */
1692         if (force_rebalance_nodes == NULL) {
1693                 return;
1694         }
1695         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1696                 uint32_t pnn = force_rebalance_nodes[i];
1697                 if (pnn >= numnodes) {
1698                         DEBUG(DEBUG_ERR,
1699                               (__location__ "unknown node %u\n", pnn));
1700                         continue;
1701                 }
1702
1703                 DEBUG(DEBUG_NOTICE,
1704                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1705                 (*rebalance_candidates)[pnn] = true;
1706         }
1707 }
1708
1709 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1710  * the IP/node combination that will cost the least.
1711  */
1712 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1713                                      struct ctdb_ipflags *ipflags,
1714                                      struct public_ip_list *all_ips,
1715                                      uint32_t *lcp2_imbalances)
1716 {
1717         struct public_ip_list *tmp_ip;
1718         int dstnode, numnodes;
1719
1720         int minnode;
1721         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1722         struct public_ip_list *minip;
1723
1724         bool should_loop = true;
1725         bool have_unassigned = true;
1726
1727         numnodes = talloc_array_length(ipflags);
1728
1729         while (have_unassigned && should_loop) {
1730                 should_loop = false;
1731
1732                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1733                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1734
1735                 minnode = -1;
1736                 mindsum = 0;
1737                 minip = NULL;
1738
1739                 /* loop over each unassigned ip. */
1740                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1741                         if (tmp_ip->pnn != -1) {
1742                                 continue;
1743                         }
1744
1745                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1746                                 /* only check nodes that can actually takeover this ip */
1747                                 if (!can_node_takeover_ip(ctdb, dstnode,
1748                                                           ipflags[dstnode],
1749                                                           tmp_ip)) {
1750                                         /* no it couldnt   so skip to the next node */
1751                                         continue;
1752                                 }
1753
1754                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1755                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1756                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1757                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1758                                                    dstnode,
1759                                                    dstimbl - lcp2_imbalances[dstnode]));
1760
1761
1762                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1763                                         minnode = dstnode;
1764                                         minimbl = dstimbl;
1765                                         mindsum = dstdsum;
1766                                         minip = tmp_ip;
1767                                         should_loop = true;
1768                                 }
1769                         }
1770                 }
1771
1772                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1773
1774                 /* If we found one then assign it to the given node. */
1775                 if (minnode != -1) {
1776                         minip->pnn = minnode;
1777                         lcp2_imbalances[minnode] = minimbl;
1778                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1779                                           ctdb_addr_to_str(&(minip->addr)),
1780                                           minnode,
1781                                           mindsum));
1782                 }
1783
1784                 /* There might be a better way but at least this is clear. */
1785                 have_unassigned = false;
1786                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1787                         if (tmp_ip->pnn == -1) {
1788                                 have_unassigned = true;
1789                         }
1790                 }
1791         }
1792
1793         /* We know if we have an unassigned addresses so we might as
1794          * well optimise.
1795          */
1796         if (have_unassigned) {
1797                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1798                         if (tmp_ip->pnn == -1) {
1799                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1800                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1801                         }
1802                 }
1803         }
1804 }
1805
1806 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1807  * to move IPs from, determines the best IP/destination node
1808  * combination to move from the source node.
1809  */
1810 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1811                                     struct ctdb_ipflags *ipflags,
1812                                     struct public_ip_list *all_ips,
1813                                     int srcnode,
1814                                     uint32_t *lcp2_imbalances,
1815                                     bool *rebalance_candidates)
1816 {
1817         int dstnode, mindstnode, numnodes;
1818         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1819         uint32_t minsrcimbl, mindstimbl;
1820         struct public_ip_list *minip;
1821         struct public_ip_list *tmp_ip;
1822
1823         /* Find an IP and destination node that best reduces imbalance. */
1824         srcimbl = 0;
1825         minip = NULL;
1826         minsrcimbl = 0;
1827         mindstnode = -1;
1828         mindstimbl = 0;
1829
1830         numnodes = talloc_array_length(ipflags);
1831
1832         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1833         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1834                            srcnode, lcp2_imbalances[srcnode]));
1835
1836         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1837                 /* Only consider addresses on srcnode. */
1838                 if (tmp_ip->pnn != srcnode) {
1839                         continue;
1840                 }
1841
1842                 /* What is this IP address costing the source node? */
1843                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1844                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1845
1846                 /* Consider this IP address would cost each potential
1847                  * destination node.  Destination nodes are limited to
1848                  * those that are newly healthy, since we don't want
1849                  * to do gratuitous failover of IPs just to make minor
1850                  * balance improvements.
1851                  */
1852                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1853                         if (!rebalance_candidates[dstnode]) {
1854                                 continue;
1855                         }
1856
1857                         /* only check nodes that can actually takeover this ip */
1858                         if (!can_node_takeover_ip(ctdb, dstnode,
1859                                                   ipflags[dstnode], tmp_ip)) {
1860                                 /* no it couldnt   so skip to the next node */
1861                                 continue;
1862                         }
1863
1864                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1865                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1866                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1867                                            srcnode, -srcdsum,
1868                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1869                                            dstnode, dstdsum));
1870
1871                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1872                             (dstdsum < srcdsum) &&                      \
1873                             ((mindstnode == -1) ||                              \
1874                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1875
1876                                 minip = tmp_ip;
1877                                 minsrcimbl = srcimbl;
1878                                 mindstnode = dstnode;
1879                                 mindstimbl = dstimbl;
1880                         }
1881                 }
1882         }
1883         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1884
1885         if (mindstnode != -1) {
1886                 /* We found a move that makes things better... */
1887                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1888                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1889                                   ctdb_addr_to_str(&(minip->addr)),
1890                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1891
1892
1893                 lcp2_imbalances[srcnode] = minsrcimbl;
1894                 lcp2_imbalances[mindstnode] = mindstimbl;
1895                 minip->pnn = mindstnode;
1896
1897                 return true;
1898         }
1899
1900         return false;
1901         
1902 }
1903
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn: orders entries by
 * descending imbalance (the larger imbalance sorts first).  Entries
 * with equal imbalance compare equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1922
1923 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1924  * node with the highest LCP2 imbalance, and then determines the best
1925  * IP/destination node combination to move from the source node.
1926  */
1927 static void lcp2_failback(struct ctdb_context *ctdb,
1928                           struct ctdb_ipflags *ipflags,
1929                           struct public_ip_list *all_ips,
1930                           uint32_t *lcp2_imbalances,
1931                           bool *rebalance_candidates)
1932 {
1933         int i, numnodes;
1934         struct lcp2_imbalance_pnn * lips;
1935         bool again;
1936
1937         numnodes = talloc_array_length(ipflags);
1938
1939 try_again:
1940         /* Put the imbalances and nodes into an array, sort them and
1941          * iterate through candidates.  Usually the 1st one will be
1942          * used, so this doesn't cost much...
1943          */
1944         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1945         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1946         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1947         for (i=0; i<numnodes; i++) {
1948                 lips[i].imbalance = lcp2_imbalances[i];
1949                 lips[i].pnn = i;
1950                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1951         }
1952         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1953               lcp2_cmp_imbalance_pnn);
1954
1955         again = false;
1956         for (i=0; i<numnodes; i++) {
1957                 /* This means that all nodes had 0 or 1 addresses, so
1958                  * can't be imbalanced.
1959                  */
1960                 if (lips[i].imbalance == 0) {
1961                         break;
1962                 }
1963
1964                 if (lcp2_failback_candidate(ctdb,
1965                                             ipflags,
1966                                             all_ips,
1967                                             lips[i].pnn,
1968                                             lcp2_imbalances,
1969                                             rebalance_candidates)) {
1970                         again = true;
1971                         break;
1972                 }
1973         }
1974
1975         talloc_free(lips);
1976         if (again) {
1977                 goto try_again;
1978         }
1979 }
1980
1981 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1982                                     struct ctdb_ipflags *ipflags,
1983                                     struct public_ip_list *all_ips)
1984 {
1985         struct public_ip_list *tmp_ip;
1986
1987         /* verify that the assigned nodes can serve that public ip
1988            and set it to -1 if not
1989         */
1990         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1991                 if (tmp_ip->pnn == -1) {
1992                         continue;
1993                 }
1994                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1995                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1996                         /* this node can not serve this ip. */
1997                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1998                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1999                                            tmp_ip->pnn));
2000                         tmp_ip->pnn = -1;
2001                 }
2002         }
2003 }
2004
2005 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2006                                        struct ctdb_ipflags *ipflags,
2007                                        struct public_ip_list *all_ips)
2008 {
2009         struct public_ip_list *tmp_ip;
2010         int i, numnodes;
2011
2012         numnodes = talloc_array_length(ipflags);
2013
2014         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2015        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2016         *  always be allocated the same way for a specific set of
2017         *  available/unavailable nodes.
2018         */
2019
2020         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2021                 tmp_ip->pnn = i % numnodes;
2022         }
2023
2024         /* IP failback doesn't make sense with deterministic
2025          * IPs, since the modulo step above implicitly fails
2026          * back IPs to their "home" node.
2027          */
2028         if (1 == ctdb->tunable.no_ip_failback) {
2029                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2030         }
2031
2032         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2033
2034         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2035
2036         /* No failback here! */
2037 }
2038
2039 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2040                                           struct ctdb_ipflags *ipflags,
2041                                           struct public_ip_list *all_ips)
2042 {
2043         /* This should be pushed down into basic_failback. */
2044         struct public_ip_list *tmp_ip;
2045         int num_ips = 0;
2046         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2047                 num_ips++;
2048         }
2049
2050         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2051
2052         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2053
2054         /* If we don't want IPs to fail back then don't rebalance IPs. */
2055         if (1 == ctdb->tunable.no_ip_failback) {
2056                 return;
2057         }
2058
2059         /* Now, try to make sure the ip adresses are evenly distributed
2060            across the nodes.
2061         */
2062         basic_failback(ctdb, ipflags, all_ips, num_ips);
2063 }
2064
2065 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2066                           struct ctdb_ipflags *ipflags,
2067                           struct public_ip_list *all_ips,
2068                           uint32_t *force_rebalance_nodes)
2069 {
2070         uint32_t *lcp2_imbalances;
2071         bool *rebalance_candidates;
2072         int numnodes, num_rebalance_candidates, i;
2073
2074         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2075
2076         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2077
2078         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2079                   &lcp2_imbalances, &rebalance_candidates);
2080
2081         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2082
2083         /* If we don't want IPs to fail back then don't rebalance IPs. */
2084         if (1 == ctdb->tunable.no_ip_failback) {
2085                 goto finished;
2086         }
2087
2088         /* It is only worth continuing if we have suitable target
2089          * nodes to transfer IPs to.  This check is much cheaper than
2090          * continuing on...
2091          */
2092         numnodes = talloc_array_length(ipflags);
2093         num_rebalance_candidates = 0;
2094         for (i=0; i<numnodes; i++) {
2095                 if (rebalance_candidates[i]) {
2096                         num_rebalance_candidates++;
2097                 }
2098         }
2099         if (num_rebalance_candidates == 0) {
2100                 goto finished;
2101         }
2102
2103         /* Now, try to make sure the ip adresses are evenly distributed
2104            across the nodes.
2105         */
2106         lcp2_failback(ctdb, ipflags, all_ips,
2107                       lcp2_imbalances, rebalance_candidates);
2108
2109 finished:
2110         talloc_free(tmp_ctx);
2111 }
2112
2113 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2114 {
2115         int i;
2116
2117         for (i=0;i<nodemap->num;i++) {
2118                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2119                         /* Found one completely healthy node */
2120                         return false;
2121                 }
2122         }
2123
2124         return true;
2125 }
2126
2127 /* The calculation part of the IP allocation algorithm. */
2128 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2129                                    struct ctdb_ipflags *ipflags,
2130                                    struct public_ip_list **all_ips_p,
2131                                    uint32_t *force_rebalance_nodes)
2132 {
2133         /* since nodes only know about those public addresses that
2134            can be served by that particular node, no single node has
2135            a full list of all public addresses that exist in the cluster.
2136            Walk over all node structures and create a merged list of
2137            all public addresses that exist in the cluster.
2138
2139            keep the tree of ips around as ctdb->ip_tree
2140         */
2141         *all_ips_p = create_merged_ip_list(ctdb);
2142
2143         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2144                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2145         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2146                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2147         } else {
2148                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2149         }
2150
2151         /* at this point ->pnn is the node which will own each IP
2152            or -1 if there is no node that can cover this ip
2153         */
2154
2155         return;
2156 }
2157
2158 struct get_tunable_callback_data {
2159         const char *tunable;
2160         uint32_t *out;
2161         bool fatal;
2162 };
2163
2164 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2165                                  int32_t res, TDB_DATA outdata,
2166                                  void *callback)
2167 {
2168         struct get_tunable_callback_data *cd =
2169                 (struct get_tunable_callback_data *)callback;
2170         int size;
2171
2172         if (res != 0) {
2173                 /* Already handled in fail callback */
2174                 return;
2175         }
2176
2177         if (outdata.dsize != sizeof(uint32_t)) {
2178                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2179                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2180                                  (int)outdata.dsize));
2181                 cd->fatal = true;
2182                 return;
2183         }
2184
2185         size = talloc_array_length(cd->out);
2186         if (pnn >= size) {
2187                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2188                                  cd->tunable, pnn, size));
2189                 return;
2190         }
2191
2192                 
2193         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2194 }
2195
2196 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2197                                        int32_t res, TDB_DATA outdata,
2198                                        void *callback)
2199 {
2200         struct get_tunable_callback_data *cd =
2201                 (struct get_tunable_callback_data *)callback;
2202
2203         switch (res) {
2204         case -ETIME:
2205                 DEBUG(DEBUG_ERR,
2206                       ("Timed out getting tunable \"%s\" from node %d\n",
2207                        cd->tunable, pnn));
2208                 cd->fatal = true;
2209                 break;
2210         case -EINVAL:
2211         case -1:
2212                 DEBUG(DEBUG_WARNING,
2213                       ("Tunable \"%s\" not implemented on node %d\n",
2214                        cd->tunable, pnn));
2215                 break;
2216         default:
2217                 DEBUG(DEBUG_ERR,
2218                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2219                        cd->tunable, pnn));
2220                 cd->fatal = true;
2221         }
2222 }
2223
2224 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2225                                         TALLOC_CTX *tmp_ctx,
2226                                         struct ctdb_node_map_old *nodemap,
2227                                         const char *tunable,
2228                                         uint32_t default_value)
2229 {
2230         TDB_DATA data;
2231         struct ctdb_control_get_tunable *t;
2232         uint32_t *nodes;
2233         uint32_t *tvals;
2234         struct get_tunable_callback_data callback_data;
2235         int i;
2236
2237         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2238         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2239         for (i=0; i<nodemap->num; i++) {
2240                 tvals[i] = default_value;
2241         }
2242                 
2243         callback_data.out = tvals;
2244         callback_data.tunable = tunable;
2245         callback_data.fatal = false;
2246
2247         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2248         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2249         t = (struct ctdb_control_get_tunable *)data.dptr;
2250         t->length = strlen(tunable)+1;
2251         memcpy(t->name, tunable, t->length);
2252         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2253         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2254                                       nodes, 0, TAKEOVER_TIMEOUT(),
2255                                       false, data,
2256                                       get_tunable_callback,
2257                                       get_tunable_fail_callback,
2258                                       &callback_data) != 0) {
2259                 if (callback_data.fatal) {
2260                         talloc_free(tvals);
2261                         tvals = NULL;
2262                 }
2263         }
2264         talloc_free(nodes);
2265         talloc_free(data.dptr);
2266
2267         return tvals;
2268 }
2269
2270 struct get_runstate_callback_data {
2271         enum ctdb_runstate *out;
2272         bool fatal;
2273 };
2274
2275 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2276                                   int32_t res, TDB_DATA outdata,
2277                                   void *callback_data)
2278 {
2279         struct get_runstate_callback_data *cd =
2280                 (struct get_runstate_callback_data *)callback_data;
2281         int size;
2282
2283         if (res != 0) {
2284                 /* Already handled in fail callback */
2285                 return;
2286         }
2287
2288         if (outdata.dsize != sizeof(uint32_t)) {
2289                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2290                                  pnn, (int)sizeof(uint32_t),
2291                                  (int)outdata.dsize));
2292                 cd->fatal = true;
2293                 return;
2294         }
2295
2296         size = talloc_array_length(cd->out);
2297         if (pnn >= size) {
2298                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2299                                  pnn, size));
2300                 return;
2301         }
2302
2303         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2304 }
2305
2306 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2307                                        int32_t res, TDB_DATA outdata,
2308                                        void *callback)
2309 {
2310         struct get_runstate_callback_data *cd =
2311                 (struct get_runstate_callback_data *)callback;
2312
2313         switch (res) {
2314         case -ETIME:
2315                 DEBUG(DEBUG_ERR,
2316                       ("Timed out getting runstate from node %d\n", pnn));
2317                 cd->fatal = true;
2318                 break;
2319         default:
2320                 DEBUG(DEBUG_WARNING,
2321                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2322                        pnn));
2323         }
2324 }
2325
2326 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2327                                                     TALLOC_CTX *tmp_ctx,
2328                                                     struct ctdb_node_map_old *nodemap,
2329                                                     enum ctdb_runstate default_value)
2330 {
2331         uint32_t *nodes;
2332         enum ctdb_runstate *rs;
2333         struct get_runstate_callback_data callback_data;
2334         int i;
2335
2336         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2337         CTDB_NO_MEMORY_NULL(ctdb, rs);
2338         for (i=0; i<nodemap->num; i++) {
2339                 rs[i] = default_value;
2340         }
2341
2342         callback_data.out = rs;
2343         callback_data.fatal = false;
2344
2345         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2346         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2347                                       nodes, 0, TAKEOVER_TIMEOUT(),
2348                                       true, tdb_null,
2349                                       get_runstate_callback,
2350                                       get_runstate_fail_callback,
2351                                       &callback_data) != 0) {
2352                 if (callback_data.fatal) {
2353                         free(rs);
2354                         rs = NULL;
2355                 }
2356         }
2357         talloc_free(nodes);
2358
2359         return rs;
2360 }
2361
2362 /* Set internal flags for IP allocation:
2363  *   Clear ip flags
2364  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2365  *   Set NOIPHOST ip flag for each INACTIVE node
2366  *   if all nodes are disabled:
2367  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2368  *   else
2369  *     Set NOIPHOST ip flags for disabled nodes
2370  */
2371 static struct ctdb_ipflags *
2372 set_ipflags_internal(struct ctdb_context *ctdb,
2373                      TALLOC_CTX *tmp_ctx,
2374                      struct ctdb_node_map_old *nodemap,
2375                      uint32_t *tval_noiptakeover,
2376                      uint32_t *tval_noiphostonalldisabled,
2377                      enum ctdb_runstate *runstate)
2378 {
2379         int i;
2380         struct ctdb_ipflags *ipflags;
2381
2382         /* Clear IP flags - implicit due to talloc_zero */
2383         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2384         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2385
2386         for (i=0;i<nodemap->num;i++) {
2387                 /* Can not take IPs on node with NoIPTakeover set */
2388                 if (tval_noiptakeover[i] != 0) {
2389                         ipflags[i].noiptakeover = true;
2390                 }
2391
2392                 /* Can not host IPs on node not in RUNNING state */
2393                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2394                         ipflags[i].noiphost = true;
2395                         continue;
2396                 }
2397                 /* Can not host IPs on INACTIVE node */
2398                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2399                         ipflags[i].noiphost = true;
2400                 }
2401                 /* Remember the runstate */
2402                 ipflags[i].runstate = runstate[i];
2403         }
2404
2405         if (all_nodes_are_disabled(nodemap)) {
2406                 /* If all nodes are disabled, can not host IPs on node
2407                  * with NoIPHostOnAllDisabled set
2408                  */
2409                 for (i=0;i<nodemap->num;i++) {
2410                         if (tval_noiphostonalldisabled[i] != 0) {
2411                                 ipflags[i].noiphost = true;
2412                         }
2413                 }
2414         } else {
2415                 /* If some nodes are not disabled, then can not host
2416                  * IPs on DISABLED node
2417                  */
2418                 for (i=0;i<nodemap->num;i++) {
2419                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2420                                 ipflags[i].noiphost = true;
2421                         }
2422                 }
2423         }
2424
2425         return ipflags;
2426 }
2427
2428 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2429                                         TALLOC_CTX *tmp_ctx,
2430                                         struct ctdb_node_map_old *nodemap)
2431 {
2432         uint32_t *tval_noiptakeover;
2433         uint32_t *tval_noiphostonalldisabled;
2434         struct ctdb_ipflags *ipflags;
2435         enum ctdb_runstate *runstate;
2436
2437
2438         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2439                                                    "NoIPTakeover", 0);
2440         if (tval_noiptakeover == NULL) {
2441                 return NULL;
2442         }
2443
2444         tval_noiphostonalldisabled =
2445                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2446                                        "NoIPHostOnAllDisabled", 0);
2447         if (tval_noiphostonalldisabled == NULL) {
2448                 /* Caller frees tmp_ctx */
2449                 return NULL;
2450         }
2451
2452         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2453          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2454          * reasonable behaviour on a mixed cluster during upgrade.
2455          */
2456         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2457                                            CTDB_RUNSTATE_RUNNING);
2458         if (runstate == NULL) {
2459                 /* Caller frees tmp_ctx */
2460                 return NULL;
2461         }
2462
2463         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2464                                        tval_noiptakeover,
2465                                        tval_noiphostonalldisabled,
2466                                        runstate);
2467
2468         talloc_free(tval_noiptakeover);
2469         talloc_free(tval_noiphostonalldisabled);
2470         talloc_free(runstate);
2471
2472         return ipflags;
2473 }
2474
2475 struct iprealloc_callback_data {
2476         bool *retry_nodes;
2477         int retry_count;
2478         client_async_callback fail_callback;
2479         void *fail_callback_data;
2480         struct ctdb_node_map_old *nodemap;
2481 };
2482
2483 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2484                                         int32_t res, TDB_DATA outdata,
2485                                         void *callback)
2486 {
2487         int numnodes;
2488         struct iprealloc_callback_data *cd =
2489                 (struct iprealloc_callback_data *)callback;
2490
2491         numnodes = talloc_array_length(cd->retry_nodes);
2492         if (pnn > numnodes) {
2493                 DEBUG(DEBUG_ERR,
2494                       ("ipreallocated failure from node %d, "
2495                        "but only %d nodes in nodemap\n",
2496                        pnn, numnodes));
2497                 return;
2498         }
2499
2500         /* Can't run the "ipreallocated" event on a INACTIVE node */
2501         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2502                 DEBUG(DEBUG_WARNING,
2503                       ("ipreallocated failed on inactive node %d, ignoring\n",
2504                        pnn));
2505                 return;
2506         }
2507
2508         switch (res) {
2509         case -ETIME:
2510                 /* If the control timed out then that's a real error,
2511                  * so call the real fail callback
2512                  */
2513                 if (cd->fail_callback) {
2514                         cd->fail_callback(ctdb, pnn, res, outdata,
2515                                           cd->fail_callback_data);
2516                 } else {
2517                         DEBUG(DEBUG_WARNING,
2518                               ("iprealloc timed out but no callback registered\n"));
2519                 }
2520                 break;
2521         default:
2522                 /* If not a timeout then either the ipreallocated
2523                  * eventscript (or some setup) failed.  This might
2524                  * have failed because the IPREALLOCATED control isn't
2525                  * implemented - right now there is no way of knowing
2526                  * because the error codes are all folded down to -1.
2527                  * Consider retrying using EVENTSCRIPT control...
2528                  */
2529                 DEBUG(DEBUG_WARNING,
2530                       ("ipreallocated failure from node %d, flagging retry\n",
2531                        pnn));
2532                 cd->retry_nodes[pnn] = true;
2533                 cd->retry_count++;
2534         }
2535 }
2536
2537 struct takeover_callback_data {
2538         bool *node_failed;
2539         client_async_callback fail_callback;
2540         void *fail_callback_data;
2541         struct ctdb_node_map_old *nodemap;
2542 };
2543
2544 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2545                                        uint32_t node_pnn, int32_t res,
2546                                        TDB_DATA outdata, void *callback_data)
2547 {
2548         struct takeover_callback_data *cd =
2549                 talloc_get_type_abort(callback_data,
2550                                       struct takeover_callback_data);
2551         int i;
2552
2553         for (i = 0; i < cd->nodemap->num; i++) {
2554                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2555                         break;
2556                 }
2557         }
2558
2559         if (i == cd->nodemap->num) {
2560                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2561                 return;
2562         }
2563
2564         if (!cd->node_failed[i]) {
2565                 cd->node_failed[i] = true;
2566                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2567                                   cd->fail_callback_data);
2568         }
2569 }
2570
2571 /*
2572   make any IP alias changes for public addresses that are necessary 
2573  */
2574 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2575                       uint32_t *force_rebalance_nodes,
2576                       client_async_callback fail_callback, void *callback_data)
2577 {
2578         int i, j, ret;
2579         struct ctdb_public_ip ip;
2580         uint32_t *nodes;
2581         struct public_ip_list *all_ips, *tmp_ip;
2582         TDB_DATA data;
2583         struct timeval timeout;
2584         struct client_async_data *async_data;
2585         struct ctdb_client_control_state *state;
2586         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2587         struct ctdb_ipflags *ipflags;
2588         struct takeover_callback_data *takeover_data;
2589         struct iprealloc_callback_data iprealloc_data;
2590         bool *retry_data;
2591         bool can_host_ips;
2592
2593         /*
2594          * ip failover is completely disabled, just send out the 
2595          * ipreallocated event.
2596          */
2597         if (ctdb->tunable.disable_ip_failover != 0) {
2598                 goto ipreallocated;
2599         }
2600
2601         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2602         if (ipflags == NULL) {
2603                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2604                 talloc_free(tmp_ctx);
2605                 return -1;
2606         }
2607
2608         /* Short-circuit IP allocation if no nodes are in the RUNNING
2609          * runstate yet, since no nodes will be able to host IPs */
2610         can_host_ips = false;
2611         for (i=0; i<nodemap->num; i++) {
2612                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2613                         can_host_ips = true;
2614                 }
2615         }
2616         if (!can_host_ips) {
2617                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2618                 return 0;
2619         }
2620
2621         /* Do the IP reassignment calculations */
2622         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2623
2624         /* Now tell all nodes to release any public IPs should not
2625          * host.  This will be a NOOP on nodes that don't currently
2626          * hold the given IP.
2627          */
2628         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2629         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2630
2631         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2632                                                        bool, nodemap->num);
2633         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2634         takeover_data->fail_callback = fail_callback;
2635         takeover_data->fail_callback_data = callback_data;
2636         takeover_data->nodemap = nodemap;
2637
2638         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2639         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2640
2641         async_data->fail_callback = takeover_run_fail_callback;
2642         async_data->callback_data = takeover_data;
2643
2644         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2645
2646         /* Send a RELEASE_IP to all nodes that should not be hosting
2647          * each IP.  For each IP, all but one of these will be
2648          * redundant.  However, the redundant ones are used to tell
2649          * nodes which node should be hosting the IP so that commands
2650          * like "ctdb ip" can display a particular nodes idea of who
2651          * is hosting what. */
2652         for (i=0;i<nodemap->num;i++) {
2653                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2654                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2655                         continue;
2656                 }
2657
2658                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2659                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2660                                 /* This node should be serving this
2661                                    vnn so don't tell it to release the ip
2662                                 */
2663                                 continue;
2664                         }
2665                         ip.pnn  = tmp_ip->pnn;
2666                         ip.addr = tmp_ip->addr;
2667
2668                         timeout = TAKEOVER_TIMEOUT();
2669                         data.dsize = sizeof(ip);
2670                         data.dptr  = (uint8_t *)&ip;
2671                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2672                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2673                                                   data, async_data,
2674                                                   &timeout, NULL);
2675                         if (state == NULL) {
2676                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2677                                 talloc_free(tmp_ctx);
2678                                 return -1;
2679                         }
2680
2681                         ctdb_client_async_add(async_data, state);
2682                 }
2683         }
2684         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2685                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2686                 talloc_free(tmp_ctx);
2687                 return -1;
2688         }
2689         talloc_free(async_data);
2690
2691
2692         /* For each IP, send a TAKOVER_IP to the node that should be
2693          * hosting it.  Many of these will often be redundant (since
2694          * the allocation won't have changed) but they can be useful
2695          * to recover from inconsistencies. */
2696         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2697         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2698
2699         async_data->fail_callback = fail_callback;
2700         async_data->callback_data = callback_data;
2701
2702         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2703                 if (tmp_ip->pnn == -1) {
2704                         /* this IP won't be taken over */
2705                         continue;
2706                 }
2707
2708                 ip.pnn  = tmp_ip->pnn;
2709                 ip.addr = tmp_ip->addr;
2710
2711                 timeout = TAKEOVER_TIMEOUT();
2712                 data.dsize = sizeof(ip);
2713                 data.dptr  = (uint8_t *)&ip;
2714                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2715                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2716                                           data, async_data, &timeout, NULL);
2717                 if (state == NULL) {
2718                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2719                         talloc_free(tmp_ctx);
2720                         return -1;
2721                 }
2722
2723                 ctdb_client_async_add(async_data, state);
2724         }
2725         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2726                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2727                 talloc_free(tmp_ctx);
2728                 return -1;
2729         }
2730
2731 ipreallocated:
2732         /*
2733          * Tell all nodes to run eventscripts to process the
2734          * "ipreallocated" event.  This can do a lot of things,
2735          * including restarting services to reconfigure them if public
2736          * IPs have moved.  Once upon a time this event only used to
2737          * update natgw.
2738          */
2739         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2740         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2741         iprealloc_data.retry_nodes = retry_data;
2742         iprealloc_data.retry_count = 0;
2743         iprealloc_data.fail_callback = fail_callback;
2744         iprealloc_data.fail_callback_data = callback_data;
2745         iprealloc_data.nodemap = nodemap;
2746
2747         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2748         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2749                                         nodes, 0, TAKEOVER_TIMEOUT(),
2750                                         false, tdb_null,
2751                                         NULL, iprealloc_fail_callback,
2752                                         &iprealloc_data);
2753         if (ret != 0) {
2754                 /* If the control failed then we should retry to any
2755                  * nodes flagged by iprealloc_fail_callback using the
2756                  * EVENTSCRIPT control.  This is a best-effort at
2757                  * backward compatiblity when running a mixed cluster
2758                  * where some nodes have not yet been upgraded to
2759                  * support the IPREALLOCATED control.
2760                  */
2761                 DEBUG(DEBUG_WARNING,
2762                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2763
2764                 nodes = talloc_array(tmp_ctx, uint32_t,
2765                                      iprealloc_data.retry_count);
2766                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2767
2768                 j = 0;
2769                 for (i=0; i<nodemap->num; i++) {
2770                         if (iprealloc_data.retry_nodes[i]) {
2771                                 nodes[j] = i;
2772                                 j++;
2773                         }
2774                 }
2775
2776                 data.dptr  = discard_const("ipreallocated");
2777                 data.dsize = strlen((char *)data.dptr) + 1; 
2778                 ret = ctdb_client_async_control(ctdb,
2779                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2780                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2781                                                 false, data,
2782                                                 NULL, fail_callback,
2783                                                 callback_data);
2784                 if (ret != 0) {
2785                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2786                 }
2787         }
2788
2789         talloc_free(tmp_ctx);
2790         return ret;
2791 }
2792
2793
2794 /*
2795   destroy a ctdb_client_ip structure
2796  */
2797 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2798 {
2799         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2800                 ctdb_addr_to_str(&ip->addr),
2801                 ntohs(ip->addr.ip.sin_port),
2802                 ip->client_id));
2803
2804         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2805         return 0;
2806 }
2807
2808 /*
2809   called by a client to inform us of a TCP connection that it is managing
2810   that should tickled with an ACK when IP takeover is done
2811  */
2812 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2813                                 TDB_DATA indata)
2814 {
2815         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2816         struct ctdb_connection *tcp_sock = NULL;
2817         struct ctdb_tcp_list *tcp;
2818         struct ctdb_connection t;
2819         int ret;
2820         TDB_DATA data;
2821         struct ctdb_client_ip *ip;
2822         struct ctdb_vnn *vnn;
2823         ctdb_sock_addr addr;
2824
2825         /* If we don't have public IPs, tickles are useless */
2826         if (ctdb->vnn == NULL) {
2827                 return 0;
2828         }
2829
2830         tcp_sock = (struct ctdb_connection *)indata.dptr;
2831
2832         addr = tcp_sock->src;
2833         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2834         addr = tcp_sock->dst;
2835         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2836
2837         ZERO_STRUCT(addr);
2838         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2839         vnn = find_public_ip_vnn(ctdb, &addr);
2840         if (vnn == NULL) {
2841                 switch (addr.sa.sa_family) {
2842                 case AF_INET:
2843                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2844                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2845                                         ctdb_addr_to_str(&addr)));
2846                         }
2847                         break;
2848                 case AF_INET6:
2849                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2850                                 ctdb_addr_to_str(&addr)));
2851                         break;
2852                 default:
2853                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2854                 }
2855
2856                 return 0;
2857         }
2858
2859         if (vnn->pnn != ctdb->pnn) {
2860                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2861                         ctdb_addr_to_str(&addr),
2862                         client_id, client->pid));
2863                 /* failing this call will tell smbd to die */
2864                 return -1;
2865         }
2866
2867         ip = talloc(client, struct ctdb_client_ip);
2868         CTDB_NO_MEMORY(ctdb, ip);
2869
2870         ip->ctdb      = ctdb;
2871         ip->addr      = addr;
2872         ip->client_id = client_id;
2873         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2874         DLIST_ADD(ctdb->client_ip_list, ip);
2875
2876         tcp = talloc(client, struct ctdb_tcp_list);
2877         CTDB_NO_MEMORY(ctdb, tcp);
2878
2879         tcp->connection.src = tcp_sock->src;
2880         tcp->connection.dst = tcp_sock->dst;
2881
2882         DLIST_ADD(client->tcp_list, tcp);
2883
2884         t.src = tcp_sock->src;
2885         t.dst = tcp_sock->dst;
2886
2887         data.dptr = (uint8_t *)&t;
2888         data.dsize = sizeof(t);
2889
2890         switch (addr.sa.sa_family) {
2891         case AF_INET:
2892                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2893                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2894                         ctdb_addr_to_str(&tcp_sock->src),
2895                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2896                 break;
2897         case AF_INET6:
2898                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2899                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2900                         ctdb_addr_to_str(&tcp_sock->src),
2901                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2902                 break;
2903         default:
2904                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2905         }
2906
2907
2908         /* tell all nodes about this tcp connection */
2909         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2910                                        CTDB_CONTROL_TCP_ADD,
2911                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2912         if (ret != 0) {
2913                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2914                 return -1;
2915         }
2916
2917         return 0;
2918 }
2919
2920 /*
2921   find a tcp address on a list
2922  */
2923 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2924                                            struct ctdb_connection *tcp)
2925 {
2926         int i;
2927
2928         if (array == NULL) {
2929                 return NULL;
2930         }
2931
2932         for (i=0;i<array->num;i++) {
2933                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2934                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2935                         return &array->connections[i];
2936                 }
2937         }
2938         return NULL;
2939 }
2940
2941
2942
2943 /*
2944   called by a daemon to inform us of a TCP connection that one of its
2945   clients managing that should tickled with an ACK when IP takeover is
2946   done
2947  */
2948 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2949 {
2950         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2951         struct ctdb_tcp_array *tcparray;
2952         struct ctdb_connection tcp;
2953         struct ctdb_vnn *vnn;
2954
2955         /* If we don't have public IPs, tickles are useless */
2956         if (ctdb->vnn == NULL) {
2957                 return 0;
2958         }
2959
2960         vnn = find_public_ip_vnn(ctdb, &p->dst);
2961         if (vnn == NULL) {
2962                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2963                         ctdb_addr_to_str(&p->dst)));
2964
2965                 return -1;
2966         }
2967
2968
2969         tcparray = vnn->tcp_array;
2970
2971         /* If this is the first tickle */
2972         if (tcparray == NULL) {
2973                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2974                 CTDB_NO_MEMORY(ctdb, tcparray);
2975                 vnn->tcp_array = tcparray;
2976
2977                 tcparray->num = 0;
2978                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2979                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2980
2981                 tcparray->connections[tcparray->num].src = p->src;
2982                 tcparray->connections[tcparray->num].dst = p->dst;
2983                 tcparray->num++;
2984
2985                 if (tcp_update_needed) {
2986                         vnn->tcp_update_needed = true;
2987                 }
2988                 return 0;
2989         }
2990
2991
2992         /* Do we already have this tickle ?*/
2993         tcp.src = p->src;
2994         tcp.dst = p->dst;
2995         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2996                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2997                         ctdb_addr_to_str(&tcp.dst),
2998                         ntohs(tcp.dst.ip.sin_port),
2999                         vnn->pnn));
3000                 return 0;
3001         }
3002
3003         /* A new tickle, we must add it to the array */
3004         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3005                                         struct ctdb_connection,
3006                                         tcparray->num+1);
3007         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3008
3009         tcparray->connections[tcparray->num].src = p->src;
3010         tcparray->connections[tcparray->num].dst = p->dst;
3011         tcparray->num++;
3012
3013         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3014                 ctdb_addr_to_str(&tcp.dst),
3015                 ntohs(tcp.dst.ip.sin_port),
3016                 vnn->pnn));
3017
3018         if (tcp_update_needed) {
3019                 vnn->tcp_update_needed = true;
3020         }
3021
3022         return 0;
3023 }
3024
3025
3026 /*
3027   called by a daemon to inform us of a TCP connection that one of its
3028   clients managing that should tickled with an ACK when IP takeover is
3029   done
3030  */
3031 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3032 {
3033         struct ctdb_connection *tcpp;
3034         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3035
3036         if (vnn == NULL) {
3037                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3038                         ctdb_addr_to_str(&conn->dst)));
3039                 return;
3040         }
3041
3042         /* if the array is empty we cant remove it
3043            and we don't need to do anything
3044          */
3045         if (vnn->tcp_array == NULL) {
3046                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3047                         ctdb_addr_to_str(&conn->dst),
3048                         ntohs(conn->dst.ip.sin_port)));
3049                 return;
3050         }
3051
3052
3053         /* See if we know this connection
3054            if we don't know this connection  then we dont need to do anything
3055          */
3056         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3057         if (tcpp == NULL) {
3058                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3059                         ctdb_addr_to_str(&conn->dst),
3060                         ntohs(conn->dst.ip.sin_port)));
3061                 return;
3062         }
3063
3064
3065         /* We need to remove this entry from the array.
3066            Instead of allocating a new array and copying data to it
3067            we cheat and just copy the last entry in the existing array
3068            to the entry that is to be removed and just shring the 
3069            ->num field
3070          */
3071         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3072         vnn->tcp_array->num--;
3073
3074         /* If we deleted the last entry we also need to remove the entire array
3075          */
3076         if (vnn->tcp_array->num == 0) {
3077                 talloc_free(vnn->tcp_array);
3078                 vnn->tcp_array = NULL;
3079         }               
3080
3081         vnn->tcp_update_needed = true;
3082
3083         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3084                 ctdb_addr_to_str(&conn->src),
3085                 ntohs(conn->src.ip.sin_port)));
3086 }
3087
3088
3089 /*
3090   called by a daemon to inform us of a TCP connection that one of its
3091   clients used are no longer needed in the tickle database
3092  */
3093 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3094 {
3095         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3096
3097         /* If we don't have public IPs, tickles are useless */
3098         if (ctdb->vnn == NULL) {
3099                 return 0;
3100         }
3101
3102         ctdb_remove_connection(ctdb, conn);
3103
3104         return 0;
3105 }
3106
3107
3108 /*
3109   Called when another daemon starts - causes all tickles for all
3110   public addresses we are serving to be sent to the new node on the
3111   next check.  This actually causes the next scheduled call to
3112   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3113   doesn't require careful error handling.
3114  */
3115 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3116 {
3117         struct ctdb_vnn *vnn;
3118
3119         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3120                            (unsigned long) pnn));
3121
3122         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3123                 vnn->tcp_update_needed = true;
3124         }
3125
3126         return 0;
3127 }
3128
3129
3130 /*
3131   called when a client structure goes away - hook to remove
3132   elements from the tcp_list in all daemons
3133  */
3134 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3135 {
3136         while (client->tcp_list) {
3137                 struct ctdb_tcp_list *tcp = client->tcp_list;
3138                 DLIST_REMOVE(client->tcp_list, tcp);
3139                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3140         }
3141 }
3142
3143
3144 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3145 {
3146         struct ctdb_vnn *vnn;
3147         int count = 0;
3148
3149         if (ctdb->tunable.disable_ip_failover == 1) {
3150                 return;
3151         }
3152
3153         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3154                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3155                         ctdb_vnn_unassign_iface(ctdb, vnn);
3156                         continue;
3157                 }
3158                 if (!vnn->iface) {
3159                         continue;
3160                 }
3161
3162                 /* Don't allow multiple releases at once.  Some code,
3163                  * particularly ctdb_tickle_sentenced_connections() is
3164                  * not re-entrant */
3165                 if (vnn->update_in_flight) {
3166                         DEBUG(DEBUG_WARNING,
3167                               (__location__
3168                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3169                                     ctdb_addr_to_str(&vnn->public_address),
3170                                     vnn->public_netmask_bits,
3171                                     ctdb_vnn_iface_string(vnn)));
3172                         continue;
3173                 }
3174                 vnn->update_in_flight = true;
3175
3176                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3177                                     ctdb_addr_to_str(&vnn->public_address),
3178                                     vnn->public_netmask_bits,
3179                                     ctdb_vnn_iface_string(vnn)));
3180
3181                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3182                                   ctdb_vnn_iface_string(vnn),
3183                                   ctdb_addr_to_str(&vnn->public_address),
3184                                   vnn->public_netmask_bits);
3185                 release_kill_clients(ctdb, &vnn->public_address);
3186                 ctdb_vnn_unassign_iface(ctdb, vnn);
3187                 vnn->update_in_flight = false;
3188                 count++;
3189         }
3190
3191         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3192 }
3193
3194
3195 /*
3196   get list of public IPs
3197  */
3198 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3199                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3200 {
3201         int i, num, len;
3202         struct ctdb_public_ip_list_old *ips;
3203         struct ctdb_vnn *vnn;
3204         bool only_available = false;
3205
3206         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3207                 only_available = true;
3208         }
3209
3210         /* count how many public ip structures we have */
3211         num = 0;
3212         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3213                 num++;
3214         }
3215
3216         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3217                 num*sizeof(struct ctdb_public_ip);
3218         ips = talloc_zero_size(outdata, len);
3219         CTDB_NO_MEMORY(ctdb, ips);
3220
3221         i = 0;
3222         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3223                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3224                         continue;
3225                 }
3226                 ips->ips[i].pnn  = vnn->pnn;
3227                 ips->ips[i].addr = vnn->public_address;
3228                 i++;
3229         }
3230         ips->num = i;
3231         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3232                 i*sizeof(struct ctdb_public_ip);
3233
3234         outdata->dsize = len;
3235         outdata->dptr  = (uint8_t *)ips;
3236
3237         return 0;
3238 }
3239
3240
3241 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3242                                         struct ctdb_req_control_old *c,
3243                                         TDB_DATA indata,
3244                                         TDB_DATA *outdata)
3245 {
3246         int i, num, len;
3247         ctdb_sock_addr *addr;
3248         struct ctdb_public_ip_info_old *info;
3249         struct ctdb_vnn *vnn;
3250
3251         addr = (ctdb_sock_addr *)indata.dptr;
3252
3253         vnn = find_public_ip_vnn(ctdb, addr);
3254         if (vnn == NULL) {
3255                 /* if it is not a public ip   it could be our 'single ip' */
3256                 if (ctdb->single_ip_vnn) {
3257                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3258                                 vnn = ctdb->single_ip_vnn;
3259                         }
3260                 }
3261         }
3262         if (vnn == NULL) {
3263                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3264                                  "'%s'not a public address\n",
3265                                  ctdb_addr_to_str(addr)));
3266                 return -1;
3267         }
3268
3269         /* count how many public ip structures we have */
3270         num = 0;
3271         for (;vnn->ifaces[num];) {
3272                 num++;
3273         }
3274
3275         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3276                 num*sizeof(struct ctdb_iface);
3277         info = talloc_zero_size(outdata, len);
3278         CTDB_NO_MEMORY(ctdb, info);
3279
3280         info->ip.addr = vnn->public_address;
3281         info->ip.pnn = vnn->pnn;
3282         info->active_idx = 0xFFFFFFFF;
3283
3284         for (i=0; vnn->ifaces[i]; i++) {
3285                 struct ctdb_interface *cur;
3286
3287                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3288                 if (cur == NULL) {
3289                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3290                                            vnn->ifaces[i]));
3291                         return -1;
3292                 }
3293                 if (vnn->iface == cur) {
3294                         info->active_idx = i;
3295                 }
3296                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3297                 info->ifaces[i].link_state = cur->link_up;
3298                 info->ifaces[i].references = cur->references;
3299         }
3300         info->num = i;
3301         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3302                 i*sizeof(struct ctdb_iface);
3303
3304         outdata->dsize = len;
3305         outdata->dptr  = (uint8_t *)info;
3306
3307         return 0;
3308 }
3309
3310 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3311                                 struct ctdb_req_control_old *c,
3312                                 TDB_DATA *outdata)
3313 {
3314         int i, num, len;
3315         struct ctdb_iface_list_old *ifaces;
3316         struct ctdb_interface *cur;
3317
3318         /* count how many public ip structures we have */
3319         num = 0;
3320         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3321                 num++;
3322         }
3323
3324         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3325                 num*sizeof(struct ctdb_iface);
3326         ifaces = talloc_zero_size(outdata, len);
3327         CTDB_NO_MEMORY(ctdb, ifaces);
3328
3329         i = 0;
3330         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3331                 strcpy(ifaces->ifaces[i].name, cur->name);
3332                 ifaces->ifaces[i].link_state = cur->link_up;
3333                 ifaces->ifaces[i].references = cur->references;
3334                 i++;
3335         }
3336         ifaces->num = i;
3337         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3338                 i*sizeof(struct ctdb_iface);
3339
3340         outdata->dsize = len;
3341         outdata->dptr  = (uint8_t *)ifaces;
3342
3343         return 0;
3344 }
3345
3346 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3347                                     struct ctdb_req_control_old *c,
3348                                     TDB_DATA indata)
3349 {
3350         struct ctdb_iface *info;
3351         struct ctdb_interface *iface;
3352         bool link_up = false;
3353
3354         info = (struct ctdb_iface *)indata.dptr;
3355
3356         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3357                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3358                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3359                                   len, len, info->name));
3360                 return -1;
3361         }
3362
3363         switch (info->link_state) {
3364         case 0:
3365                 link_up = false;
3366                 break;
3367         case 1:
3368                 link_up = true;
3369                 break;
3370         default:
3371                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3372                                   (unsigned int)info->link_state));
3373                 return -1;
3374         }
3375
3376         if (info->references != 0) {
3377                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3378                                   (unsigned int)info->references));
3379                 return -1;
3380         }
3381
3382         iface = ctdb_find_iface(ctdb, info->name);
3383         if (iface == NULL) {
3384                 return -1;
3385         }
3386
3387         if (link_up == iface->link_up) {
3388                 return 0;
3389         }
3390
3391         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3392               ("iface[%s] has changed it's link status %s => %s\n",
3393                iface->name,
3394                iface->link_up?"up":"down",
3395                link_up?"up":"down"));
3396
3397         iface->link_up = link_up;
3398         return 0;
3399 }
3400
3401
3402 /* 
3403    structure containing the listening socket and the list of tcp connections
3404    that the ctdb daemon is to kill
3405 */
3406 struct ctdb_kill_tcp {
3407         struct ctdb_vnn *vnn;
3408         struct ctdb_context *ctdb;
3409         int capture_fd;
3410         struct tevent_fd *fde;
3411         trbt_tree_t *connections;
3412         void *private_data;
3413 };
3414
3415 /*
3416   a tcp connection that is to be killed
3417  */
3418 struct ctdb_killtcp_con {
3419         ctdb_sock_addr src_addr;
3420         ctdb_sock_addr dst_addr;
3421         int count;
3422         struct ctdb_kill_tcp *killtcp;
3423 };
3424
3425 /* this function is used to create a key to represent this socketpair
3426    in the killtcp tree.
3427    this key is used to insert and lookup matching socketpairs that are
3428    to be tickled and RST
3429 */
3430 #define KILLTCP_KEYLEN  10
3431 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3432 {
3433         static uint32_t key[KILLTCP_KEYLEN];
3434
3435         bzero(key, sizeof(key));
3436
3437         if (src->sa.sa_family != dst->sa.sa_family) {
3438                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3439                 return key;
3440         }
3441         
3442         switch (src->sa.sa_family) {
3443         case AF_INET:
3444                 key[0]  = dst->ip.sin_addr.s_addr;
3445                 key[1]  = src->ip.sin_addr.s_addr;
3446                 key[2]  = dst->ip.sin_port;
3447                 key[3]  = src->ip.sin_port;
3448                 break;
3449         case AF_INET6: {
3450                 uint32_t *dst6_addr32 =
3451                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3452                 uint32_t *src6_addr32 =
3453                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3454                 key[0]  = dst6_addr32[3];
3455                 key[1]  = src6_addr32[3];
3456                 key[2]  = dst6_addr32[2];
3457                 key[3]  = src6_addr32[2];
3458                 key[4]  = dst6_addr32[1];
3459                 key[5]  = src6_addr32[1];
3460                 key[6]  = dst6_addr32[0];
3461                 key[7]  = src6_addr32[0];
3462                 key[8]  = dst->ip6.sin6_port;
3463                 key[9]  = src->ip6.sin6_port;
3464                 break;
3465         }
3466         default:
3467                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3468                 return key;
3469         }
3470
3471         return key;
3472 }
3473
3474 /*
3475   called when we get a read event on the raw socket
3476  */
3477 static void capture_tcp_handler(struct tevent_context *ev,
3478                                 struct tevent_fd *fde,
3479                                 uint16_t flags, void *private_data)
3480 {
3481         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3482         struct ctdb_killtcp_con *con;
3483         ctdb_sock_addr src, dst;
3484         uint32_t ack_seq, seq;
3485
3486         if (!(flags & TEVENT_FD_READ)) {
3487                 return;
3488         }
3489
3490         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3491                                 killtcp->private_data,
3492                                 &src, &dst,
3493                                 &ack_seq, &seq) != 0) {
3494                 /* probably a non-tcp ACK packet */
3495                 return;
3496         }
3497
3498         /* check if we have this guy in our list of connections
3499            to kill
3500         */
3501         con = trbt_lookuparray32(killtcp->connections, 
3502                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3503         if (con == NULL) {
3504                 /* no this was some other packet we can just ignore */
3505                 return;
3506         }
3507
3508         /* This one has been tickled !
3509            now reset him and remove him from the list.
3510          */
3511         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3512                 ntohs(con->dst_addr.ip.sin_port),
3513                 ctdb_addr_to_str(&con->src_addr),
3514                 ntohs(con->src_addr.ip.sin_port)));
3515
3516         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3517         talloc_free(con);
3518 }
3519
3520
3521 /* when traversing the list of all tcp connections to send tickle acks to
3522    (so that we can capture the ack coming back and kill the connection
3523     by a RST)
3524    this callback is called for each connection we are currently trying to kill
3525 */
3526 static int tickle_connection_traverse(void *param, void *data)
3527 {
3528         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3529
3530         /* have tried too many times, just give up */
3531         if (con->count >= 5) {
3532                 /* can't delete in traverse: reparent to delete_cons */
3533                 talloc_steal(param, con);
3534                 return 0;
3535         }
3536
3537         /* othervise, try tickling it again */
3538         con->count++;
3539         ctdb_sys_send_tcp(
3540                 (ctdb_sock_addr *)&con->dst_addr,
3541                 (ctdb_sock_addr *)&con->src_addr,
3542                 0, 0, 0);
3543         return 0;
3544 }
3545
3546
3547 /* 
3548    called every second until all sentenced connections have been reset
3549  */
3550 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3551                                               struct tevent_timer *te,
3552                                               struct timeval t, void *private_data)
3553 {
3554         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3555         void *delete_cons = talloc_new(NULL);
3556
3557         /* loop over all connections sending tickle ACKs */
3558         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3559
3560         /* now we've finished traverse, it's safe to do deletion. */
3561         talloc_free(delete_cons);
3562
3563         /* If there are no more connections to kill we can remove the
3564            entire killtcp structure
3565          */
3566         if ( (killtcp->connections == NULL) || 
3567              (killtcp->connections->root == NULL) ) {
3568                 talloc_free(killtcp);
3569                 return;
3570         }
3571
3572         /* try tickling them again in a seconds time
3573          */
3574         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3575                          timeval_current_ofs(1, 0),
3576                          ctdb_tickle_sentenced_connections, killtcp);
3577 }
3578
3579 /*
3580   destroy the killtcp structure
3581  */
3582 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3583 {
3584         struct ctdb_vnn *tmpvnn;
3585
3586         /* verify that this vnn is still active */
3587         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3588                 if (tmpvnn == killtcp->vnn) {
3589                         break;
3590                 }
3591         }
3592
3593         if (tmpvnn == NULL) {
3594                 return 0;
3595         }
3596
3597         if (killtcp->vnn->killtcp != killtcp) {
3598                 return 0;
3599         }
3600
3601         killtcp->vnn->killtcp = NULL;
3602
3603         return 0;
3604 }
3605
3606
/*
  Insert callback for the killtcp tree: unconditionally replace any
  existing connection structure with the new one.

  parm is the new entry, data the entry already stored under the same
  key (if any).  The old entry is deliberately not freed here; per the
  original note it is talloc_stolen by the same tree node and goes
  away when the new data is deleted.

  Returns parm.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;

        return replacement;
}
3618
3619 /*
3620   add a tcp socket to the list of connections we want to RST
3621  */
3622 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3623                                        ctdb_sock_addr *s,
3624                                        ctdb_sock_addr *d)
3625 {
3626         ctdb_sock_addr src, dst;
3627         struct ctdb_kill_tcp *killtcp;
3628         struct ctdb_killtcp_con *con;
3629         struct ctdb_vnn *vnn;
3630
3631         ctdb_canonicalize_ip(s, &src);
3632         ctdb_canonicalize_ip(d, &dst);
3633
3634         vnn = find_public_ip_vnn(ctdb, &dst);
3635         if (vnn == NULL) {
3636                 vnn = find_public_ip_vnn(ctdb, &src);
3637         }
3638         if (vnn == NULL) {
3639                 /* if it is not a public ip   it could be our 'single ip' */
3640                 if (ctdb->single_ip_vnn) {
3641                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3642                                 vnn = ctdb->single_ip_vnn;
3643                         }
3644                 }
3645         }
3646         if (vnn == NULL) {
3647                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3648                 return -1;
3649         }
3650
3651         killtcp = vnn->killtcp;
3652         
3653         /* If this is the first connection to kill we must allocate
3654            a new structure
3655          */
3656         if (killtcp == NULL) {
3657                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3658                 CTDB_NO_MEMORY(ctdb, killtcp);
3659
3660                 killtcp->vnn         = vnn;
3661                 killtcp->ctdb        = ctdb;
3662                 killtcp->capture_fd  = -1;
3663                 killtcp->connections = trbt_create(killtcp, 0);
3664
3665                 vnn->killtcp         = killtcp;
3666                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3667         }
3668
3669
3670
3671         /* create a structure that describes this connection we want to
3672            RST and store it in killtcp->connections
3673         */
3674         con = talloc(killtcp, struct ctdb_killtcp_con);
3675         CTDB_NO_MEMORY(ctdb, con);
3676         con->src_addr = src;
3677         con->dst_addr = dst;
3678         con->count    = 0;
3679         con->killtcp  = killtcp;
3680
3681
3682         trbt_insertarray32_callback(killtcp->connections,
3683                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3684                         add_killtcp_callback, con);
3685
3686         /* 
3687            If we don't have a socket to listen on yet we must create it
3688          */
3689         if (killtcp->capture_fd == -1) {
3690                 const char *iface = ctdb_vnn_iface_string(vnn);
3691                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3692                 if (killtcp->capture_fd == -1) {
3693                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3694                                           "socket on iface '%s' for killtcp (%s)\n",
3695                                           iface, strerror(errno)));
3696                         goto failed;
3697                 }
3698         }
3699
3700
3701         if (killtcp->fde == NULL) {
3702                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3703                                              killtcp->capture_fd,
3704                                              TEVENT_FD_READ,
3705                                              capture_tcp_handler, killtcp);
3706                 tevent_fd_set_auto_close(killtcp->fde);
3707
3708                 /* We also need to set up some events to tickle all these connections
3709                    until they are all reset
3710                 */
3711                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3712                                  ctdb_tickle_sentenced_connections, killtcp);
3713         }
3714
3715         /* tickle him once now */
3716         ctdb_sys_send_tcp(
3717                 &con->dst_addr,
3718                 &con->src_addr,
3719                 0, 0, 0);
3720
3721         return 0;
3722
3723 failed:
3724         talloc_free(vnn->killtcp);
3725         vnn->killtcp = NULL;
3726         return -1;
3727 }
3728
3729 /*
3730   kill a TCP connection.
3731  */
3732 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3733 {
3734         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3735
3736         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3737 }
3738
3739 /*
3740   called by a daemon to inform us of the entire list of TCP tickles for
3741   a particular public address.
3742   this control should only be sent by the node that is currently serving
3743   that public address.
3744  */
3745 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3746 {
3747         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3748         struct ctdb_tcp_array *tcparray;
3749         struct ctdb_vnn *vnn;
3750
3751         /* We must at least have tickles.num or else we cant verify the size
3752            of the received data blob
3753          */
3754         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3755                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3756                 return -1;
3757         }
3758
3759         /* verify that the size of data matches what we expect */
3760         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3761                          + sizeof(struct ctdb_connection) * list->num) {
3762                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3763                 return -1;
3764         }
3765
3766         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3767                            ctdb_addr_to_str(&list->addr)));
3768
3769         vnn = find_public_ip_vnn(ctdb, &list->addr);
3770         if (vnn == NULL) {
3771                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3772                         ctdb_addr_to_str(&list->addr)));
3773
3774                 return 1;
3775         }
3776
3777         /* remove any old ticklelist we might have */
3778         talloc_free(vnn->tcp_array);
3779         vnn->tcp_array = NULL;
3780
3781         tcparray = talloc(vnn, struct ctdb_tcp_array);
3782         CTDB_NO_MEMORY(ctdb, tcparray);
3783
3784         tcparray->num = list->num;
3785
3786         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3787         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3788
3789         memcpy(tcparray->connections, &list->connections[0],
3790                sizeof(struct ctdb_connection)*tcparray->num);
3791
3792         /* We now have a new fresh tickle list array for this vnn */
3793         vnn->tcp_array = tcparray;
3794
3795         return 0;
3796 }
3797
3798 /*
3799   called to return the full list of tickles for the puclic address associated 
3800   with the provided vnn
3801  */
3802 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3803 {
3804         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3805         struct ctdb_tickle_list_old *list;
3806         struct ctdb_tcp_array *tcparray;
3807         int num;
3808         struct ctdb_vnn *vnn;
3809
3810         vnn = find_public_ip_vnn(ctdb, addr);
3811         if (vnn == NULL) {
3812                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3813                         ctdb_addr_to_str(addr)));
3814
3815                 return 1;
3816         }
3817
3818         tcparray = vnn->tcp_array;
3819         if (tcparray) {
3820                 num = tcparray->num;
3821         } else {
3822                 num = 0;
3823         }
3824
3825         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3826                         + sizeof(struct ctdb_connection) * num;
3827
3828         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3829         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3830         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3831
3832         list->addr = *addr;
3833         list->num = num;
3834         if (num) {
3835                 memcpy(&list->connections[0], tcparray->connections,
3836                         sizeof(struct ctdb_connection) * num);
3837         }
3838
3839         return 0;
3840 }
3841
3842
3843 /*
3844   set the list of all tcp tickles for a public address
3845  */
3846 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3847                                             ctdb_sock_addr *addr,
3848                                             struct ctdb_tcp_array *tcparray)
3849 {
3850         int ret, num;
3851         TDB_DATA data;
3852         struct ctdb_tickle_list_old *list;
3853
3854         if (tcparray) {
3855                 num = tcparray->num;
3856         } else {
3857                 num = 0;
3858         }
3859
3860         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3861                         sizeof(struct ctdb_connection) * num;
3862         data.dptr = talloc_size(ctdb, data.dsize);
3863         CTDB_NO_MEMORY(ctdb, data.dptr);
3864
3865         list = (struct ctdb_tickle_list_old *)data.dptr;
3866         list->addr = *addr;
3867         list->num = num;
3868         if (tcparray) {
3869                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3870         }
3871
3872         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3873                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3874                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3875         if (ret != 0) {
3876                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3877                 return -1;
3878         }
3879
3880         talloc_free(data.dptr);
3881
3882         return ret;
3883 }
3884
3885
3886 /*
3887   perform tickle updates if required
3888  */
3889 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3890                                     struct tevent_timer *te,
3891                                     struct timeval t, void *private_data)
3892 {
3893         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3894         int ret;
3895         struct ctdb_vnn *vnn;
3896
3897         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3898                 /* we only send out updates for public addresses that 
3899                    we have taken over
3900                  */
3901                 if (ctdb->pnn != vnn->pnn) {
3902                         continue;
3903                 }
3904                 /* We only send out the updates if we need to */
3905                 if (!vnn->tcp_update_needed) {
3906                         continue;
3907                 }
3908                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3909                                                        &vnn->public_address,
3910                                                        vnn->tcp_array);
3911                 if (ret != 0) {
3912                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3913                                 ctdb_addr_to_str(&vnn->public_address)));
3914                 } else {
3915                         DEBUG(DEBUG_INFO,
3916                               ("Sent tickle update for public address %s\n",
3917                                ctdb_addr_to_str(&vnn->public_address)));
3918                         vnn->tcp_update_needed = false;
3919                 }
3920         }
3921
3922         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3923                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3924                          ctdb_update_tcp_tickles, ctdb);
3925 }
3926
3927 /*
3928   start periodic update of tcp tickles
3929  */
3930 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3931 {
3932         ctdb->tickle_update_context = talloc_new(ctdb);
3933
3934         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3935                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3936                          ctdb_update_tcp_tickles, ctdb);
3937 }
3938
3939
3940
3941
3942 struct control_gratious_arp {
3943         struct ctdb_context *ctdb;
3944         ctdb_sock_addr addr;
3945         const char *iface;
3946         int count;
3947 };
3948
3949 /*
3950   send a control_gratuitous arp
3951  */
3952 static void send_gratious_arp(struct tevent_context *ev,
3953                               struct tevent_timer *te,
3954                               struct timeval t, void *private_data)
3955 {
3956         int ret;
3957         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3958                                                         struct control_gratious_arp);
3959
3960         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3961         if (ret != 0) {
3962                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3963                                  arp->iface, strerror(errno)));
3964         }
3965
3966
3967         arp->count++;
3968         if (arp->count == CTDB_ARP_REPEAT) {
3969                 talloc_free(arp);
3970                 return;
3971         }
3972
3973         tevent_add_timer(arp->ctdb->ev, arp,
3974                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3975                          send_gratious_arp, arp);
3976 }
3977
3978
3979 /*
3980   send a gratious arp 
3981  */
3982 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3983 {
3984         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3985         struct control_gratious_arp *arp;
3986
3987         /* verify the size of indata */
3988         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3989                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3990                                  (unsigned)indata.dsize, 
3991                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3992                 return -1;
3993         }
3994         if (indata.dsize != 
3995                 ( offsetof(struct ctdb_addr_info_old, iface)
3996                 + gratious_arp->len ) ){
3997
3998                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3999                         "but should be %u bytes\n", 
4000                          (unsigned)indata.dsize, 
4001                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4002                 return -1;
4003         }
4004
4005
4006         arp = talloc(ctdb, struct control_gratious_arp);
4007         CTDB_NO_MEMORY(ctdb, arp);
4008
4009         arp->ctdb  = ctdb;
4010         arp->addr   = gratious_arp->addr;
4011         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4012         CTDB_NO_MEMORY(ctdb, arp->iface);
4013         arp->count = 0;
4014
4015         tevent_add_timer(arp->ctdb->ev, arp,
4016                          timeval_zero(), send_gratious_arp, arp);
4017
4018         return 0;
4019 }
4020
4021 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4022 {
4023         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4024         int ret;
4025
4026         /* verify the size of indata */
4027         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4028                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4029                 return -1;
4030         }
4031         if (indata.dsize != 
4032                 ( offsetof(struct ctdb_addr_info_old, iface)
4033                 + pub->len ) ){
4034
4035                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4036                         "but should be %u bytes\n", 
4037                          (unsigned)indata.dsize, 
4038                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4039                 return -1;
4040         }
4041
4042         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4043
4044         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4045
4046         if (ret != 0) {
4047                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4048                 return -1;
4049         }
4050
4051         return 0;
4052 }
4053
/*
  Private data for delete_ip_callback(): carries the del_public_address
  control that still needs a reply once the release has finished.
 */
struct delete_ip_callback_state {
        /* request to answer; stolen onto this state once the release
         * control has been sent */
        struct ctdb_req_control_old *c;
};
4057
4058 /*
4059   called when releaseip event finishes for del_public_address
4060  */
4061 static void delete_ip_callback(struct ctdb_context *ctdb,
4062                                int32_t status, TDB_DATA data,
4063                                const char *errormsg,
4064                                void *private_data)
4065 {
4066         struct delete_ip_callback_state *state =
4067                 talloc_get_type(private_data, struct delete_ip_callback_state);
4068
4069         /* If release failed then fail. */
4070         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4071         talloc_free(private_data);
4072 }
4073
4074 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4075                                         struct ctdb_req_control_old *c,
4076                                         TDB_DATA indata, bool *async_reply)
4077 {
4078         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4079         struct ctdb_vnn *vnn;
4080
4081         /* verify the size of indata */
4082         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4083                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4084                 return -1;
4085         }
4086         if (indata.dsize != 
4087                 ( offsetof(struct ctdb_addr_info_old, iface)
4088                 + pub->len ) ){
4089
4090                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4091                         "but should be %u bytes\n", 
4092                          (unsigned)indata.dsize, 
4093                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4094                 return -1;
4095         }
4096
4097         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4098
4099         /* walk over all public addresses until we find a match */
4100         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4101                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4102                         if (vnn->pnn == ctdb->pnn) {
4103                                 struct delete_ip_callback_state *state;
4104                                 struct ctdb_public_ip *ip;
4105                                 TDB_DATA data;
4106                                 int ret;
4107
4108                                 vnn->delete_pending = true;
4109
4110                                 state = talloc(ctdb,
4111                                                struct delete_ip_callback_state);
4112                                 CTDB_NO_MEMORY(ctdb, state);
4113                                 state->c = c;
4114
4115                                 ip = talloc(state, struct ctdb_public_ip);
4116                                 if (ip == NULL) {
4117                                         DEBUG(DEBUG_ERR,
4118                                               (__location__ " Out of memory\n"));
4119                                         talloc_free(state);
4120                                         return -1;
4121                                 }
4122                                 ip->pnn = -1;
4123                                 ip->addr = pub->addr;
4124
4125                                 data.dsize = sizeof(struct ctdb_public_ip);
4126                                 data.dptr = (unsigned char *)ip;
4127
4128                                 ret = ctdb_daemon_send_control(ctdb,
4129                                                                ctdb_get_pnn(ctdb),
4130                                                                0,
4131                                                                CTDB_CONTROL_RELEASE_IP,
4132                                                                0, 0,
4133                                                                data,
4134                                                                delete_ip_callback,
4135                                                                state);
4136                                 if (ret == -1) {
4137                                         DEBUG(DEBUG_ERR,
4138                                               (__location__ "Unable to send "
4139                                                "CTDB_CONTROL_RELEASE_IP\n"));
4140                                         talloc_free(state);
4141                                         return -1;
4142                                 }
4143
4144                                 state->c = talloc_steal(state, c);
4145                                 *async_reply = true;
4146                         } else {
4147                                 /* This IP is not hosted on the
4148                                  * current node so just delete it
4149                                  * now. */
4150                                 do_delete_ip(ctdb, vnn);
4151                         }
4152
4153                         return 0;
4154                 }
4155         }
4156
4157         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4158                          ctdb_addr_to_str(&pub->addr)));
4159         return -1;
4160 }
4161
4162
/*
  Private data for ctdb_ipreallocated_callback(): carries the control
  that is answered once the "ipreallocated" event has finished.
 */
struct ipreallocated_callback_state {
        /* request to answer; set after the event has been scheduled */
        struct ctdb_req_control_old *c;
};
4166
4167 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4168                                         int status, void *p)
4169 {
4170         struct ipreallocated_callback_state *state =
4171                 talloc_get_type(p, struct ipreallocated_callback_state);
4172
4173         if (status != 0) {
4174                 DEBUG(DEBUG_ERR,
4175                       (" \"ipreallocated\" event script failed (status %d)\n",
4176                        status));
4177                 if (status == -ETIME) {
4178                         ctdb_ban_self(ctdb);
4179                 }
4180         }
4181
4182         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4183         talloc_free(state);
4184 }
4185
4186 /* A control to run the ipreallocated event */
4187 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4188                                    struct ctdb_req_control_old *c,
4189                                    bool *async_reply)
4190 {
4191         int ret;
4192         struct ipreallocated_callback_state *state;
4193
4194         state = talloc(ctdb, struct ipreallocated_callback_state);
4195         CTDB_NO_MEMORY(ctdb, state);
4196
4197         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4198
4199         ret = ctdb_event_script_callback(ctdb, state,
4200                                          ctdb_ipreallocated_callback, state,
4201                                          CTDB_EVENT_IPREALLOCATED,
4202                                          "%s", "");
4203
4204         if (ret != 0) {
4205                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4206                 talloc_free(state);
4207                 return -1;
4208         }
4209
4210         /* tell the control that we will be reply asynchronously */
4211         state->c    = talloc_steal(state, c);
4212         *async_reply = true;
4213
4214         return 0;
4215 }
4216
4217
4218 /* This function is called from the recovery daemon to verify that a remote
4219    node has the expected ip allocation.
4220    This is verified against ctdb->ip_tree
4221 */
4222 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4223                                 struct ctdb_public_ip_list_old *ips,
4224                                 uint32_t pnn)
4225 {
4226         struct public_ip_list *tmp_ip;
4227         int i;
4228
4229         if (ctdb->ip_tree == NULL) {
4230                 /* don't know the expected allocation yet, assume remote node
4231                    is correct. */
4232                 return 0;
4233         }
4234
4235         if (ips == NULL) {
4236                 return 0;
4237         }
4238
4239         for (i=0; i<ips->num; i++) {
4240                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4241                 if (tmp_ip == NULL) {
4242                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4243                         return -1;
4244                 }
4245
4246                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4247                         continue;
4248                 }
4249
4250                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4251                         DEBUG(DEBUG_ERR,
4252                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4253                                pnn,
4254                                ctdb_addr_to_str(&ips->ips[i].addr),
4255                                ips->ips[i].pnn, tmp_ip->pnn));
4256                         return -1;
4257                 }
4258         }
4259
4260         return 0;
4261 }
4262
4263 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4264 {
4265         struct public_ip_list *tmp_ip;
4266
4267         /* IP tree is never built if DisableIPFailover is set */
4268         if (ctdb->tunable.disable_ip_failover != 0) {
4269                 return 0;
4270         }
4271
4272         if (ctdb->ip_tree == NULL) {
4273                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4274                 return -1;
4275         }
4276
4277         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4278         if (tmp_ip == NULL) {
4279                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4280                 return -1;
4281         }
4282
4283         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4284         tmp_ip->pnn = ip->pnn;
4285
4286         return 0;
4287 }
4288
/*
  Discard the expected IP allocation held in ctdb->ip_tree.  Other code
  in this file treats ctdb->ip_tree == NULL as "no expected allocation
  known yet".
 */
void clear_ip_assignment_tree(struct ctdb_context *ctdb)
{
        TALLOC_FREE(ctdb->ip_tree);
}
4293
/*
  Tracks one in-flight "reload public IPs" request: the forked child
  doing the work, the pipe it reports on, and the control to answer.
  Freeing the handle runs ctdb_reloadips_destructor().
 */
struct ctdb_reloadips_handle {
        /* daemon context */
        struct ctdb_context *ctdb;
        /* control to reply to when the handle is destroyed; NULL if none */
        struct ctdb_req_control_old *c;
        /* status sent in the reply; starts at -1, set from the child's result */
        int status;
        /* pipe: the child writes its result byte to fd[1], the parent reads fd[0] */
        int fd[2];
        /* pid of the forked child */
        pid_t child;
        /* read event on fd[0] in the parent */
        struct tevent_fd *fde;
};
4302
4303 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4304 {
4305         if (h == h->ctdb->reload_ips) {
4306                 h->ctdb->reload_ips = NULL;
4307         }
4308         if (h->c != NULL) {
4309                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4310                 h->c = NULL;
4311         }
4312         ctdb_kill(h->ctdb, h->child, SIGKILL);
4313         return 0;
4314 }
4315
4316 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4317                                          struct tevent_timer *te,
4318                                          struct timeval t, void *private_data)
4319 {
4320         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4321
4322         talloc_free(h);
4323 }
4324
4325 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4326                                          struct tevent_fd *fde,
4327                                          uint16_t flags, void *private_data)
4328 {
4329         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4330
4331         char res;
4332         int ret;
4333
4334         ret = sys_read(h->fd[0], &res, 1);
4335         if (ret < 1 || res != 0) {
4336                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4337                 res = 1;
4338         }
4339         h->status = res;
4340
4341         talloc_free(h);
4342 }
4343
4344 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4345 {
4346         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4347         struct ctdb_public_ip_list_old *ips;
4348         struct ctdb_vnn *vnn;
4349         struct client_async_data *async_data;
4350         struct timeval timeout;
4351         TDB_DATA data;
4352         struct ctdb_client_control_state *state;
4353         bool first_add;
4354         int i, ret;
4355
4356         CTDB_NO_MEMORY(ctdb, mem_ctx);
4357
4358         /* Read IPs from local node */
4359         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4360                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4361         if (ret != 0) {
4362                 DEBUG(DEBUG_ERR,
4363                       ("Unable to fetch public IPs from local node\n"));
4364                 talloc_free(mem_ctx);
4365                 return -1;
4366         }
4367
4368         /* Read IPs file - this is safe since this is a child process */
4369         ctdb->vnn = NULL;
4370         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4371                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4372                 talloc_free(mem_ctx);
4373                 return -1;
4374         }
4375
4376         async_data = talloc_zero(mem_ctx, struct client_async_data);
4377         CTDB_NO_MEMORY(ctdb, async_data);
4378
4379         /* Compare IPs between node and file for IPs to be deleted */
4380         for (i = 0; i < ips->num; i++) {
4381                 /* */
4382                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4383                         if (ctdb_same_ip(&vnn->public_address,
4384                                          &ips->ips[i].addr)) {
4385                                 /* IP is still in file */
4386                                 break;
4387                         }
4388                 }
4389
4390                 if (vnn == NULL) {
4391                         /* Delete IP ips->ips[i] */
4392                         struct ctdb_addr_info_old *pub;
4393
4394                         DEBUG(DEBUG_NOTICE,
4395                               ("IP %s no longer configured, deleting it\n",
4396                                ctdb_addr_to_str(&ips->ips[i].addr)));
4397
4398                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4399                         CTDB_NO_MEMORY(ctdb, pub);
4400
4401                         pub->addr  = ips->ips[i].addr;
4402                         pub->mask  = 0;
4403                         pub->len   = 0;
4404
4405                         timeout = TAKEOVER_TIMEOUT();
4406
4407                         data.dsize = offsetof(struct ctdb_addr_info_old,
4408                                               iface) + pub->len;
4409                         data.dptr = (uint8_t *)pub;
4410
4411                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4412                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4413                                                   0, data, async_data,
4414                                                   &timeout, NULL);
4415                         if (state == NULL) {
4416                                 DEBUG(DEBUG_ERR,
4417                                       (__location__
4418                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4419                                 goto failed;
4420                         }
4421
4422                         ctdb_client_async_add(async_data, state);
4423                 }
4424         }
4425
4426         /* Compare IPs between node and file for IPs to be added */
4427         first_add = true;
4428         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4429                 for (i = 0; i < ips->num; i++) {
4430                         if (ctdb_same_ip(&vnn->public_address,
4431                                          &ips->ips[i].addr)) {
4432                                 /* IP already on node */
4433                                 break;
4434                         }
4435                 }
4436                 if (i == ips->num) {
4437                         /* Add IP ips->ips[i] */
4438                         struct ctdb_addr_info_old *pub;
4439                         const char *ifaces = NULL;
4440                         uint32_t len;
4441                         int iface = 0;
4442
4443                         DEBUG(DEBUG_NOTICE,
4444                               ("New IP %s configured, adding it\n",
4445                                ctdb_addr_to_str(&vnn->public_address)));
4446                         if (first_add) {
4447                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4448
4449                                 data.dsize = sizeof(pnn);
4450                                 data.dptr  = (uint8_t *)&pnn;
4451
4452                                 ret = ctdb_client_send_message(
4453                                         ctdb,
4454                                         CTDB_BROADCAST_CONNECTED,
4455                                         CTDB_SRVID_REBALANCE_NODE,
4456                                         data);
4457                                 if (ret != 0) {
4458                                         DEBUG(DEBUG_WARNING,
4459                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4460                                 }
4461
4462                                 first_add = false;
4463                         }
4464
4465                         ifaces = vnn->ifaces[0];
4466                         iface = 1;
4467                         while (vnn->ifaces[iface] != NULL) {
4468                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4469                                                          vnn->ifaces[iface]);
4470                                 iface++;
4471                         }
4472
4473                         len   = strlen(ifaces) + 1;
4474                         pub = talloc_zero_size(mem_ctx,
4475                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4476                         CTDB_NO_MEMORY(ctdb, pub);
4477
4478                         pub->addr  = vnn->public_address;
4479                         pub->mask  = vnn->public_netmask_bits;
4480                         pub->len   = len;
4481                         memcpy(&pub->iface[0], ifaces, pub->len);
4482
4483                         timeout = TAKEOVER_TIMEOUT();
4484
4485                         data.dsize = offsetof(struct ctdb_addr_info_old,
4486                                               iface) + pub->len;
4487                         data.dptr = (uint8_t *)pub;
4488
4489                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4490                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4491                                                   0, data, async_data,
4492                                                   &timeout, NULL);
4493                         if (state == NULL) {
4494                                 DEBUG(DEBUG_ERR,
4495                                       (__location__
4496                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4497                                 goto failed;
4498                         }
4499
4500                         ctdb_client_async_add(async_data, state);
4501                 }
4502         }
4503
4504         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4505                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4506                 goto failed;
4507         }
4508
4509         talloc_free(mem_ctx);
4510         return 0;
4511
4512 failed:
4513         talloc_free(mem_ctx);
4514         return -1;
4515 }
4516
4517 /* This control is sent to force the node to re-read the public addresses file
4518    and drop any addresses we should nnot longer host, and add new addresses
4519    that we are now able to host
4520 */
4521 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4522 {
4523         struct ctdb_reloadips_handle *h;
4524         pid_t parent = getpid();
4525
4526         if (ctdb->reload_ips != NULL) {
4527                 talloc_free(ctdb->reload_ips);
4528                 ctdb->reload_ips = NULL;
4529         }
4530
4531         h = talloc(ctdb, struct ctdb_reloadips_handle);
4532         CTDB_NO_MEMORY(ctdb, h);
4533         h->ctdb     = ctdb;
4534         h->c        = NULL;
4535         h->status   = -1;
4536         
4537         if (pipe(h->fd) == -1) {
4538                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4539                 talloc_free(h);
4540                 return -1;
4541         }
4542
4543         h->child = ctdb_fork(ctdb);
4544         if (h->child == (pid_t)-1) {
4545                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4546                 close(h->fd[0]);
4547                 close(h->fd[1]);
4548                 talloc_free(h);
4549                 return -1;
4550         }
4551
4552         /* child process */
4553         if (h->child == 0) {
4554                 signed char res = 0;
4555
4556                 close(h->fd[0]);
4557                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4558
4559                 ctdb_set_process_name("ctdb_reloadips");
4560                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4561                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4562                         res = -1;
4563                 } else {
4564                         res = ctdb_reloadips_child(ctdb);
4565                         if (res != 0) {
4566                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4567                         }
4568                 }
4569
4570                 sys_write(h->fd[1], &res, 1);
4571                 /* make sure we die when our parent dies */
4572                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4573                         sleep(5);
4574                 }
4575                 _exit(0);
4576         }
4577
4578         h->c             = talloc_steal(h, c);
4579
4580         close(h->fd[1]);
4581         set_close_on_exec(h->fd[0]);
4582
4583         talloc_set_destructor(h, ctdb_reloadips_destructor);
4584
4585
4586         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4587                                ctdb_reloadips_child_handler, (void *)h);
4588         tevent_fd_set_auto_close(h->fde);
4589
4590         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4591                          ctdb_reloadips_timeout_event, h);
4592
4593         /* we reply later */
4594         *async_reply = true;
4595         return 0;
4596 }