ctdb-daemon: Separate prototypes for common client/server functions
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33
34 #include "ctdb_private.h"
35 #include "ctdb_client.h"
36 #include "ctdb_logging.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42
43
/*
 * Timeout for takeover operations, built from the takeover_timeout
 * tunable.  Expands to an expression that references a variable named
 * "ctdb" in the scope where the macro is used.
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP schedule used by ctdb_control_send_arp():
 * CTDB_ARP_INTERVAL is the first argument of timeval_current_ofs() for
 * the re-arm timer, CTDB_ARP_REPEAT is the number of rounds after
 * which the sender frees itself.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
48
49 /* Flags used in IP allocation algorithms. */
50 struct ctdb_ipflags {
51         bool noiptakeover;
52         bool noiphost;
53         enum ctdb_runstate runstate;
54 };
55
/*
 * One local network interface known to the daemon, kept on the
 * ctdb->ifaces list.
 */
struct ctdb_iface {
	/* list linkage used by DLIST_ADD/DLIST_REMOVE on ctdb->ifaces */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name; a talloc child of this structure */
	const char *name;
	/* link state; set to true when the entry is created */
	bool link_up;
	/* number of VNNs currently assigned to this interface */
	uint32_t references;
};
62
63 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
64 {
65         if (vnn->iface) {
66                 return vnn->iface->name;
67         }
68
69         return "__none__";
70 }
71
72 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
73 {
74         struct ctdb_iface *i;
75
76         /* Verify that we dont have an entry for this ip yet */
77         for (i=ctdb->ifaces;i;i=i->next) {
78                 if (strcmp(i->name, iface) == 0) {
79                         return 0;
80                 }
81         }
82
83         /* create a new structure for this interface */
84         i = talloc_zero(ctdb, struct ctdb_iface);
85         CTDB_NO_MEMORY_FATAL(ctdb, i);
86         i->name = talloc_strdup(i, iface);
87         CTDB_NO_MEMORY(ctdb, i->name);
88
89         i->link_up = true;
90
91         DLIST_ADD(ctdb->ifaces, i);
92
93         return 0;
94 }
95
96 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
97                                         const char *name)
98 {
99         int n;
100
101         for (n = 0; vnn->ifaces[n] != NULL; n++) {
102                 if (strcmp(name, vnn->ifaces[n]) == 0) {
103                         return true;
104                 }
105         }
106
107         return false;
108 }
109
110 /* If any interfaces now have no possible IPs then delete them.  This
111  * implementation is naive (i.e. simple) rather than clever
112  * (i.e. complex).  Given that this is run on delip and that operation
113  * is rare, this doesn't need to be efficient - it needs to be
114  * foolproof.  One alternative is reference counting, where the logic
115  * is distributed and can, therefore, be broken in multiple places.
116  * Another alternative is to build a red-black tree of interfaces that
117  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
118  * once) and then walking ctdb->ifaces once and deleting those not in
119  * the tree.  Let's go to one of those if the naive implementation
120  * causes problems...  :-)
121  */
122 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
123                                         struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *i, *next;
126
127         /* For each interface, check if there's an IP using it. */
128         for (i = ctdb->ifaces; i != NULL; i = next) {
129                 struct ctdb_vnn *tv;
130                 bool found;
131                 next = i->next;
132
133                 /* Only consider interfaces named in the given VNN. */
134                 if (!vnn_has_interface_with_name(vnn, i->name)) {
135                         continue;
136                 }
137
138                 /* Is the "single IP" on this interface? */
139                 if ((ctdb->single_ip_vnn != NULL) &&
140                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
141                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
142                         /* Found, next interface please... */
143                         continue;
144                 }
145                 /* Search for a vnn with this interface. */
146                 found = false;
147                 for (tv=ctdb->vnn; tv; tv=tv->next) {
148                         if (vnn_has_interface_with_name(tv, i->name)) {
149                                 found = true;
150                                 break;
151                         }
152                 }
153
154                 if (!found) {
155                         /* None of the VNNs are using this interface. */
156                         DLIST_REMOVE(ctdb->ifaces, i);
157                         talloc_free(i);
158                 }
159         }
160 }
161
162
163 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
164                                           const char *iface)
165 {
166         struct ctdb_iface *i;
167
168         for (i=ctdb->ifaces;i;i=i->next) {
169                 if (strcmp(i->name, iface) == 0) {
170                         return i;
171                 }
172         }
173
174         return NULL;
175 }
176
177 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178                                               struct ctdb_vnn *vnn)
179 {
180         int i;
181         struct ctdb_iface *cur = NULL;
182         struct ctdb_iface *best = NULL;
183
184         for (i=0; vnn->ifaces[i]; i++) {
185
186                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
187                 if (cur == NULL) {
188                         continue;
189                 }
190
191                 if (!cur->link_up) {
192                         continue;
193                 }
194
195                 if (best == NULL) {
196                         best = cur;
197                         continue;
198                 }
199
200                 if (cur->references < best->references) {
201                         best = cur;
202                         continue;
203                 }
204         }
205
206         return best;
207 }
208
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210                                      struct ctdb_vnn *vnn)
211 {
212         struct ctdb_iface *best = NULL;
213
214         if (vnn->iface) {
215                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216                                    "still assigned to iface '%s'\n",
217                                    ctdb_addr_to_str(&vnn->public_address),
218                                    ctdb_vnn_iface_string(vnn)));
219                 return 0;
220         }
221
222         best = ctdb_vnn_best_iface(ctdb, vnn);
223         if (best == NULL) {
224                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225                                   "cannot assign to iface any iface\n",
226                                   ctdb_addr_to_str(&vnn->public_address)));
227                 return -1;
228         }
229
230         vnn->iface = best;
231         best->references++;
232         vnn->pnn = ctdb->pnn;
233
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now assigned to iface '%s' refs[%d]\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            best->references));
239         return 0;
240 }
241
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243                                     struct ctdb_vnn *vnn)
244 {
245         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246                            "now unassigned (old iface '%s' refs[%d])\n",
247                            ctdb_addr_to_str(&vnn->public_address),
248                            ctdb_vnn_iface_string(vnn),
249                            vnn->iface?vnn->iface->references:0));
250         if (vnn->iface) {
251                 vnn->iface->references--;
252         }
253         vnn->iface = NULL;
254         if (vnn->pnn == ctdb->pnn) {
255                 vnn->pnn = -1;
256         }
257 }
258
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260                                struct ctdb_vnn *vnn)
261 {
262         int i;
263
264         if (vnn->delete_pending) {
265                 return false;
266         }
267
268         if (vnn->iface && vnn->iface->link_up) {
269                 return true;
270         }
271
272         for (i=0; vnn->ifaces[i]; i++) {
273                 struct ctdb_iface *cur;
274
275                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
276                 if (cur == NULL) {
277                         continue;
278                 }
279
280                 if (cur->link_up) {
281                         return true;
282                 }
283         }
284
285         return false;
286 }
287
288 struct ctdb_takeover_arp {
289         struct ctdb_context *ctdb;
290         uint32_t count;
291         ctdb_sock_addr addr;
292         struct ctdb_tcp_array *tcparray;
293         struct ctdb_vnn *vnn;
294 };
295
296
297 /*
298   lists of tcp endpoints
299  */
300 struct ctdb_tcp_list {
301         struct ctdb_tcp_list *prev, *next;
302         struct ctdb_tcp_connection connection;
303 };
304
305 /*
306   list of clients to kill on IP release
307  */
308 struct ctdb_client_ip {
309         struct ctdb_client_ip *prev, *next;
310         struct ctdb_context *ctdb;
311         ctdb_sock_addr addr;
312         uint32_t client_id;
313 };
314
315
316 /*
317   send a gratuitous arp
318  */
319 static void ctdb_control_send_arp(struct tevent_context *ev,
320                                   struct tevent_timer *te,
321                                   struct timeval t, void *private_data)
322 {
323         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
324                                                         struct ctdb_takeover_arp);
325         int i, ret;
326         struct ctdb_tcp_array *tcparray;
327         const char *iface = ctdb_vnn_iface_string(arp->vnn);
328
329         ret = ctdb_sys_send_arp(&arp->addr, iface);
330         if (ret != 0) {
331                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
332                                   iface, strerror(errno)));
333         }
334
335         tcparray = arp->tcparray;
336         if (tcparray) {
337                 for (i=0;i<tcparray->num;i++) {
338                         struct ctdb_tcp_connection *tcon;
339
340                         tcon = &tcparray->connections[i];
341                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
342                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
343                                 ctdb_addr_to_str(&tcon->src_addr),
344                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
345                         ret = ctdb_sys_send_tcp(
346                                 &tcon->src_addr, 
347                                 &tcon->dst_addr,
348                                 0, 0, 0);
349                         if (ret != 0) {
350                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
351                                         ctdb_addr_to_str(&tcon->src_addr)));
352                         }
353                 }
354         }
355
356         arp->count++;
357
358         if (arp->count == CTDB_ARP_REPEAT) {
359                 talloc_free(arp);
360                 return;
361         }
362
363         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
364                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
365                          ctdb_control_send_arp, arp);
366 }
367
368 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
369                                        struct ctdb_vnn *vnn)
370 {
371         struct ctdb_takeover_arp *arp;
372         struct ctdb_tcp_array *tcparray;
373
374         if (!vnn->takeover_ctx) {
375                 vnn->takeover_ctx = talloc_new(vnn);
376                 if (!vnn->takeover_ctx) {
377                         return -1;
378                 }
379         }
380
381         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
382         if (!arp) {
383                 return -1;
384         }
385
386         arp->ctdb = ctdb;
387         arp->addr = vnn->public_address;
388         arp->vnn  = vnn;
389
390         tcparray = vnn->tcp_array;
391         if (tcparray) {
392                 /* add all of the known tcp connections for this IP to the
393                    list of tcp connections to send tickle acks for */
394                 arp->tcparray = talloc_steal(arp, tcparray);
395
396                 vnn->tcp_array = NULL;
397                 vnn->tcp_update_needed = true;
398         }
399
400         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
401                          timeval_zero(), ctdb_control_send_arp, arp);
402
403         return 0;
404 }
405
406 struct takeover_callback_state {
407         struct ctdb_req_control *c;
408         ctdb_sock_addr *addr;
409         struct ctdb_vnn *vnn;
410 };
411
/*
 * Context for an in-flight takeip event; see ctdb_do_takeip() and
 * ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
	/* the control request to reply to once the event finishes */
	struct ctdb_req_control *c;
	/* the VNN being taken over */
	struct ctdb_vnn *vnn;
};
416
417 /*
418   called when takeip event finishes
419  */
420 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
421                                     void *private_data)
422 {
423         struct ctdb_do_takeip_state *state =
424                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
425         int32_t ret;
426         TDB_DATA data;
427
428         if (status != 0) {
429                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
430         
431                 if (status == -ETIME) {
432                         ctdb_ban_self(ctdb);
433                 }
434                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
435                                  ctdb_addr_to_str(&state->vnn->public_address),
436                                  ctdb_vnn_iface_string(state->vnn)));
437                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
438
439                 node->flags |= NODE_FLAGS_UNHEALTHY;
440                 talloc_free(state);
441                 return;
442         }
443
444         if (ctdb->do_checkpublicip) {
445
446         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
447         if (ret != 0) {
448                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
449                 talloc_free(state);
450                 return;
451         }
452
453         }
454
455         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
456         data.dsize = strlen((char *)data.dptr) + 1;
457         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
458
459         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
460
461
462         /* the control succeeded */
463         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
464         talloc_free(state);
465         return;
466 }
467
468 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
469 {
470         state->vnn->update_in_flight = false;
471         return 0;
472 }
473
474 /*
475   take over an ip address
476  */
477 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
478                               struct ctdb_req_control *c,
479                               struct ctdb_vnn *vnn)
480 {
481         int ret;
482         struct ctdb_do_takeip_state *state;
483
484         if (vnn->update_in_flight) {
485                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
486                                     "update for this IP already in flight\n",
487                                     ctdb_addr_to_str(&vnn->public_address),
488                                     vnn->public_netmask_bits));
489                 return -1;
490         }
491
492         ret = ctdb_vnn_assign_iface(ctdb, vnn);
493         if (ret != 0) {
494                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
495                                  "assign a usable interface\n",
496                                  ctdb_addr_to_str(&vnn->public_address),
497                                  vnn->public_netmask_bits));
498                 return -1;
499         }
500
501         state = talloc(vnn, struct ctdb_do_takeip_state);
502         CTDB_NO_MEMORY(ctdb, state);
503
504         state->c = talloc_steal(ctdb, c);
505         state->vnn   = vnn;
506
507         vnn->update_in_flight = true;
508         talloc_set_destructor(state, ctdb_takeip_destructor);
509
510         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
511                             ctdb_addr_to_str(&vnn->public_address),
512                             vnn->public_netmask_bits,
513                             ctdb_vnn_iface_string(vnn)));
514
515         ret = ctdb_event_script_callback(ctdb,
516                                          state,
517                                          ctdb_do_takeip_callback,
518                                          state,
519                                          CTDB_EVENT_TAKE_IP,
520                                          "%s %s %u",
521                                          ctdb_vnn_iface_string(vnn),
522                                          ctdb_addr_to_str(&vnn->public_address),
523                                          vnn->public_netmask_bits);
524
525         if (ret != 0) {
526                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
527                         ctdb_addr_to_str(&vnn->public_address),
528                         ctdb_vnn_iface_string(vnn)));
529                 talloc_free(state);
530                 return -1;
531         }
532
533         return 0;
534 }
535
/*
 * Context for an in-flight updateip event; see ctdb_do_updateip() and
 * ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
	/* the control request to reply to once the event finishes */
	struct ctdb_req_control *c;
	/* interface the address was on before the move; restored if
	 * the event fails */
	struct ctdb_iface *old;
	/* the VNN being moved */
	struct ctdb_vnn *vnn;
};
541
542 /*
543   called when updateip event finishes
544  */
545 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
546                                       void *private_data)
547 {
548         struct ctdb_do_updateip_state *state =
549                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
550         int32_t ret;
551
552         if (status != 0) {
553                 if (status == -ETIME) {
554                         ctdb_ban_self(ctdb);
555                 }
556                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
557                         ctdb_addr_to_str(&state->vnn->public_address),
558                         state->old->name,
559                         ctdb_vnn_iface_string(state->vnn)));
560
561                 /*
562                  * All we can do is reset the old interface
563                  * and let the next run fix it
564                  */
565                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
566                 state->vnn->iface = state->old;
567                 state->vnn->iface->references++;
568
569                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
570                 talloc_free(state);
571                 return;
572         }
573
574         if (ctdb->do_checkpublicip) {
575
576         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
577         if (ret != 0) {
578                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
579                 talloc_free(state);
580                 return;
581         }
582
583         }
584
585         /* the control succeeded */
586         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
587         talloc_free(state);
588         return;
589 }
590
591 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
592 {
593         state->vnn->update_in_flight = false;
594         return 0;
595 }
596
597 /*
598   update (move) an ip address
599  */
600 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
601                                 struct ctdb_req_control *c,
602                                 struct ctdb_vnn *vnn)
603 {
604         int ret;
605         struct ctdb_do_updateip_state *state;
606         struct ctdb_iface *old = vnn->iface;
607         const char *new_name;
608
609         if (vnn->update_in_flight) {
610                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
611                                     "update for this IP already in flight\n",
612                                     ctdb_addr_to_str(&vnn->public_address),
613                                     vnn->public_netmask_bits));
614                 return -1;
615         }
616
617         ctdb_vnn_unassign_iface(ctdb, vnn);
618         ret = ctdb_vnn_assign_iface(ctdb, vnn);
619         if (ret != 0) {
620                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
621                                  "assin a usable interface (old iface '%s')\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  vnn->public_netmask_bits,
624                                  old->name));
625                 return -1;
626         }
627
628         new_name = ctdb_vnn_iface_string(vnn);
629         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
630                 /* A benign update from one interface onto itself.
631                  * no need to run the eventscripts in this case, just return
632                  * success.
633                  */
634                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
635                 return 0;
636         }
637
638         state = talloc(vnn, struct ctdb_do_updateip_state);
639         CTDB_NO_MEMORY(ctdb, state);
640
641         state->c = talloc_steal(ctdb, c);
642         state->old = old;
643         state->vnn = vnn;
644
645         vnn->update_in_flight = true;
646         talloc_set_destructor(state, ctdb_updateip_destructor);
647
648         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
649                             "interface %s to %s\n",
650                             ctdb_addr_to_str(&vnn->public_address),
651                             vnn->public_netmask_bits,
652                             old->name,
653                             new_name));
654
655         ret = ctdb_event_script_callback(ctdb,
656                                          state,
657                                          ctdb_do_updateip_callback,
658                                          state,
659                                          CTDB_EVENT_UPDATE_IP,
660                                          "%s %s %s %u",
661                                          state->old->name,
662                                          new_name,
663                                          ctdb_addr_to_str(&vnn->public_address),
664                                          vnn->public_netmask_bits);
665         if (ret != 0) {
666                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
667                                  ctdb_addr_to_str(&vnn->public_address),
668                                  old->name, new_name));
669                 talloc_free(state);
670                 return -1;
671         }
672
673         return 0;
674 }
675
676 /*
677   Find the vnn of the node that has a public ip address
678   returns -1 if the address is not known as a public address
679  */
680 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
681 {
682         struct ctdb_vnn *vnn;
683
684         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
685                 if (ctdb_same_ip(&vnn->public_address, addr)) {
686                         return vnn;
687                 }
688         }
689
690         return NULL;
691 }
692
693 /*
694   take over an ip address
695  */
696 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
697                                  struct ctdb_req_control *c,
698                                  TDB_DATA indata,
699                                  bool *async_reply)
700 {
701         int ret;
702         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
703         struct ctdb_vnn *vnn;
704         bool have_ip = false;
705         bool do_updateip = false;
706         bool do_takeip = false;
707         struct ctdb_iface *best_iface = NULL;
708
709         if (pip->pnn != ctdb->pnn) {
710                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
711                                  "with pnn %d, but we're node %d\n",
712                                  ctdb_addr_to_str(&pip->addr),
713                                  pip->pnn, ctdb->pnn));
714                 return -1;
715         }
716
717         /* update out vnn list */
718         vnn = find_public_ip_vnn(ctdb, &pip->addr);
719         if (vnn == NULL) {
720                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
721                         ctdb_addr_to_str(&pip->addr)));
722                 return 0;
723         }
724
725         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
726                 have_ip = ctdb_sys_have_ip(&pip->addr);
727         }
728         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
729         if (best_iface == NULL) {
730                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
731                                  "a usable interface (old %s, have_ip %d)\n",
732                                  ctdb_addr_to_str(&vnn->public_address),
733                                  vnn->public_netmask_bits,
734                                  ctdb_vnn_iface_string(vnn),
735                                  have_ip));
736                 return -1;
737         }
738
739         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
740                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
741                 have_ip = false;
742         }
743
744
745         if (vnn->iface == NULL && have_ip) {
746                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
747                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
748                                  ctdb_addr_to_str(&vnn->public_address)));
749                 return 0;
750         }
751
752         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
753                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
754                                   "and we have it on iface[%s], but it was assigned to node %d"
755                                   "and we are node %d, banning ourself\n",
756                                  ctdb_addr_to_str(&vnn->public_address),
757                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
758                 ctdb_ban_self(ctdb);
759                 return -1;
760         }
761
762         if (vnn->pnn == -1 && have_ip) {
763                 vnn->pnn = ctdb->pnn;
764                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
765                                   "and we already have it on iface[%s], update local daemon\n",
766                                  ctdb_addr_to_str(&vnn->public_address),
767                                   ctdb_vnn_iface_string(vnn)));
768                 return 0;
769         }
770
771         if (vnn->iface) {
772                 if (vnn->iface != best_iface) {
773                         if (!vnn->iface->link_up) {
774                                 do_updateip = true;
775                         } else if (vnn->iface->references > (best_iface->references + 1)) {
776                                 /* only move when the rebalance gains something */
777                                         do_updateip = true;
778                         }
779                 }
780         }
781
782         if (!have_ip) {
783                 if (do_updateip) {
784                         ctdb_vnn_unassign_iface(ctdb, vnn);
785                         do_updateip = false;
786                 }
787                 do_takeip = true;
788         }
789
790         if (do_takeip) {
791                 ret = ctdb_do_takeip(ctdb, c, vnn);
792                 if (ret != 0) {
793                         return -1;
794                 }
795         } else if (do_updateip) {
796                 ret = ctdb_do_updateip(ctdb, c, vnn);
797                 if (ret != 0) {
798                         return -1;
799                 }
800         } else {
801                 /*
802                  * The interface is up and the kernel known the ip
803                  * => do nothing
804                  */
805                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
806                         ctdb_addr_to_str(&pip->addr),
807                         vnn->public_netmask_bits,
808                         ctdb_vnn_iface_string(vnn)));
809                 return 0;
810         }
811
812         /* tell ctdb_control.c that we will be replying asynchronously */
813         *async_reply = true;
814
815         return 0;
816 }
817
818 /*
819   kill any clients that are registered with a IP that is being released
820  */
821 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
822 {
823         struct ctdb_client_ip *ip;
824
825         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
826                 ctdb_addr_to_str(addr)));
827
828         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
829                 ctdb_sock_addr tmp_addr;
830
831                 tmp_addr = ip->addr;
832                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
833                         ip->client_id,
834                         ctdb_addr_to_str(&ip->addr)));
835
836                 if (ctdb_same_ip(&tmp_addr, addr)) {
837                         struct ctdb_client *client = reqid_find(ctdb->idr,
838                                                                 ip->client_id,
839                                                                 struct ctdb_client);
840                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
841                                 ip->client_id,
842                                 ctdb_addr_to_str(&ip->addr),
843                                 client->pid));
844
845                         if (client->pid != 0) {
846                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
847                                         (unsigned)client->pid,
848                                         ctdb_addr_to_str(addr),
849                                         ip->client_id));
850                                 kill(client->pid, SIGKILL);
851                         }
852                 }
853         }
854 }
855
856 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
857 {
858         DLIST_REMOVE(ctdb->vnn, vnn);
859         ctdb_vnn_unassign_iface(ctdb, vnn);
860         ctdb_remove_orphaned_ifaces(ctdb, vnn);
861         talloc_free(vnn);
862 }
863
864 /*
865   called when releaseip event finishes
866  */
867 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
868                                 void *private_data)
869 {
870         struct takeover_callback_state *state = 
871                 talloc_get_type(private_data, struct takeover_callback_state);
872         TDB_DATA data;
873
874         if (status == -ETIME) {
875                 ctdb_ban_self(ctdb);
876         }
877
878         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
879                 if  (ctdb_sys_have_ip(state->addr)) {
880                         DEBUG(DEBUG_ERR,
881                               ("IP %s still hosted during release IP callback, failing\n",
882                                ctdb_addr_to_str(state->addr)));
883                         ctdb_request_control_reply(ctdb, state->c,
884                                                    NULL, -1, NULL);
885                         talloc_free(state);
886                         return;
887                 }
888         }
889
890         /* send a message to all clients of this node telling them
891            that the cluster has been reconfigured and they should
892            release any sockets on this IP */
893         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
894         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
895         data.dsize = strlen((char *)data.dptr)+1;
896
897         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
898
899         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
900
901         /* kill clients that have registered with this IP */
902         release_kill_clients(ctdb, state->addr);
903
904         ctdb_vnn_unassign_iface(ctdb, state->vnn);
905
906         /* Process the IP if it has been marked for deletion */
907         if (state->vnn->delete_pending) {
908                 do_delete_ip(ctdb, state->vnn);
909                 state->vnn = NULL;
910         }
911
912         /* the control succeeded */
913         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
914         talloc_free(state);
915 }
916
917 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
918 {
919         if (state->vnn != NULL) {
920                 state->vnn->update_in_flight = false;
921         }
922         return 0;
923 }
924
925 /*
926   release an ip address
927  */
928 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
929                                 struct ctdb_req_control *c,
930                                 TDB_DATA indata, 
931                                 bool *async_reply)
932 {
933         int ret;
934         struct takeover_callback_state *state;
935         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
936         struct ctdb_vnn *vnn;
937         char *iface;
938
939         /* update our vnn list */
940         vnn = find_public_ip_vnn(ctdb, &pip->addr);
941         if (vnn == NULL) {
942                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
943                         ctdb_addr_to_str(&pip->addr)));
944                 return 0;
945         }
946         vnn->pnn = pip->pnn;
947
948         /* stop any previous arps */
949         talloc_free(vnn->takeover_ctx);
950         vnn->takeover_ctx = NULL;
951
952         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
953          * lazy multicast to drop an IP from any node that isn't the
954          * intended new node.  The following causes makes ctdbd ignore
955          * a release for any address it doesn't host.
956          */
957         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
958                 if (!ctdb_sys_have_ip(&pip->addr)) {
959                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
960                                 ctdb_addr_to_str(&pip->addr),
961                                 vnn->public_netmask_bits,
962                                 ctdb_vnn_iface_string(vnn)));
963                         ctdb_vnn_unassign_iface(ctdb, vnn);
964                         return 0;
965                 }
966         } else {
967                 if (vnn->iface == NULL) {
968                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
969                                            ctdb_addr_to_str(&pip->addr),
970                                            vnn->public_netmask_bits));
971                         return 0;
972                 }
973         }
974
975         /* There is a potential race between take_ip and us because we
976          * update the VNN via a callback that run when the
977          * eventscripts have been run.  Avoid the race by allowing one
978          * update to be in flight at a time.
979          */
980         if (vnn->update_in_flight) {
981                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
982                                     "update for this IP already in flight\n",
983                                     ctdb_addr_to_str(&vnn->public_address),
984                                     vnn->public_netmask_bits));
985                 return -1;
986         }
987
988         iface = strdup(ctdb_vnn_iface_string(vnn));
989
990         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
991                 ctdb_addr_to_str(&pip->addr),
992                 vnn->public_netmask_bits,
993                 iface,
994                 pip->pnn));
995
996         state = talloc(ctdb, struct takeover_callback_state);
997         if (state == NULL) {
998                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
999                                __FILE__, __LINE__);
1000                 free(iface);
1001                 return -1;
1002         }
1003
1004         state->c = talloc_steal(state, c);
1005         state->addr = talloc(state, ctdb_sock_addr);       
1006         if (state->addr == NULL) {
1007                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1008                                __FILE__, __LINE__);
1009                 free(iface);
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013         *state->addr = pip->addr;
1014         state->vnn   = vnn;
1015
1016         vnn->update_in_flight = true;
1017         talloc_set_destructor(state, ctdb_releaseip_destructor);
1018
1019         ret = ctdb_event_script_callback(ctdb, 
1020                                          state, release_ip_callback, state,
1021                                          CTDB_EVENT_RELEASE_IP,
1022                                          "%s %s %u",
1023                                          iface,
1024                                          ctdb_addr_to_str(&pip->addr),
1025                                          vnn->public_netmask_bits);
1026         free(iface);
1027         if (ret != 0) {
1028                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1029                         ctdb_addr_to_str(&pip->addr),
1030                         ctdb_vnn_iface_string(vnn)));
1031                 talloc_free(state);
1032                 return -1;
1033         }
1034
1035         /* tell the control that we will be reply asynchronously */
1036         *async_reply = true;
1037         return 0;
1038 }
1039
1040 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1041                                    ctdb_sock_addr *addr,
1042                                    unsigned mask, const char *ifaces,
1043                                    bool check_address)
1044 {
1045         struct ctdb_vnn      *vnn;
1046         uint32_t num = 0;
1047         char *tmp;
1048         const char *iface;
1049         int i;
1050         int ret;
1051
1052         tmp = strdup(ifaces);
1053         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1054                 if (!ctdb_sys_check_iface_exists(iface)) {
1055                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1056                         free(tmp);
1057                         return -1;
1058                 }
1059         }
1060         free(tmp);
1061
1062         /* Verify that we dont have an entry for this ip yet */
1063         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1064                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1065                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1066                                 ctdb_addr_to_str(addr)));
1067                         return -1;
1068                 }               
1069         }
1070
1071         /* create a new vnn structure for this ip address */
1072         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1073         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1074         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1075         tmp = talloc_strdup(vnn, ifaces);
1076         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1077         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1078                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1080                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1081                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1082                 num++;
1083         }
1084         talloc_free(tmp);
1085         vnn->ifaces[num] = NULL;
1086         vnn->public_address      = *addr;
1087         vnn->public_netmask_bits = mask;
1088         vnn->pnn                 = -1;
1089         if (check_address) {
1090                 if (ctdb_sys_have_ip(addr)) {
1091                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1092                         vnn->pnn = ctdb->pnn;
1093                 }
1094         }
1095
1096         for (i=0; vnn->ifaces[i]; i++) {
1097                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1098                 if (ret != 0) {
1099                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1100                                            "for public_address[%s]\n",
1101                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1102                         talloc_free(vnn);
1103                         return -1;
1104                 }
1105         }
1106
1107         DLIST_ADD(ctdb->vnn, vnn);
1108
1109         return 0;
1110 }
1111
1112 /*
1113   setup the public address lists from a file
1114 */
1115 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1116 {
1117         char **lines;
1118         int nlines;
1119         int i;
1120
1121         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1122         if (lines == NULL) {
1123                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1124                 return -1;
1125         }
1126         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1127                 nlines--;
1128         }
1129
1130         for (i=0;i<nlines;i++) {
1131                 unsigned mask;
1132                 ctdb_sock_addr addr;
1133                 const char *addrstr;
1134                 const char *ifaces;
1135                 char *tok, *line;
1136
1137                 line = lines[i];
1138                 while ((*line == ' ') || (*line == '\t')) {
1139                         line++;
1140                 }
1141                 if (*line == '#') {
1142                         continue;
1143                 }
1144                 if (strcmp(line, "") == 0) {
1145                         continue;
1146                 }
1147                 tok = strtok(line, " \t");
1148                 addrstr = tok;
1149                 tok = strtok(NULL, " \t");
1150                 if (tok == NULL) {
1151                         if (NULL == ctdb->default_public_interface) {
1152                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1153                                          i+1));
1154                                 talloc_free(lines);
1155                                 return -1;
1156                         }
1157                         ifaces = ctdb->default_public_interface;
1158                 } else {
1159                         ifaces = tok;
1160                 }
1161
1162                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1163                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1164                         talloc_free(lines);
1165                         return -1;
1166                 }
1167                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1168                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1169                         talloc_free(lines);
1170                         return -1;
1171                 }
1172         }
1173
1174
1175         talloc_free(lines);
1176         return 0;
1177 }
1178
1179 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1180                               const char *iface,
1181                               const char *ip)
1182 {
1183         struct ctdb_vnn *svnn;
1184         struct ctdb_iface *cur = NULL;
1185         bool ok;
1186         int ret;
1187
1188         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1189         CTDB_NO_MEMORY(ctdb, svnn);
1190
1191         svnn->ifaces = talloc_array(svnn, const char *, 2);
1192         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1193         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1194         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1195         svnn->ifaces[1] = NULL;
1196
1197         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1198         if (!ok) {
1199                 talloc_free(svnn);
1200                 return -1;
1201         }
1202
1203         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1204         if (ret != 0) {
1205                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1206                                    "for single_ip[%s]\n",
1207                                    svnn->ifaces[0],
1208                                    ctdb_addr_to_str(&svnn->public_address)));
1209                 talloc_free(svnn);
1210                 return -1;
1211         }
1212
1213         /* assume the single public ip interface is initially "good" */
1214         cur = ctdb_find_iface(ctdb, iface);
1215         if (cur == NULL) {
1216                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1217                 return -1;
1218         }
1219         cur->link_up = true;
1220
1221         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1222         if (ret != 0) {
1223                 talloc_free(svnn);
1224                 return -1;
1225         }
1226
1227         ctdb->single_ip_vnn = svnn;
1228         return 0;
1229 }
1230
1231 struct ctdb_public_ip_list {
1232         struct ctdb_public_ip_list *next;
1233         uint32_t pnn;
1234         ctdb_sock_addr addr;
1235 };
1236
1237 /* Given a physical node, return the number of
1238    public addresses that is currently assigned to this node.
1239 */
1240 static int node_ip_coverage(struct ctdb_context *ctdb, 
1241         int32_t pnn,
1242         struct ctdb_public_ip_list *ips)
1243 {
1244         int num=0;
1245
1246         for (;ips;ips=ips->next) {
1247                 if (ips->pnn == pnn) {
1248                         num++;
1249                 }
1250         }
1251         return num;
1252 }
1253
1254
1255 /* Can the given node host the given IP: is the public IP known to the
1256  * node and is NOIPHOST unset?
1257 */
1258 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1259                              struct ctdb_ipflags ipflags,
1260                              struct ctdb_public_ip_list *ip)
1261 {
1262         struct ctdb_all_public_ips *public_ips;
1263         int i;
1264
1265         if (ipflags.noiphost) {
1266                 return false;
1267         }
1268
1269         public_ips = ctdb->nodes[pnn]->available_public_ips;
1270
1271         if (public_ips == NULL) {
1272                 return false;
1273         }
1274
1275         for (i=0; i<public_ips->num; i++) {
1276                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1277                         /* yes, this node can serve this public ip */
1278                         return true;
1279                 }
1280         }
1281
1282         return false;
1283 }
1284
1285 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1286                                  struct ctdb_ipflags ipflags,
1287                                  struct ctdb_public_ip_list *ip)
1288 {
1289         if (ipflags.noiptakeover) {
1290                 return false;
1291         }
1292
1293         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1294 }
1295
1296 /* search the node lists list for a node to takeover this ip.
1297    pick the node that currently are serving the least number of ips
1298    so that the ips get spread out evenly.
1299 */
1300 static int find_takeover_node(struct ctdb_context *ctdb, 
1301                 struct ctdb_ipflags *ipflags,
1302                 struct ctdb_public_ip_list *ip,
1303                 struct ctdb_public_ip_list *all_ips)
1304 {
1305         int pnn, min=0, num;
1306         int i, numnodes;
1307
1308         numnodes = talloc_array_length(ipflags);
1309         pnn    = -1;
1310         for (i=0; i<numnodes; i++) {
1311                 /* verify that this node can serve this ip */
1312                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1313                         /* no it couldnt   so skip to the next node */
1314                         continue;
1315                 }
1316
1317                 num = node_ip_coverage(ctdb, i, all_ips);
1318                 /* was this the first node we checked ? */
1319                 if (pnn == -1) {
1320                         pnn = i;
1321                         min  = num;
1322                 } else {
1323                         if (num < min) {
1324                                 pnn = i;
1325                                 min  = num;
1326                         }
1327                 }
1328         }       
1329         if (pnn == -1) {
1330                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1331                         ctdb_addr_to_str(&ip->addr)));
1332
1333                 return -1;
1334         }
1335
1336         ip->pnn = pnn;
1337         return 0;
1338 }
1339
1340 #define IP_KEYLEN       4
1341 static uint32_t *ip_key(ctdb_sock_addr *ip)
1342 {
1343         static uint32_t key[IP_KEYLEN];
1344
1345         bzero(key, sizeof(key));
1346
1347         switch (ip->sa.sa_family) {
1348         case AF_INET:
1349                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1350                 break;
1351         case AF_INET6: {
1352                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1353                 key[0]  = htonl(s6_a32[0]);
1354                 key[1]  = htonl(s6_a32[1]);
1355                 key[2]  = htonl(s6_a32[2]);
1356                 key[3]  = htonl(s6_a32[3]);
1357                 break;
1358         }
1359         default:
1360                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1361                 return key;
1362         }
1363
1364         return key;
1365 }
1366
1367 static void *add_ip_callback(void *parm, void *data)
1368 {
1369         struct ctdb_public_ip_list *this_ip = parm; 
1370         struct ctdb_public_ip_list *prev_ip = data; 
1371
1372         if (prev_ip == NULL) {
1373                 return parm;
1374         }
1375         if (this_ip->pnn == -1) {
1376                 this_ip->pnn = prev_ip->pnn;
1377         }
1378
1379         return parm;
1380 }
1381
1382 static int getips_count_callback(void *param, void *data)
1383 {
1384         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1385         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1386
1387         new_ip->next = *ip_list;
1388         *ip_list     = new_ip;
1389         return 0;
1390 }
1391
1392 static struct ctdb_public_ip_list *
1393 create_merged_ip_list(struct ctdb_context *ctdb)
1394 {
1395         int i, j;
1396         struct ctdb_public_ip_list *ip_list;
1397         struct ctdb_all_public_ips *public_ips;
1398
1399         if (ctdb->ip_tree != NULL) {
1400                 talloc_free(ctdb->ip_tree);
1401                 ctdb->ip_tree = NULL;
1402         }
1403         ctdb->ip_tree = trbt_create(ctdb, 0);
1404
1405         for (i=0;i<ctdb->num_nodes;i++) {
1406                 public_ips = ctdb->nodes[i]->known_public_ips;
1407
1408                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1409                         continue;
1410                 }
1411
1412                 /* there were no public ips for this node */
1413                 if (public_ips == NULL) {
1414                         continue;
1415                 }               
1416
1417                 for (j=0;j<public_ips->num;j++) {
1418                         struct ctdb_public_ip_list *tmp_ip; 
1419
1420                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1421                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1422                         /* Do not use information about IP addresses hosted
1423                          * on other nodes, it may not be accurate */
1424                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1425                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1426                         } else {
1427                                 tmp_ip->pnn = -1;
1428                         }
1429                         tmp_ip->addr = public_ips->ips[j].addr;
1430                         tmp_ip->next = NULL;
1431
1432                         trbt_insertarray32_callback(ctdb->ip_tree,
1433                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1434                                 add_ip_callback,
1435                                 tmp_ip);
1436                 }
1437         }
1438
1439         ip_list = NULL;
1440         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1441
1442         return ip_list;
1443 }
1444
1445 /* 
1446  * This is the length of the longtest common prefix between the IPs.
1447  * It is calculated by XOR-ing the 2 IPs together and counting the
1448  * number of leading zeroes.  The implementation means that all
1449  * addresses end up being 128 bits long.
1450  *
1451  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1452  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1453  * lots of nodes and IP addresses?
1454  */
1455 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1456 {
1457         uint32_t ip1_k[IP_KEYLEN];
1458         uint32_t *t;
1459         int i;
1460         uint32_t x;
1461
1462         uint32_t distance = 0;
1463
1464         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1465         t = ip_key(ip2);
1466         for (i=0; i<IP_KEYLEN; i++) {
1467                 x = ip1_k[i] ^ t[i];
1468                 if (x == 0) {
1469                         distance += 32;
1470                 } else {
1471                         /* Count number of leading zeroes. 
1472                          * FIXME? This could be optimised...
1473                          */
1474                         while ((x & (1 << 31)) == 0) {
1475                                 x <<= 1;
1476                                 distance += 1;
1477                         }
1478                 }
1479         }
1480
1481         return distance;
1482 }
1483
1484 /* Calculate the IP distance for the given IP relative to IPs on the
1485    given node.  The ips argument is generally the all_ips variable
1486    used in the main part of the algorithm.
1487  */
1488 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1489                                   struct ctdb_public_ip_list *ips,
1490                                   int pnn)
1491 {
1492         struct ctdb_public_ip_list *t;
1493         uint32_t d;
1494
1495         uint32_t sum = 0;
1496
1497         for (t=ips; t != NULL; t=t->next) {
1498                 if (t->pnn != pnn) {
1499                         continue;
1500                 }
1501
1502                 /* Optimisation: We never calculate the distance
1503                  * between an address and itself.  This allows us to
1504                  * calculate the effect of removing an address from a
1505                  * node by simply calculating the distance between
1506                  * that address and all of the exitsing addresses.
1507                  * Moreover, we assume that we're only ever dealing
1508                  * with addresses from all_ips so we can identify an
1509                  * address via a pointer rather than doing a more
1510                  * expensive address comparison. */
1511                 if (&(t->addr) == ip) {
1512                         continue;
1513                 }
1514
1515                 d = ip_distance(ip, &(t->addr));
1516                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1517         }
1518
1519         return sum;
1520 }
1521
1522 /* Return the LCP2 imbalance metric for addresses currently assigned
1523    to the given node.
1524  */
1525 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1526 {
1527         struct ctdb_public_ip_list *t;
1528
1529         uint32_t imbalance = 0;
1530
1531         for (t=all_ips; t!=NULL; t=t->next) {
1532                 if (t->pnn != pnn) {
1533                         continue;
1534                 }
1535                 /* Pass the rest of the IPs rather than the whole
1536                    all_ips input list.
1537                 */
1538                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1539         }
1540
1541         return imbalance;
1542 }
1543
1544 /* Allocate any unassigned IPs just by looping through the IPs and
1545  * finding the best node for each.
1546  */
1547 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1548                                       struct ctdb_ipflags *ipflags,
1549                                       struct ctdb_public_ip_list *all_ips)
1550 {
1551         struct ctdb_public_ip_list *tmp_ip;
1552
1553         /* loop over all ip's and find a physical node to cover for 
1554            each unassigned ip.
1555         */
1556         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1557                 if (tmp_ip->pnn == -1) {
1558                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1559                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1560                                         ctdb_addr_to_str(&tmp_ip->addr)));
1561                         }
1562                 }
1563         }
1564 }
1565
1566 /* Basic non-deterministic rebalancing algorithm.
1567  */
1568 static void basic_failback(struct ctdb_context *ctdb,
1569                            struct ctdb_ipflags *ipflags,
1570                            struct ctdb_public_ip_list *all_ips,
1571                            int num_ips)
1572 {
1573         int i, numnodes;
1574         int maxnode, maxnum, minnode, minnum, num, retries;
1575         struct ctdb_public_ip_list *tmp_ip;
1576
1577         numnodes = talloc_array_length(ipflags);
1578         retries = 0;
1579
1580 try_again:
1581         maxnum=0;
1582         minnum=0;
1583
1584         /* for each ip address, loop over all nodes that can serve
1585            this ip and make sure that the difference between the node
1586            serving the most and the node serving the least ip's are
1587            not greater than 1.
1588         */
1589         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1590                 if (tmp_ip->pnn == -1) {
1591                         continue;
1592                 }
1593
1594                 /* Get the highest and lowest number of ips's served by any 
1595                    valid node which can serve this ip.
1596                 */
1597                 maxnode = -1;
1598                 minnode = -1;
1599                 for (i=0; i<numnodes; i++) {
1600                         /* only check nodes that can actually serve this ip */
1601                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1602                                 /* no it couldnt   so skip to the next node */
1603                                 continue;
1604                         }
1605
1606                         num = node_ip_coverage(ctdb, i, all_ips);
1607                         if (maxnode == -1) {
1608                                 maxnode = i;
1609                                 maxnum  = num;
1610                         } else {
1611                                 if (num > maxnum) {
1612                                         maxnode = i;
1613                                         maxnum  = num;
1614                                 }
1615                         }
1616                         if (minnode == -1) {
1617                                 minnode = i;
1618                                 minnum  = num;
1619                         } else {
1620                                 if (num < minnum) {
1621                                         minnode = i;
1622                                         minnum  = num;
1623                                 }
1624                         }
1625                 }
1626                 if (maxnode == -1) {
1627                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1628                                 ctdb_addr_to_str(&tmp_ip->addr)));
1629
1630                         continue;
1631                 }
1632
1633                 /* if the spread between the smallest and largest coverage by
1634                    a node is >=2 we steal one of the ips from the node with
1635                    most coverage to even things out a bit.
1636                    try to do this a limited number of times since we dont
1637                    want to spend too much time balancing the ip coverage.
1638                 */
1639                 if ( (maxnum > minnum+1)
1640                      && (retries < (num_ips + 5)) ){
1641                         struct ctdb_public_ip_list *tmp;
1642
1643                         /* Reassign one of maxnode's VNNs */
1644                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1645                                 if (tmp->pnn == maxnode) {
1646                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1647                                         retries++;
1648                                         goto try_again;;
1649                                 }
1650                         }
1651                 }
1652         }
1653 }
1654
1655 static void lcp2_init(struct ctdb_context *tmp_ctx,
1656                       struct ctdb_ipflags *ipflags,
1657                       struct ctdb_public_ip_list *all_ips,
1658                       uint32_t *force_rebalance_nodes,
1659                       uint32_t **lcp2_imbalances,
1660                       bool **rebalance_candidates)
1661 {
1662         int i, numnodes;
1663         struct ctdb_public_ip_list *tmp_ip;
1664
1665         numnodes = talloc_array_length(ipflags);
1666
1667         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1668         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1669         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1670         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1671
1672         for (i=0; i<numnodes; i++) {
1673                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1674                 /* First step: assume all nodes are candidates */
1675                 (*rebalance_candidates)[i] = true;
1676         }
1677
1678         /* 2nd step: if a node has IPs assigned then it must have been
1679          * healthy before, so we remove it from consideration.  This
1680          * is overkill but is all we have because we don't maintain
1681          * state between takeover runs.  An alternative would be to
1682          * keep state and invalidate it every time the recovery master
1683          * changes.
1684          */
1685         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1686                 if (tmp_ip->pnn != -1) {
1687                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1688                 }
1689         }
1690
1691         /* 3rd step: if a node is forced to re-balance then
1692            we allow failback onto the node */
1693         if (force_rebalance_nodes == NULL) {
1694                 return;
1695         }
1696         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1697                 uint32_t pnn = force_rebalance_nodes[i];
1698                 if (pnn >= numnodes) {
1699                         DEBUG(DEBUG_ERR,
1700                               (__location__ "unknown node %u\n", pnn));
1701                         continue;
1702                 }
1703
1704                 DEBUG(DEBUG_NOTICE,
1705                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1706                 (*rebalance_candidates)[pnn] = true;
1707         }
1708 }
1709
1710 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1711  * the IP/node combination that will cost the least.
1712  */
1713 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1714                                      struct ctdb_ipflags *ipflags,
1715                                      struct ctdb_public_ip_list *all_ips,
1716                                      uint32_t *lcp2_imbalances)
1717 {
1718         struct ctdb_public_ip_list *tmp_ip;
1719         int dstnode, numnodes;
1720
1721         int minnode;
1722         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1723         struct ctdb_public_ip_list *minip;
1724
1725         bool should_loop = true;
1726         bool have_unassigned = true;
1727
1728         numnodes = talloc_array_length(ipflags);
1729
1730         while (have_unassigned && should_loop) {
1731                 should_loop = false;
1732
1733                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1734                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1735
1736                 minnode = -1;
1737                 mindsum = 0;
1738                 minip = NULL;
1739
1740                 /* loop over each unassigned ip. */
1741                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1742                         if (tmp_ip->pnn != -1) {
1743                                 continue;
1744                         }
1745
1746                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1747                                 /* only check nodes that can actually takeover this ip */
1748                                 if (!can_node_takeover_ip(ctdb, dstnode,
1749                                                           ipflags[dstnode],
1750                                                           tmp_ip)) {
1751                                         /* no it couldnt   so skip to the next node */
1752                                         continue;
1753                                 }
1754
1755                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1756                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1757                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1758                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1759                                                    dstnode,
1760                                                    dstimbl - lcp2_imbalances[dstnode]));
1761
1762
1763                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1764                                         minnode = dstnode;
1765                                         minimbl = dstimbl;
1766                                         mindsum = dstdsum;
1767                                         minip = tmp_ip;
1768                                         should_loop = true;
1769                                 }
1770                         }
1771                 }
1772
1773                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1774
1775                 /* If we found one then assign it to the given node. */
1776                 if (minnode != -1) {
1777                         minip->pnn = minnode;
1778                         lcp2_imbalances[minnode] = minimbl;
1779                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1780                                           ctdb_addr_to_str(&(minip->addr)),
1781                                           minnode,
1782                                           mindsum));
1783                 }
1784
1785                 /* There might be a better way but at least this is clear. */
1786                 have_unassigned = false;
1787                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1788                         if (tmp_ip->pnn == -1) {
1789                                 have_unassigned = true;
1790                         }
1791                 }
1792         }
1793
1794         /* We know if we have an unassigned addresses so we might as
1795          * well optimise.
1796          */
1797         if (have_unassigned) {
1798                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1799                         if (tmp_ip->pnn == -1) {
1800                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1801                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1802                         }
1803                 }
1804         }
1805 }
1806
1807 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1808  * to move IPs from, determines the best IP/destination node
1809  * combination to move from the source node.
1810  */
1811 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1812                                     struct ctdb_ipflags *ipflags,
1813                                     struct ctdb_public_ip_list *all_ips,
1814                                     int srcnode,
1815                                     uint32_t *lcp2_imbalances,
1816                                     bool *rebalance_candidates)
1817 {
1818         int dstnode, mindstnode, numnodes;
1819         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1820         uint32_t minsrcimbl, mindstimbl;
1821         struct ctdb_public_ip_list *minip;
1822         struct ctdb_public_ip_list *tmp_ip;
1823
1824         /* Find an IP and destination node that best reduces imbalance. */
1825         srcimbl = 0;
1826         minip = NULL;
1827         minsrcimbl = 0;
1828         mindstnode = -1;
1829         mindstimbl = 0;
1830
1831         numnodes = talloc_array_length(ipflags);
1832
1833         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1834         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1835                            srcnode, lcp2_imbalances[srcnode]));
1836
1837         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1838                 /* Only consider addresses on srcnode. */
1839                 if (tmp_ip->pnn != srcnode) {
1840                         continue;
1841                 }
1842
1843                 /* What is this IP address costing the source node? */
1844                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1845                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1846
1847                 /* Consider this IP address would cost each potential
1848                  * destination node.  Destination nodes are limited to
1849                  * those that are newly healthy, since we don't want
1850                  * to do gratuitous failover of IPs just to make minor
1851                  * balance improvements.
1852                  */
1853                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1854                         if (!rebalance_candidates[dstnode]) {
1855                                 continue;
1856                         }
1857
1858                         /* only check nodes that can actually takeover this ip */
1859                         if (!can_node_takeover_ip(ctdb, dstnode,
1860                                                   ipflags[dstnode], tmp_ip)) {
1861                                 /* no it couldnt   so skip to the next node */
1862                                 continue;
1863                         }
1864
1865                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1866                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1867                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1868                                            srcnode, -srcdsum,
1869                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1870                                            dstnode, dstdsum));
1871
1872                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1873                             (dstdsum < srcdsum) &&                      \
1874                             ((mindstnode == -1) ||                              \
1875                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1876
1877                                 minip = tmp_ip;
1878                                 minsrcimbl = srcimbl;
1879                                 mindstnode = dstnode;
1880                                 mindstimbl = dstimbl;
1881                         }
1882                 }
1883         }
1884         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1885
1886         if (mindstnode != -1) {
1887                 /* We found a move that makes things better... */
1888                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1889                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1890                                   ctdb_addr_to_str(&(minip->addr)),
1891                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1892
1893
1894                 lcp2_imbalances[srcnode] = minsrcimbl;
1895                 lcp2_imbalances[mindstnode] = mindstimbl;
1896                 minip->pnn = mindstnode;
1897
1898                 return true;
1899         }
1900
1901         return false;
1902         
1903 }
1904
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.
 * Orders entries by descending imbalance: returns -1 when a has the
 * larger imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1923
1924 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1925  * node with the highest LCP2 imbalance, and then determines the best
1926  * IP/destination node combination to move from the source node.
1927  */
1928 static void lcp2_failback(struct ctdb_context *ctdb,
1929                           struct ctdb_ipflags *ipflags,
1930                           struct ctdb_public_ip_list *all_ips,
1931                           uint32_t *lcp2_imbalances,
1932                           bool *rebalance_candidates)
1933 {
1934         int i, numnodes;
1935         struct lcp2_imbalance_pnn * lips;
1936         bool again;
1937
1938         numnodes = talloc_array_length(ipflags);
1939
1940 try_again:
1941         /* Put the imbalances and nodes into an array, sort them and
1942          * iterate through candidates.  Usually the 1st one will be
1943          * used, so this doesn't cost much...
1944          */
1945         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1946         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1947         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1948         for (i=0; i<numnodes; i++) {
1949                 lips[i].imbalance = lcp2_imbalances[i];
1950                 lips[i].pnn = i;
1951                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1952         }
1953         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1954               lcp2_cmp_imbalance_pnn);
1955
1956         again = false;
1957         for (i=0; i<numnodes; i++) {
1958                 /* This means that all nodes had 0 or 1 addresses, so
1959                  * can't be imbalanced.
1960                  */
1961                 if (lips[i].imbalance == 0) {
1962                         break;
1963                 }
1964
1965                 if (lcp2_failback_candidate(ctdb,
1966                                             ipflags,
1967                                             all_ips,
1968                                             lips[i].pnn,
1969                                             lcp2_imbalances,
1970                                             rebalance_candidates)) {
1971                         again = true;
1972                         break;
1973                 }
1974         }
1975
1976         talloc_free(lips);
1977         if (again) {
1978                 goto try_again;
1979         }
1980 }
1981
1982 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1983                                     struct ctdb_ipflags *ipflags,
1984                                     struct ctdb_public_ip_list *all_ips)
1985 {
1986         struct ctdb_public_ip_list *tmp_ip;
1987
1988         /* verify that the assigned nodes can serve that public ip
1989            and set it to -1 if not
1990         */
1991         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1992                 if (tmp_ip->pnn == -1) {
1993                         continue;
1994                 }
1995                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1996                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1997                         /* this node can not serve this ip. */
1998                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1999                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2000                                            tmp_ip->pnn));
2001                         tmp_ip->pnn = -1;
2002                 }
2003         }
2004 }
2005
2006 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2007                                        struct ctdb_ipflags *ipflags,
2008                                        struct ctdb_public_ip_list *all_ips)
2009 {
2010         struct ctdb_public_ip_list *tmp_ip;
2011         int i, numnodes;
2012
2013         numnodes = talloc_array_length(ipflags);
2014
2015         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2016        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2017         *  always be allocated the same way for a specific set of
2018         *  available/unavailable nodes.
2019         */
2020
2021         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2022                 tmp_ip->pnn = i % numnodes;
2023         }
2024
2025         /* IP failback doesn't make sense with deterministic
2026          * IPs, since the modulo step above implicitly fails
2027          * back IPs to their "home" node.
2028          */
2029         if (1 == ctdb->tunable.no_ip_failback) {
2030                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2031         }
2032
2033         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2034
2035         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2036
2037         /* No failback here! */
2038 }
2039
2040 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2041                                           struct ctdb_ipflags *ipflags,
2042                                           struct ctdb_public_ip_list *all_ips)
2043 {
2044         /* This should be pushed down into basic_failback. */
2045         struct ctdb_public_ip_list *tmp_ip;
2046         int num_ips = 0;
2047         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2048                 num_ips++;
2049         }
2050
2051         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2052
2053         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2054
2055         /* If we don't want IPs to fail back then don't rebalance IPs. */
2056         if (1 == ctdb->tunable.no_ip_failback) {
2057                 return;
2058         }
2059
2060         /* Now, try to make sure the ip adresses are evenly distributed
2061            across the nodes.
2062         */
2063         basic_failback(ctdb, ipflags, all_ips, num_ips);
2064 }
2065
2066 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2067                           struct ctdb_ipflags *ipflags,
2068                           struct ctdb_public_ip_list *all_ips,
2069                           uint32_t *force_rebalance_nodes)
2070 {
2071         uint32_t *lcp2_imbalances;
2072         bool *rebalance_candidates;
2073         int numnodes, num_rebalance_candidates, i;
2074
2075         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2076
2077         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2078
2079         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2080                   &lcp2_imbalances, &rebalance_candidates);
2081
2082         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2083
2084         /* If we don't want IPs to fail back then don't rebalance IPs. */
2085         if (1 == ctdb->tunable.no_ip_failback) {
2086                 goto finished;
2087         }
2088
2089         /* It is only worth continuing if we have suitable target
2090          * nodes to transfer IPs to.  This check is much cheaper than
2091          * continuing on...
2092          */
2093         numnodes = talloc_array_length(ipflags);
2094         num_rebalance_candidates = 0;
2095         for (i=0; i<numnodes; i++) {
2096                 if (rebalance_candidates[i]) {
2097                         num_rebalance_candidates++;
2098                 }
2099         }
2100         if (num_rebalance_candidates == 0) {
2101                 goto finished;
2102         }
2103
2104         /* Now, try to make sure the ip adresses are evenly distributed
2105            across the nodes.
2106         */
2107         lcp2_failback(ctdb, ipflags, all_ips,
2108                       lcp2_imbalances, rebalance_candidates);
2109
2110 finished:
2111         talloc_free(tmp_ctx);
2112 }
2113
2114 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2115 {
2116         int i;
2117
2118         for (i=0;i<nodemap->num;i++) {
2119                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2120                         /* Found one completely healthy node */
2121                         return false;
2122                 }
2123         }
2124
2125         return true;
2126 }
2127
2128 /* The calculation part of the IP allocation algorithm. */
2129 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2130                                    struct ctdb_ipflags *ipflags,
2131                                    struct ctdb_public_ip_list **all_ips_p,
2132                                    uint32_t *force_rebalance_nodes)
2133 {
2134         /* since nodes only know about those public addresses that
2135            can be served by that particular node, no single node has
2136            a full list of all public addresses that exist in the cluster.
2137            Walk over all node structures and create a merged list of
2138            all public addresses that exist in the cluster.
2139
2140            keep the tree of ips around as ctdb->ip_tree
2141         */
2142         *all_ips_p = create_merged_ip_list(ctdb);
2143
2144         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2145                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2146         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2147                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2148         } else {
2149                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2150         }
2151
2152         /* at this point ->pnn is the node which will own each IP
2153            or -1 if there is no node that can cover this ip
2154         */
2155
2156         return;
2157 }
2158
/* State shared between get_tunable_from_nodes() and its per-node
 * callbacks while one tunable is fetched from a set of nodes.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages. */
        const char *tunable;
        /* talloc array indexed by PNN that receives each node's value. */
        uint32_t *out;
        /* Set by the callbacks when the result as a whole is unusable. */
        bool fatal;
};
2164
2165 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2166                                  int32_t res, TDB_DATA outdata,
2167                                  void *callback)
2168 {
2169         struct get_tunable_callback_data *cd =
2170                 (struct get_tunable_callback_data *)callback;
2171         int size;
2172
2173         if (res != 0) {
2174                 /* Already handled in fail callback */
2175                 return;
2176         }
2177
2178         if (outdata.dsize != sizeof(uint32_t)) {
2179                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2180                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2181                                  (int)outdata.dsize));
2182                 cd->fatal = true;
2183                 return;
2184         }
2185
2186         size = talloc_array_length(cd->out);
2187         if (pnn >= size) {
2188                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2189                                  cd->tunable, pnn, size));
2190                 return;
2191         }
2192
2193                 
2194         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2195 }
2196
2197 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2198                                        int32_t res, TDB_DATA outdata,
2199                                        void *callback)
2200 {
2201         struct get_tunable_callback_data *cd =
2202                 (struct get_tunable_callback_data *)callback;
2203
2204         switch (res) {
2205         case -ETIME:
2206                 DEBUG(DEBUG_ERR,
2207                       ("Timed out getting tunable \"%s\" from node %d\n",
2208                        cd->tunable, pnn));
2209                 cd->fatal = true;
2210                 break;
2211         case -EINVAL:
2212         case -1:
2213                 DEBUG(DEBUG_WARNING,
2214                       ("Tunable \"%s\" not implemented on node %d\n",
2215                        cd->tunable, pnn));
2216                 break;
2217         default:
2218                 DEBUG(DEBUG_ERR,
2219                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2220                        cd->tunable, pnn));
2221                 cd->fatal = true;
2222         }
2223 }
2224
2225 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2226                                         TALLOC_CTX *tmp_ctx,
2227                                         struct ctdb_node_map *nodemap,
2228                                         const char *tunable,
2229                                         uint32_t default_value)
2230 {
2231         TDB_DATA data;
2232         struct ctdb_control_get_tunable *t;
2233         uint32_t *nodes;
2234         uint32_t *tvals;
2235         struct get_tunable_callback_data callback_data;
2236         int i;
2237
2238         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2239         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2240         for (i=0; i<nodemap->num; i++) {
2241                 tvals[i] = default_value;
2242         }
2243                 
2244         callback_data.out = tvals;
2245         callback_data.tunable = tunable;
2246         callback_data.fatal = false;
2247
2248         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2249         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2250         t = (struct ctdb_control_get_tunable *)data.dptr;
2251         t->length = strlen(tunable)+1;
2252         memcpy(t->name, tunable, t->length);
2253         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2254         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2255                                       nodes, 0, TAKEOVER_TIMEOUT(),
2256                                       false, data,
2257                                       get_tunable_callback,
2258                                       get_tunable_fail_callback,
2259                                       &callback_data) != 0) {
2260                 if (callback_data.fatal) {
2261                         talloc_free(tvals);
2262                         tvals = NULL;
2263                 }
2264         }
2265         talloc_free(nodes);
2266         talloc_free(data.dptr);
2267
2268         return tvals;
2269 }
2270
/* State shared between get_runstate_from_nodes() and its per-node
 * callbacks while runstates are fetched from a set of nodes.
 */
struct get_runstate_callback_data {
        /* talloc array indexed by PNN that receives each node's runstate. */
        enum ctdb_runstate *out;
        /* Set by the callbacks when the result as a whole is unusable. */
        bool fatal;
};
2275
2276 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2277                                   int32_t res, TDB_DATA outdata,
2278                                   void *callback_data)
2279 {
2280         struct get_runstate_callback_data *cd =
2281                 (struct get_runstate_callback_data *)callback_data;
2282         int size;
2283
2284         if (res != 0) {
2285                 /* Already handled in fail callback */
2286                 return;
2287         }
2288
2289         if (outdata.dsize != sizeof(uint32_t)) {
2290                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2291                                  pnn, (int)sizeof(uint32_t),
2292                                  (int)outdata.dsize));
2293                 cd->fatal = true;
2294                 return;
2295         }
2296
2297         size = talloc_array_length(cd->out);
2298         if (pnn >= size) {
2299                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2300                                  pnn, size));
2301                 return;
2302         }
2303
2304         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2305 }
2306
2307 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2308                                        int32_t res, TDB_DATA outdata,
2309                                        void *callback)
2310 {
2311         struct get_runstate_callback_data *cd =
2312                 (struct get_runstate_callback_data *)callback;
2313
2314         switch (res) {
2315         case -ETIME:
2316                 DEBUG(DEBUG_ERR,
2317                       ("Timed out getting runstate from node %d\n", pnn));
2318                 cd->fatal = true;
2319                 break;
2320         default:
2321                 DEBUG(DEBUG_WARNING,
2322                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2323                        pnn));
2324         }
2325 }
2326
2327 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2328                                                     TALLOC_CTX *tmp_ctx,
2329                                                     struct ctdb_node_map *nodemap,
2330                                                     enum ctdb_runstate default_value)
2331 {
2332         uint32_t *nodes;
2333         enum ctdb_runstate *rs;
2334         struct get_runstate_callback_data callback_data;
2335         int i;
2336
2337         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2338         CTDB_NO_MEMORY_NULL(ctdb, rs);
2339         for (i=0; i<nodemap->num; i++) {
2340                 rs[i] = default_value;
2341         }
2342
2343         callback_data.out = rs;
2344         callback_data.fatal = false;
2345
2346         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2347         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2348                                       nodes, 0, TAKEOVER_TIMEOUT(),
2349                                       true, tdb_null,
2350                                       get_runstate_callback,
2351                                       get_runstate_fail_callback,
2352                                       &callback_data) != 0) {
2353                 if (callback_data.fatal) {
2354                         free(rs);
2355                         rs = NULL;
2356                 }
2357         }
2358         talloc_free(nodes);
2359
2360         return rs;
2361 }
2362
2363 /* Set internal flags for IP allocation:
2364  *   Clear ip flags
2365  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2366  *   Set NOIPHOST ip flag for each INACTIVE node
2367  *   if all nodes are disabled:
2368  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2369  *   else
2370  *     Set NOIPHOST ip flags for disabled nodes
2371  */
2372 static struct ctdb_ipflags *
2373 set_ipflags_internal(struct ctdb_context *ctdb,
2374                      TALLOC_CTX *tmp_ctx,
2375                      struct ctdb_node_map *nodemap,
2376                      uint32_t *tval_noiptakeover,
2377                      uint32_t *tval_noiphostonalldisabled,
2378                      enum ctdb_runstate *runstate)
2379 {
2380         int i;
2381         struct ctdb_ipflags *ipflags;
2382
2383         /* Clear IP flags - implicit due to talloc_zero */
2384         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2385         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2386
2387         for (i=0;i<nodemap->num;i++) {
2388                 /* Can not take IPs on node with NoIPTakeover set */
2389                 if (tval_noiptakeover[i] != 0) {
2390                         ipflags[i].noiptakeover = true;
2391                 }
2392
2393                 /* Can not host IPs on node not in RUNNING state */
2394                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2395                         ipflags[i].noiphost = true;
2396                         continue;
2397                 }
2398                 /* Can not host IPs on INACTIVE node */
2399                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2400                         ipflags[i].noiphost = true;
2401                 }
2402                 /* Remember the runstate */
2403                 ipflags[i].runstate = runstate[i];
2404         }
2405
2406         if (all_nodes_are_disabled(nodemap)) {
2407                 /* If all nodes are disabled, can not host IPs on node
2408                  * with NoIPHostOnAllDisabled set
2409                  */
2410                 for (i=0;i<nodemap->num;i++) {
2411                         if (tval_noiphostonalldisabled[i] != 0) {
2412                                 ipflags[i].noiphost = true;
2413                         }
2414                 }
2415         } else {
2416                 /* If some nodes are not disabled, then can not host
2417                  * IPs on DISABLED node
2418                  */
2419                 for (i=0;i<nodemap->num;i++) {
2420                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2421                                 ipflags[i].noiphost = true;
2422                         }
2423                 }
2424         }
2425
2426         return ipflags;
2427 }
2428
2429 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2430                                         TALLOC_CTX *tmp_ctx,
2431                                         struct ctdb_node_map *nodemap)
2432 {
2433         uint32_t *tval_noiptakeover;
2434         uint32_t *tval_noiphostonalldisabled;
2435         struct ctdb_ipflags *ipflags;
2436         enum ctdb_runstate *runstate;
2437
2438
2439         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2440                                                    "NoIPTakeover", 0);
2441         if (tval_noiptakeover == NULL) {
2442                 return NULL;
2443         }
2444
2445         tval_noiphostonalldisabled =
2446                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2447                                        "NoIPHostOnAllDisabled", 0);
2448         if (tval_noiphostonalldisabled == NULL) {
2449                 /* Caller frees tmp_ctx */
2450                 return NULL;
2451         }
2452
2453         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2454          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2455          * reasonable behaviour on a mixed cluster during upgrade.
2456          */
2457         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2458                                            CTDB_RUNSTATE_RUNNING);
2459         if (runstate == NULL) {
2460                 /* Caller frees tmp_ctx */
2461                 return NULL;
2462         }
2463
2464         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2465                                        tval_noiptakeover,
2466                                        tval_noiphostonalldisabled,
2467                                        runstate);
2468
2469         talloc_free(tval_noiptakeover);
2470         talloc_free(tval_noiphostonalldisabled);
2471         talloc_free(runstate);
2472
2473         return ipflags;
2474 }
2475
2476 struct iprealloc_callback_data {
2477         bool *retry_nodes;
2478         int retry_count;
2479         client_async_callback fail_callback;
2480         void *fail_callback_data;
2481         struct ctdb_node_map *nodemap;
2482 };
2483
2484 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2485                                         int32_t res, TDB_DATA outdata,
2486                                         void *callback)
2487 {
2488         int numnodes;
2489         struct iprealloc_callback_data *cd =
2490                 (struct iprealloc_callback_data *)callback;
2491
2492         numnodes = talloc_array_length(cd->retry_nodes);
2493         if (pnn > numnodes) {
2494                 DEBUG(DEBUG_ERR,
2495                       ("ipreallocated failure from node %d, "
2496                        "but only %d nodes in nodemap\n",
2497                        pnn, numnodes));
2498                 return;
2499         }
2500
2501         /* Can't run the "ipreallocated" event on a INACTIVE node */
2502         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2503                 DEBUG(DEBUG_WARNING,
2504                       ("ipreallocated failed on inactive node %d, ignoring\n",
2505                        pnn));
2506                 return;
2507         }
2508
2509         switch (res) {
2510         case -ETIME:
2511                 /* If the control timed out then that's a real error,
2512                  * so call the real fail callback
2513                  */
2514                 if (cd->fail_callback) {
2515                         cd->fail_callback(ctdb, pnn, res, outdata,
2516                                           cd->fail_callback_data);
2517                 } else {
2518                         DEBUG(DEBUG_WARNING,
2519                               ("iprealloc timed out but no callback registered\n"));
2520                 }
2521                 break;
2522         default:
2523                 /* If not a timeout then either the ipreallocated
2524                  * eventscript (or some setup) failed.  This might
2525                  * have failed because the IPREALLOCATED control isn't
2526                  * implemented - right now there is no way of knowing
2527                  * because the error codes are all folded down to -1.
2528                  * Consider retrying using EVENTSCRIPT control...
2529                  */
2530                 DEBUG(DEBUG_WARNING,
2531                       ("ipreallocated failure from node %d, flagging retry\n",
2532                        pnn));
2533                 cd->retry_nodes[pnn] = true;
2534                 cd->retry_count++;
2535         }
2536 }
2537
2538 struct takeover_callback_data {
2539         bool *node_failed;
2540         client_async_callback fail_callback;
2541         void *fail_callback_data;
2542         struct ctdb_node_map *nodemap;
2543 };
2544
2545 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2546                                        uint32_t node_pnn, int32_t res,
2547                                        TDB_DATA outdata, void *callback_data)
2548 {
2549         struct takeover_callback_data *cd =
2550                 talloc_get_type_abort(callback_data,
2551                                       struct takeover_callback_data);
2552         int i;
2553
2554         for (i = 0; i < cd->nodemap->num; i++) {
2555                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2556                         break;
2557                 }
2558         }
2559
2560         if (i == cd->nodemap->num) {
2561                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2562                 return;
2563         }
2564
2565         if (!cd->node_failed[i]) {
2566                 cd->node_failed[i] = true;
2567                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2568                                   cd->fail_callback_data);
2569         }
2570 }
2571
2572 /*
2573   make any IP alias changes for public addresses that are necessary 
2574  */
2575 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2576                       uint32_t *force_rebalance_nodes,
2577                       client_async_callback fail_callback, void *callback_data)
2578 {
2579         int i, j, ret;
2580         struct ctdb_public_ip ip;
2581         uint32_t *nodes;
2582         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2583         TDB_DATA data;
2584         struct timeval timeout;
2585         struct client_async_data *async_data;
2586         struct ctdb_client_control_state *state;
2587         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2588         struct ctdb_ipflags *ipflags;
2589         struct takeover_callback_data *takeover_data;
2590         struct iprealloc_callback_data iprealloc_data;
2591         bool *retry_data;
2592         bool can_host_ips;
2593
2594         /*
2595          * ip failover is completely disabled, just send out the 
2596          * ipreallocated event.
2597          */
2598         if (ctdb->tunable.disable_ip_failover != 0) {
2599                 goto ipreallocated;
2600         }
2601
2602         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2603         if (ipflags == NULL) {
2604                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2605                 talloc_free(tmp_ctx);
2606                 return -1;
2607         }
2608
2609         /* Short-circuit IP allocation if no nodes are in the RUNNING
2610          * runstate yet, since no nodes will be able to host IPs */
2611         can_host_ips = false;
2612         for (i=0; i<nodemap->num; i++) {
2613                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2614                         can_host_ips = true;
2615                 }
2616         }
2617         if (!can_host_ips) {
2618                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2619                 return 0;
2620         }
2621
2622         /* Do the IP reassignment calculations */
2623         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2624
2625         /* Now tell all nodes to release any public IPs should not
2626          * host.  This will be a NOOP on nodes that don't currently
2627          * hold the given IP.
2628          */
2629         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2630         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2631
2632         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2633                                                        bool, nodemap->num);
2634         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2635         takeover_data->fail_callback = fail_callback;
2636         takeover_data->fail_callback_data = callback_data;
2637         takeover_data->nodemap = nodemap;
2638
2639         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2640         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2641
2642         async_data->fail_callback = takeover_run_fail_callback;
2643         async_data->callback_data = takeover_data;
2644
2645         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2646
2647         /* Send a RELEASE_IP to all nodes that should not be hosting
2648          * each IP.  For each IP, all but one of these will be
2649          * redundant.  However, the redundant ones are used to tell
2650          * nodes which node should be hosting the IP so that commands
2651          * like "ctdb ip" can display a particular nodes idea of who
2652          * is hosting what. */
2653         for (i=0;i<nodemap->num;i++) {
2654                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2655                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2656                         continue;
2657                 }
2658
2659                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2660                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2661                                 /* This node should be serving this
2662                                    vnn so dont tell it to release the ip
2663                                 */
2664                                 continue;
2665                         }
2666                         ip.pnn  = tmp_ip->pnn;
2667                         ip.addr = tmp_ip->addr;
2668
2669                         timeout = TAKEOVER_TIMEOUT();
2670                         data.dsize = sizeof(ip);
2671                         data.dptr  = (uint8_t *)&ip;
2672                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2673                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2674                                                   data, async_data,
2675                                                   &timeout, NULL);
2676                         if (state == NULL) {
2677                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2678                                 talloc_free(tmp_ctx);
2679                                 return -1;
2680                         }
2681
2682                         ctdb_client_async_add(async_data, state);
2683                 }
2684         }
2685         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2686                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2687                 talloc_free(tmp_ctx);
2688                 return -1;
2689         }
2690         talloc_free(async_data);
2691
2692
2693         /* For each IP, send a TAKOVER_IP to the node that should be
2694          * hosting it.  Many of these will often be redundant (since
2695          * the allocation won't have changed) but they can be useful
2696          * to recover from inconsistencies. */
2697         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2698         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2699
2700         async_data->fail_callback = fail_callback;
2701         async_data->callback_data = callback_data;
2702
2703         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2704                 if (tmp_ip->pnn == -1) {
2705                         /* this IP won't be taken over */
2706                         continue;
2707                 }
2708
2709                 ip.pnn  = tmp_ip->pnn;
2710                 ip.addr = tmp_ip->addr;
2711
2712                 timeout = TAKEOVER_TIMEOUT();
2713                 data.dsize = sizeof(ip);
2714                 data.dptr  = (uint8_t *)&ip;
2715                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2716                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2717                                           data, async_data, &timeout, NULL);
2718                 if (state == NULL) {
2719                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2720                         talloc_free(tmp_ctx);
2721                         return -1;
2722                 }
2723
2724                 ctdb_client_async_add(async_data, state);
2725         }
2726         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2727                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2728                 talloc_free(tmp_ctx);
2729                 return -1;
2730         }
2731
2732 ipreallocated:
2733         /*
2734          * Tell all nodes to run eventscripts to process the
2735          * "ipreallocated" event.  This can do a lot of things,
2736          * including restarting services to reconfigure them if public
2737          * IPs have moved.  Once upon a time this event only used to
2738          * update natgw.
2739          */
2740         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2741         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2742         iprealloc_data.retry_nodes = retry_data;
2743         iprealloc_data.retry_count = 0;
2744         iprealloc_data.fail_callback = fail_callback;
2745         iprealloc_data.fail_callback_data = callback_data;
2746         iprealloc_data.nodemap = nodemap;
2747
2748         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2749         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2750                                         nodes, 0, TAKEOVER_TIMEOUT(),
2751                                         false, tdb_null,
2752                                         NULL, iprealloc_fail_callback,
2753                                         &iprealloc_data);
2754         if (ret != 0) {
2755                 /* If the control failed then we should retry to any
2756                  * nodes flagged by iprealloc_fail_callback using the
2757                  * EVENTSCRIPT control.  This is a best-effort at
2758                  * backward compatiblity when running a mixed cluster
2759                  * where some nodes have not yet been upgraded to
2760                  * support the IPREALLOCATED control.
2761                  */
2762                 DEBUG(DEBUG_WARNING,
2763                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2764
2765                 nodes = talloc_array(tmp_ctx, uint32_t,
2766                                      iprealloc_data.retry_count);
2767                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2768
2769                 j = 0;
2770                 for (i=0; i<nodemap->num; i++) {
2771                         if (iprealloc_data.retry_nodes[i]) {
2772                                 nodes[j] = i;
2773                                 j++;
2774                         }
2775                 }
2776
2777                 data.dptr  = discard_const("ipreallocated");
2778                 data.dsize = strlen((char *)data.dptr) + 1; 
2779                 ret = ctdb_client_async_control(ctdb,
2780                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2781                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2782                                                 false, data,
2783                                                 NULL, fail_callback,
2784                                                 callback_data);
2785                 if (ret != 0) {
2786                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2787                 }
2788         }
2789
2790         talloc_free(tmp_ctx);
2791         return ret;
2792 }
2793
2794
2795 /*
2796   destroy a ctdb_client_ip structure
2797  */
2798 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2799 {
2800         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2801                 ctdb_addr_to_str(&ip->addr),
2802                 ntohs(ip->addr.ip.sin_port),
2803                 ip->client_id));
2804
2805         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2806         return 0;
2807 }
2808
2809 /*
2810   called by a client to inform us of a TCP connection that it is managing
2811   that should tickled with an ACK when IP takeover is done
2812  */
2813 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2814                                 TDB_DATA indata)
2815 {
2816         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2817         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2818         struct ctdb_tcp_list *tcp;
2819         struct ctdb_tcp_connection t;
2820         int ret;
2821         TDB_DATA data;
2822         struct ctdb_client_ip *ip;
2823         struct ctdb_vnn *vnn;
2824         ctdb_sock_addr addr;
2825
2826         /* If we don't have public IPs, tickles are useless */
2827         if (ctdb->vnn == NULL) {
2828                 return 0;
2829         }
2830
2831         tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2832
2833         addr = tcp_sock->src;
2834         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2835         addr = tcp_sock->dest;
2836         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2837
2838         ZERO_STRUCT(addr);
2839         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2840         vnn = find_public_ip_vnn(ctdb, &addr);
2841         if (vnn == NULL) {
2842                 switch (addr.sa.sa_family) {
2843                 case AF_INET:
2844                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2845                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2846                                         ctdb_addr_to_str(&addr)));
2847                         }
2848                         break;
2849                 case AF_INET6:
2850                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2851                                 ctdb_addr_to_str(&addr)));
2852                         break;
2853                 default:
2854                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2855                 }
2856
2857                 return 0;
2858         }
2859
2860         if (vnn->pnn != ctdb->pnn) {
2861                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2862                         ctdb_addr_to_str(&addr),
2863                         client_id, client->pid));
2864                 /* failing this call will tell smbd to die */
2865                 return -1;
2866         }
2867
2868         ip = talloc(client, struct ctdb_client_ip);
2869         CTDB_NO_MEMORY(ctdb, ip);
2870
2871         ip->ctdb      = ctdb;
2872         ip->addr      = addr;
2873         ip->client_id = client_id;
2874         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2875         DLIST_ADD(ctdb->client_ip_list, ip);
2876
2877         tcp = talloc(client, struct ctdb_tcp_list);
2878         CTDB_NO_MEMORY(ctdb, tcp);
2879
2880         tcp->connection.src_addr = tcp_sock->src;
2881         tcp->connection.dst_addr = tcp_sock->dest;
2882
2883         DLIST_ADD(client->tcp_list, tcp);
2884
2885         t.src_addr = tcp_sock->src;
2886         t.dst_addr = tcp_sock->dest;
2887
2888         data.dptr = (uint8_t *)&t;
2889         data.dsize = sizeof(t);
2890
2891         switch (addr.sa.sa_family) {
2892         case AF_INET:
2893                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2894                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2895                         ctdb_addr_to_str(&tcp_sock->src),
2896                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2897                 break;
2898         case AF_INET6:
2899                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2900                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2901                         ctdb_addr_to_str(&tcp_sock->src),
2902                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2903                 break;
2904         default:
2905                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2906         }
2907
2908
2909         /* tell all nodes about this tcp connection */
2910         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2911                                        CTDB_CONTROL_TCP_ADD,
2912                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2913         if (ret != 0) {
2914                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2915                 return -1;
2916         }
2917
2918         return 0;
2919 }
2920
2921 /*
2922   find a tcp address on a list
2923  */
2924 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2925                                            struct ctdb_tcp_connection *tcp)
2926 {
2927         int i;
2928
2929         if (array == NULL) {
2930                 return NULL;
2931         }
2932
2933         for (i=0;i<array->num;i++) {
2934                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2935                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2936                         return &array->connections[i];
2937                 }
2938         }
2939         return NULL;
2940 }
2941
2942
2943
2944 /*
2945   called by a daemon to inform us of a TCP connection that one of its
2946   clients managing that should tickled with an ACK when IP takeover is
2947   done
2948  */
2949 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2950 {
2951         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2952         struct ctdb_tcp_array *tcparray;
2953         struct ctdb_tcp_connection tcp;
2954         struct ctdb_vnn *vnn;
2955
2956         /* If we don't have public IPs, tickles are useless */
2957         if (ctdb->vnn == NULL) {
2958                 return 0;
2959         }
2960
2961         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2962         if (vnn == NULL) {
2963                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2964                         ctdb_addr_to_str(&p->dst_addr)));
2965
2966                 return -1;
2967         }
2968
2969
2970         tcparray = vnn->tcp_array;
2971
2972         /* If this is the first tickle */
2973         if (tcparray == NULL) {
2974                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2975                 CTDB_NO_MEMORY(ctdb, tcparray);
2976                 vnn->tcp_array = tcparray;
2977
2978                 tcparray->num = 0;
2979                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2980                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2981
2982                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2983                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2984                 tcparray->num++;
2985
2986                 if (tcp_update_needed) {
2987                         vnn->tcp_update_needed = true;
2988                 }
2989                 return 0;
2990         }
2991
2992
2993         /* Do we already have this tickle ?*/
2994         tcp.src_addr = p->src_addr;
2995         tcp.dst_addr = p->dst_addr;
2996         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2997                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2998                         ctdb_addr_to_str(&tcp.dst_addr),
2999                         ntohs(tcp.dst_addr.ip.sin_port),
3000                         vnn->pnn));
3001                 return 0;
3002         }
3003
3004         /* A new tickle, we must add it to the array */
3005         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3006                                         struct ctdb_tcp_connection,
3007                                         tcparray->num+1);
3008         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3009
3010         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3011         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3012         tcparray->num++;
3013
3014         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3015                 ctdb_addr_to_str(&tcp.dst_addr),
3016                 ntohs(tcp.dst_addr.ip.sin_port),
3017                 vnn->pnn));
3018
3019         if (tcp_update_needed) {
3020                 vnn->tcp_update_needed = true;
3021         }
3022
3023         return 0;
3024 }
3025
3026
3027 /*
3028   called by a daemon to inform us of a TCP connection that one of its
3029   clients managing that should tickled with an ACK when IP takeover is
3030   done
3031  */
3032 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3033 {
3034         struct ctdb_tcp_connection *tcpp;
3035         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3036
3037         if (vnn == NULL) {
3038                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3039                         ctdb_addr_to_str(&conn->dst_addr)));
3040                 return;
3041         }
3042
3043         /* if the array is empty we cant remove it
3044            and we dont need to do anything
3045          */
3046         if (vnn->tcp_array == NULL) {
3047                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3048                         ctdb_addr_to_str(&conn->dst_addr),
3049                         ntohs(conn->dst_addr.ip.sin_port)));
3050                 return;
3051         }
3052
3053
3054         /* See if we know this connection
3055            if we dont know this connection  then we dont need to do anything
3056          */
3057         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3058         if (tcpp == NULL) {
3059                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3060                         ctdb_addr_to_str(&conn->dst_addr),
3061                         ntohs(conn->dst_addr.ip.sin_port)));
3062                 return;
3063         }
3064
3065
3066         /* We need to remove this entry from the array.
3067            Instead of allocating a new array and copying data to it
3068            we cheat and just copy the last entry in the existing array
3069            to the entry that is to be removed and just shring the 
3070            ->num field
3071          */
3072         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3073         vnn->tcp_array->num--;
3074
3075         /* If we deleted the last entry we also need to remove the entire array
3076          */
3077         if (vnn->tcp_array->num == 0) {
3078                 talloc_free(vnn->tcp_array);
3079                 vnn->tcp_array = NULL;
3080         }               
3081
3082         vnn->tcp_update_needed = true;
3083
3084         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3085                 ctdb_addr_to_str(&conn->src_addr),
3086                 ntohs(conn->src_addr.ip.sin_port)));
3087 }
3088
3089
3090 /*
3091   called by a daemon to inform us of a TCP connection that one of its
3092   clients used are no longer needed in the tickle database
3093  */
3094 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3095 {
3096         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3097
3098         /* If we don't have public IPs, tickles are useless */
3099         if (ctdb->vnn == NULL) {
3100                 return 0;
3101         }
3102
3103         ctdb_remove_tcp_connection(ctdb, conn);
3104
3105         return 0;
3106 }
3107
3108
3109 /*
3110   Called when another daemon starts - causes all tickles for all
3111   public addresses we are serving to be sent to the new node on the
3112   next check.  This actually causes the next scheduled call to
3113   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3114   doesn't require careful error handling.
3115  */
3116 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3117 {
3118         struct ctdb_vnn *vnn;
3119
3120         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3121                            (unsigned long) pnn));
3122
3123         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3124                 vnn->tcp_update_needed = true;
3125         }
3126
3127         return 0;
3128 }
3129
3130
3131 /*
3132   called when a client structure goes away - hook to remove
3133   elements from the tcp_list in all daemons
3134  */
3135 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3136 {
3137         while (client->tcp_list) {
3138                 struct ctdb_tcp_list *tcp = client->tcp_list;
3139                 DLIST_REMOVE(client->tcp_list, tcp);
3140                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3141         }
3142 }
3143
3144
3145 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3146 {
3147         struct ctdb_vnn *vnn;
3148         int count = 0;
3149
3150         if (ctdb->tunable.disable_ip_failover == 1) {
3151                 return;
3152         }
3153
3154         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3155                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3156                         ctdb_vnn_unassign_iface(ctdb, vnn);
3157                         continue;
3158                 }
3159                 if (!vnn->iface) {
3160                         continue;
3161                 }
3162
3163                 /* Don't allow multiple releases at once.  Some code,
3164                  * particularly ctdb_tickle_sentenced_connections() is
3165                  * not re-entrant */
3166                 if (vnn->update_in_flight) {
3167                         DEBUG(DEBUG_WARNING,
3168                               (__location__
3169                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3170                                     ctdb_addr_to_str(&vnn->public_address),
3171                                     vnn->public_netmask_bits,
3172                                     ctdb_vnn_iface_string(vnn)));
3173                         continue;
3174                 }
3175                 vnn->update_in_flight = true;
3176
3177                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3178                                     ctdb_addr_to_str(&vnn->public_address),
3179                                     vnn->public_netmask_bits,
3180                                     ctdb_vnn_iface_string(vnn)));
3181
3182                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3183                                   ctdb_vnn_iface_string(vnn),
3184                                   ctdb_addr_to_str(&vnn->public_address),
3185                                   vnn->public_netmask_bits);
3186                 release_kill_clients(ctdb, &vnn->public_address);
3187                 ctdb_vnn_unassign_iface(ctdb, vnn);
3188                 vnn->update_in_flight = false;
3189                 count++;
3190         }
3191
3192         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3193 }
3194
3195
3196 /*
3197   get list of public IPs
3198  */
3199 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3200                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3201 {
3202         int i, num, len;
3203         struct ctdb_all_public_ips *ips;
3204         struct ctdb_vnn *vnn;
3205         bool only_available = false;
3206
3207         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3208                 only_available = true;
3209         }
3210
3211         /* count how many public ip structures we have */
3212         num = 0;
3213         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3214                 num++;
3215         }
3216
3217         len = offsetof(struct ctdb_all_public_ips, ips) + 
3218                 num*sizeof(struct ctdb_public_ip);
3219         ips = talloc_zero_size(outdata, len);
3220         CTDB_NO_MEMORY(ctdb, ips);
3221
3222         i = 0;
3223         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3224                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3225                         continue;
3226                 }
3227                 ips->ips[i].pnn  = vnn->pnn;
3228                 ips->ips[i].addr = vnn->public_address;
3229                 i++;
3230         }
3231         ips->num = i;
3232         len = offsetof(struct ctdb_all_public_ips, ips) +
3233                 i*sizeof(struct ctdb_public_ip);
3234
3235         outdata->dsize = len;
3236         outdata->dptr  = (uint8_t *)ips;
3237
3238         return 0;
3239 }
3240
3241
3242 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3243                                         struct ctdb_req_control *c,
3244                                         TDB_DATA indata,
3245                                         TDB_DATA *outdata)
3246 {
3247         int i, num, len;
3248         ctdb_sock_addr *addr;
3249         struct ctdb_control_public_ip_info *info;
3250         struct ctdb_vnn *vnn;
3251
3252         addr = (ctdb_sock_addr *)indata.dptr;
3253
3254         vnn = find_public_ip_vnn(ctdb, addr);
3255         if (vnn == NULL) {
3256                 /* if it is not a public ip   it could be our 'single ip' */
3257                 if (ctdb->single_ip_vnn) {
3258                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3259                                 vnn = ctdb->single_ip_vnn;
3260                         }
3261                 }
3262         }
3263         if (vnn == NULL) {
3264                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3265                                  "'%s'not a public address\n",
3266                                  ctdb_addr_to_str(addr)));
3267                 return -1;
3268         }
3269
3270         /* count how many public ip structures we have */
3271         num = 0;
3272         for (;vnn->ifaces[num];) {
3273                 num++;
3274         }
3275
3276         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3277                 num*sizeof(struct ctdb_control_iface_info);
3278         info = talloc_zero_size(outdata, len);
3279         CTDB_NO_MEMORY(ctdb, info);
3280
3281         info->ip.addr = vnn->public_address;
3282         info->ip.pnn = vnn->pnn;
3283         info->active_idx = 0xFFFFFFFF;
3284
3285         for (i=0; vnn->ifaces[i]; i++) {
3286                 struct ctdb_iface *cur;
3287
3288                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3289                 if (cur == NULL) {
3290                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3291                                            vnn->ifaces[i]));
3292                         return -1;
3293                 }
3294                 if (vnn->iface == cur) {
3295                         info->active_idx = i;
3296                 }
3297                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3298                 info->ifaces[i].link_state = cur->link_up;
3299                 info->ifaces[i].references = cur->references;
3300         }
3301         info->num = i;
3302         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3303                 i*sizeof(struct ctdb_control_iface_info);
3304
3305         outdata->dsize = len;
3306         outdata->dptr  = (uint8_t *)info;
3307
3308         return 0;
3309 }
3310
3311 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3312                                 struct ctdb_req_control *c,
3313                                 TDB_DATA *outdata)
3314 {
3315         int i, num, len;
3316         struct ctdb_control_get_ifaces *ifaces;
3317         struct ctdb_iface *cur;
3318
3319         /* count how many public ip structures we have */
3320         num = 0;
3321         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3322                 num++;
3323         }
3324
3325         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3326                 num*sizeof(struct ctdb_control_iface_info);
3327         ifaces = talloc_zero_size(outdata, len);
3328         CTDB_NO_MEMORY(ctdb, ifaces);
3329
3330         i = 0;
3331         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3332                 strcpy(ifaces->ifaces[i].name, cur->name);
3333                 ifaces->ifaces[i].link_state = cur->link_up;
3334                 ifaces->ifaces[i].references = cur->references;
3335                 i++;
3336         }
3337         ifaces->num = i;
3338         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3339                 i*sizeof(struct ctdb_control_iface_info);
3340
3341         outdata->dsize = len;
3342         outdata->dptr  = (uint8_t *)ifaces;
3343
3344         return 0;
3345 }
3346
3347 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3348                                     struct ctdb_req_control *c,
3349                                     TDB_DATA indata)
3350 {
3351         struct ctdb_control_iface_info *info;
3352         struct ctdb_iface *iface;
3353         bool link_up = false;
3354
3355         info = (struct ctdb_control_iface_info *)indata.dptr;
3356
3357         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3358                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3359                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3360                                   len, len, info->name));
3361                 return -1;
3362         }
3363
3364         switch (info->link_state) {
3365         case 0:
3366                 link_up = false;
3367                 break;
3368         case 1:
3369                 link_up = true;
3370                 break;
3371         default:
3372                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3373                                   (unsigned int)info->link_state));
3374                 return -1;
3375         }
3376
3377         if (info->references != 0) {
3378                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3379                                   (unsigned int)info->references));
3380                 return -1;
3381         }
3382
3383         iface = ctdb_find_iface(ctdb, info->name);
3384         if (iface == NULL) {
3385                 return -1;
3386         }
3387
3388         if (link_up == iface->link_up) {
3389                 return 0;
3390         }
3391
3392         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3393               ("iface[%s] has changed it's link status %s => %s\n",
3394                iface->name,
3395                iface->link_up?"up":"down",
3396                link_up?"up":"down"));
3397
3398         iface->link_up = link_up;
3399         return 0;
3400 }
3401
3402
3403 /* 
3404    structure containing the listening socket and the list of tcp connections
3405    that the ctdb daemon is to kill
3406 */
struct ctdb_kill_tcp {
        /* public address whose connections are being killed;
           vnn->killtcp points back at this structure */
        struct ctdb_vnn *vnn;
        /* daemon context; its event context drives the fd and timer events */
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(),
           -1 while it has not been opened */
        int capture_fd;
        /* read event on capture_fd, handled by capture_tcp_handler() */
        struct tevent_fd *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque state filled in by ctdb_sys_open_capture_socket() and
           passed to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3415
3416 /*
3417   a tcp connection that is to be killed
3418  */
struct ctdb_killtcp_con {
        /* the two endpoints of the connection, stored after
           ctdb_canonicalize_ip() */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of periodic tickles sent so far; the connection is
           given up on once this reaches 5 */
        int count;
        /* owning kill structure (also the talloc parent at creation) */
        struct ctdb_kill_tcp *killtcp;
};
3425
3426 /* this function is used to create a key to represent this socketpair
3427    in the killtcp tree.
3428    this key is used to insert and lookup matching socketpairs that are
3429    to be tickled and RST
3430 */
3431 #define KILLTCP_KEYLEN  10
3432 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3433 {
3434         static uint32_t key[KILLTCP_KEYLEN];
3435
3436         bzero(key, sizeof(key));
3437
3438         if (src->sa.sa_family != dst->sa.sa_family) {
3439                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3440                 return key;
3441         }
3442         
3443         switch (src->sa.sa_family) {
3444         case AF_INET:
3445                 key[0]  = dst->ip.sin_addr.s_addr;
3446                 key[1]  = src->ip.sin_addr.s_addr;
3447                 key[2]  = dst->ip.sin_port;
3448                 key[3]  = src->ip.sin_port;
3449                 break;
3450         case AF_INET6: {
3451                 uint32_t *dst6_addr32 =
3452                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3453                 uint32_t *src6_addr32 =
3454                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3455                 key[0]  = dst6_addr32[3];
3456                 key[1]  = src6_addr32[3];
3457                 key[2]  = dst6_addr32[2];
3458                 key[3]  = src6_addr32[2];
3459                 key[4]  = dst6_addr32[1];
3460                 key[5]  = src6_addr32[1];
3461                 key[6]  = dst6_addr32[0];
3462                 key[7]  = src6_addr32[0];
3463                 key[8]  = dst->ip6.sin6_port;
3464                 key[9]  = src->ip6.sin6_port;
3465                 break;
3466         }
3467         default:
3468                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3469                 return key;
3470         }
3471
3472         return key;
3473 }
3474
3475 /*
3476   called when we get a read event on the raw socket
3477  */
3478 static void capture_tcp_handler(struct tevent_context *ev,
3479                                 struct tevent_fd *fde,
3480                                 uint16_t flags, void *private_data)
3481 {
3482         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3483         struct ctdb_killtcp_con *con;
3484         ctdb_sock_addr src, dst;
3485         uint32_t ack_seq, seq;
3486
3487         if (!(flags & TEVENT_FD_READ)) {
3488                 return;
3489         }
3490
3491         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3492                                 killtcp->private_data,
3493                                 &src, &dst,
3494                                 &ack_seq, &seq) != 0) {
3495                 /* probably a non-tcp ACK packet */
3496                 return;
3497         }
3498
3499         /* check if we have this guy in our list of connections
3500            to kill
3501         */
3502         con = trbt_lookuparray32(killtcp->connections, 
3503                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3504         if (con == NULL) {
3505                 /* no this was some other packet we can just ignore */
3506                 return;
3507         }
3508
3509         /* This one has been tickled !
3510            now reset him and remove him from the list.
3511          */
3512         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3513                 ntohs(con->dst_addr.ip.sin_port),
3514                 ctdb_addr_to_str(&con->src_addr),
3515                 ntohs(con->src_addr.ip.sin_port)));
3516
3517         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3518         talloc_free(con);
3519 }
3520
3521
3522 /* when traversing the list of all tcp connections to send tickle acks to
3523    (so that we can capture the ack coming back and kill the connection
3524     by a RST)
3525    this callback is called for each connection we are currently trying to kill
3526 */
3527 static int tickle_connection_traverse(void *param, void *data)
3528 {
3529         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3530
3531         /* have tried too many times, just give up */
3532         if (con->count >= 5) {
3533                 /* can't delete in traverse: reparent to delete_cons */
3534                 talloc_steal(param, con);
3535                 return 0;
3536         }
3537
3538         /* othervise, try tickling it again */
3539         con->count++;
3540         ctdb_sys_send_tcp(
3541                 (ctdb_sock_addr *)&con->dst_addr,
3542                 (ctdb_sock_addr *)&con->src_addr,
3543                 0, 0, 0);
3544         return 0;
3545 }
3546
3547
3548 /* 
3549    called every second until all sentenced connections have been reset
3550  */
3551 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3552                                               struct tevent_timer *te,
3553                                               struct timeval t, void *private_data)
3554 {
3555         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3556         void *delete_cons = talloc_new(NULL);
3557
3558         /* loop over all connections sending tickle ACKs */
3559         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3560
3561         /* now we've finished traverse, it's safe to do deletion. */
3562         talloc_free(delete_cons);
3563
3564         /* If there are no more connections to kill we can remove the
3565            entire killtcp structure
3566          */
3567         if ( (killtcp->connections == NULL) || 
3568              (killtcp->connections->root == NULL) ) {
3569                 talloc_free(killtcp);
3570                 return;
3571         }
3572
3573         /* try tickling them again in a seconds time
3574          */
3575         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3576                          timeval_current_ofs(1, 0),
3577                          ctdb_tickle_sentenced_connections, killtcp);
3578 }
3579
3580 /*
3581   destroy the killtcp structure
3582  */
3583 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3584 {
3585         struct ctdb_vnn *tmpvnn;
3586
3587         /* verify that this vnn is still active */
3588         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3589                 if (tmpvnn == killtcp->vnn) {
3590                         break;
3591                 }
3592         }
3593
3594         if (tmpvnn == NULL) {
3595                 return 0;
3596         }
3597
3598         if (killtcp->vnn->killtcp != killtcp) {
3599                 return 0;
3600         }
3601
3602         killtcp->vnn->killtcp = NULL;
3603
3604         return 0;
3605 }
3606
3607
/* Insert callback used with trbt_insertarray32_callback(): always
   hands back the new connection structure (parm), so any entry that
   was already stored under the key (data) is replaced
   unconditionally.

   The old entry is deliberately not freed here.  According to the
   original author's note it is talloc_stolen by the same tree node
   and is deleted together with the new data.
   NOTE(review): confirm against the rb_tree implementation.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;     /* the previous entry, intentionally ignored */

        return parm;
}
3619
3620 /*
3621   add a tcp socket to the list of connections we want to RST
3622  */
3623 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3624                                        ctdb_sock_addr *s,
3625                                        ctdb_sock_addr *d)
3626 {
3627         ctdb_sock_addr src, dst;
3628         struct ctdb_kill_tcp *killtcp;
3629         struct ctdb_killtcp_con *con;
3630         struct ctdb_vnn *vnn;
3631
3632         ctdb_canonicalize_ip(s, &src);
3633         ctdb_canonicalize_ip(d, &dst);
3634
3635         vnn = find_public_ip_vnn(ctdb, &dst);
3636         if (vnn == NULL) {
3637                 vnn = find_public_ip_vnn(ctdb, &src);
3638         }
3639         if (vnn == NULL) {
3640                 /* if it is not a public ip   it could be our 'single ip' */
3641                 if (ctdb->single_ip_vnn) {
3642                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3643                                 vnn = ctdb->single_ip_vnn;
3644                         }
3645                 }
3646         }
3647         if (vnn == NULL) {
3648                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3649                 return -1;
3650         }
3651
3652         killtcp = vnn->killtcp;
3653         
3654         /* If this is the first connection to kill we must allocate
3655            a new structure
3656          */
3657         if (killtcp == NULL) {
3658                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3659                 CTDB_NO_MEMORY(ctdb, killtcp);
3660
3661                 killtcp->vnn         = vnn;
3662                 killtcp->ctdb        = ctdb;
3663                 killtcp->capture_fd  = -1;
3664                 killtcp->connections = trbt_create(killtcp, 0);
3665
3666                 vnn->killtcp         = killtcp;
3667                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3668         }
3669
3670
3671
3672         /* create a structure that describes this connection we want to
3673            RST and store it in killtcp->connections
3674         */
3675         con = talloc(killtcp, struct ctdb_killtcp_con);
3676         CTDB_NO_MEMORY(ctdb, con);
3677         con->src_addr = src;
3678         con->dst_addr = dst;
3679         con->count    = 0;
3680         con->killtcp  = killtcp;
3681
3682
3683         trbt_insertarray32_callback(killtcp->connections,
3684                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3685                         add_killtcp_callback, con);
3686
3687         /* 
3688            If we dont have a socket to listen on yet we must create it
3689          */
3690         if (killtcp->capture_fd == -1) {
3691                 const char *iface = ctdb_vnn_iface_string(vnn);
3692                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3693                 if (killtcp->capture_fd == -1) {
3694                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3695                                           "socket on iface '%s' for killtcp (%s)\n",
3696                                           iface, strerror(errno)));
3697                         goto failed;
3698                 }
3699         }
3700
3701
3702         if (killtcp->fde == NULL) {
3703                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3704                                              killtcp->capture_fd,
3705                                              TEVENT_FD_READ,
3706                                              capture_tcp_handler, killtcp);
3707                 tevent_fd_set_auto_close(killtcp->fde);
3708
3709                 /* We also need to set up some events to tickle all these connections
3710                    until they are all reset
3711                 */
3712                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3713                                  ctdb_tickle_sentenced_connections, killtcp);
3714         }
3715
3716         /* tickle him once now */
3717         ctdb_sys_send_tcp(
3718                 &con->dst_addr,
3719                 &con->src_addr,
3720                 0, 0, 0);
3721
3722         return 0;
3723
3724 failed:
3725         talloc_free(vnn->killtcp);
3726         vnn->killtcp = NULL;
3727         return -1;
3728 }
3729
3730 /*
3731   kill a TCP connection.
3732  */
3733 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3734 {
3735         struct ctdb_tcp_connection *killtcp = (struct ctdb_tcp_connection *)indata.dptr;
3736
3737         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3738 }
3739
3740 /*
3741   called by a daemon to inform us of the entire list of TCP tickles for
3742   a particular public address.
3743   this control should only be sent by the node that is currently serving
3744   that public address.
3745  */
3746 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3747 {
3748         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3749         struct ctdb_tcp_array *tcparray;
3750         struct ctdb_vnn *vnn;
3751
3752         /* We must at least have tickles.num or else we cant verify the size
3753            of the received data blob
3754          */
3755         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3756                                         tickles.connections)) {
3757                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3758                 return -1;
3759         }
3760
3761         /* verify that the size of data matches what we expect */
3762         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3763                                 tickles.connections)
3764                          + sizeof(struct ctdb_tcp_connection)
3765                                  * list->tickles.num) {
3766                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3767                 return -1;
3768         }
3769
3770         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3771                            ctdb_addr_to_str(&list->addr)));
3772
3773         vnn = find_public_ip_vnn(ctdb, &list->addr);
3774         if (vnn == NULL) {
3775                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3776                         ctdb_addr_to_str(&list->addr)));
3777
3778                 return 1;
3779         }
3780
3781         /* remove any old ticklelist we might have */
3782         talloc_free(vnn->tcp_array);
3783         vnn->tcp_array = NULL;
3784
3785         tcparray = talloc(vnn, struct ctdb_tcp_array);
3786         CTDB_NO_MEMORY(ctdb, tcparray);
3787
3788         tcparray->num = list->tickles.num;
3789
3790         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3791         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3792
3793         memcpy(tcparray->connections, &list->tickles.connections[0],
3794                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3795
3796         /* We now have a new fresh tickle list array for this vnn */
3797         vnn->tcp_array = tcparray;
3798
3799         return 0;
3800 }
3801
3802 /*
3803   called to return the full list of tickles for the puclic address associated 
3804   with the provided vnn
3805  */
3806 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3807 {
3808         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3809         struct ctdb_control_tcp_tickle_list *list;
3810         struct ctdb_tcp_array *tcparray;
3811         int num;
3812         struct ctdb_vnn *vnn;
3813
3814         vnn = find_public_ip_vnn(ctdb, addr);
3815         if (vnn == NULL) {
3816                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3817                         ctdb_addr_to_str(addr)));
3818
3819                 return 1;
3820         }
3821
3822         tcparray = vnn->tcp_array;
3823         if (tcparray) {
3824                 num = tcparray->num;
3825         } else {
3826                 num = 0;
3827         }
3828
3829         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3830                                 tickles.connections)
3831                         + sizeof(struct ctdb_tcp_connection) * num;
3832
3833         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3834         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3835         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3836
3837         list->addr = *addr;
3838         list->tickles.num = num;
3839         if (num) {
3840                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3841                         sizeof(struct ctdb_tcp_connection) * num);
3842         }
3843
3844         return 0;
3845 }
3846
3847
3848 /*
3849   set the list of all tcp tickles for a public address
3850  */
3851 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3852                                             ctdb_sock_addr *addr,
3853                                             struct ctdb_tcp_array *tcparray)
3854 {
3855         int ret, num;
3856         TDB_DATA data;
3857         struct ctdb_control_tcp_tickle_list *list;
3858
3859         if (tcparray) {
3860                 num = tcparray->num;
3861         } else {
3862                 num = 0;
3863         }
3864
3865         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3866                                 tickles.connections) +
3867                         sizeof(struct ctdb_tcp_connection) * num;
3868         data.dptr = talloc_size(ctdb, data.dsize);
3869         CTDB_NO_MEMORY(ctdb, data.dptr);
3870
3871         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3872         list->addr = *addr;
3873         list->tickles.num = num;
3874         if (tcparray) {
3875                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3876         }
3877
3878         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3879                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3880                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3881         if (ret != 0) {
3882                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3883                 return -1;
3884         }
3885
3886         talloc_free(data.dptr);
3887
3888         return ret;
3889 }
3890
3891
3892 /*
3893   perform tickle updates if required
3894  */
3895 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3896                                     struct tevent_timer *te,
3897                                     struct timeval t, void *private_data)
3898 {
3899         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3900         int ret;
3901         struct ctdb_vnn *vnn;
3902
3903         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3904                 /* we only send out updates for public addresses that 
3905                    we have taken over
3906                  */
3907                 if (ctdb->pnn != vnn->pnn) {
3908                         continue;
3909                 }
3910                 /* We only send out the updates if we need to */
3911                 if (!vnn->tcp_update_needed) {
3912                         continue;
3913                 }
3914                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3915                                                        &vnn->public_address,
3916                                                        vnn->tcp_array);
3917                 if (ret != 0) {
3918                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3919                                 ctdb_addr_to_str(&vnn->public_address)));
3920                 } else {
3921                         DEBUG(DEBUG_INFO,
3922                               ("Sent tickle update for public address %s\n",
3923                                ctdb_addr_to_str(&vnn->public_address)));
3924                         vnn->tcp_update_needed = false;
3925                 }
3926         }
3927
3928         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3929                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3930                          ctdb_update_tcp_tickles, ctdb);
3931 }
3932
3933 /*
3934   start periodic update of tcp tickles
3935  */
3936 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3937 {
3938         ctdb->tickle_update_context = talloc_new(ctdb);
3939
3940         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3941                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3942                          ctdb_update_tcp_tickles, ctdb);
3943 }
3944
3945
3946
3947
3948 struct control_gratious_arp {
3949         struct ctdb_context *ctdb;
3950         ctdb_sock_addr addr;
3951         const char *iface;
3952         int count;
3953 };
3954
3955 /*
3956   send a control_gratuitous arp
3957  */
3958 static void send_gratious_arp(struct tevent_context *ev,
3959                               struct tevent_timer *te,
3960                               struct timeval t, void *private_data)
3961 {
3962         int ret;
3963         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3964                                                         struct control_gratious_arp);
3965
3966         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3967         if (ret != 0) {
3968                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3969                                  arp->iface, strerror(errno)));
3970         }
3971
3972
3973         arp->count++;
3974         if (arp->count == CTDB_ARP_REPEAT) {
3975                 talloc_free(arp);
3976                 return;
3977         }
3978
3979         tevent_add_timer(arp->ctdb->ev, arp,
3980                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3981                          send_gratious_arp, arp);
3982 }
3983
3984
3985 /*
3986   send a gratious arp 
3987  */
3988 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3989 {
3990         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3991         struct control_gratious_arp *arp;
3992
3993         /* verify the size of indata */
3994         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3995                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3996                                  (unsigned)indata.dsize, 
3997                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3998                 return -1;
3999         }
4000         if (indata.dsize != 
4001                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4002                 + gratious_arp->len ) ){
4003
4004                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4005                         "but should be %u bytes\n", 
4006                          (unsigned)indata.dsize, 
4007                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4008                 return -1;
4009         }
4010
4011
4012         arp = talloc(ctdb, struct control_gratious_arp);
4013         CTDB_NO_MEMORY(ctdb, arp);
4014
4015         arp->ctdb  = ctdb;
4016         arp->addr   = gratious_arp->addr;
4017         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4018         CTDB_NO_MEMORY(ctdb, arp->iface);
4019         arp->count = 0;
4020
4021         tevent_add_timer(arp->ctdb->ev, arp,
4022                          timeval_zero(), send_gratious_arp, arp);
4023
4024         return 0;
4025 }
4026
4027 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4028 {
4029         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4030         int ret;
4031
4032         /* verify the size of indata */
4033         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4034                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4035                 return -1;
4036         }
4037         if (indata.dsize != 
4038                 ( offsetof(struct ctdb_control_ip_iface, iface)
4039                 + pub->len ) ){
4040
4041                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4042                         "but should be %u bytes\n", 
4043                          (unsigned)indata.dsize, 
4044                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4045                 return -1;
4046         }
4047
4048         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4049
4050         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4051
4052         if (ret != 0) {
4053                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4054                 return -1;
4055         }
4056
4057         return 0;
4058 }
4059
/* Private data handed to delete_ip_callback(). */
struct delete_ip_callback_state {
        /* the control request that is answered when the callback runs */
        struct ctdb_req_control *c;
};
4063
4064 /*
4065   called when releaseip event finishes for del_public_address
4066  */
4067 static void delete_ip_callback(struct ctdb_context *ctdb,
4068                                int32_t status, TDB_DATA data,
4069                                const char *errormsg,
4070                                void *private_data)
4071 {
4072         struct delete_ip_callback_state *state =
4073                 talloc_get_type(private_data, struct delete_ip_callback_state);
4074
4075         /* If release failed then fail. */
4076         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4077         talloc_free(private_data);
4078 }
4079
4080 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4081                                         struct ctdb_req_control *c,
4082                                         TDB_DATA indata, bool *async_reply)
4083 {
4084         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4085         struct ctdb_vnn *vnn;
4086
4087         /* verify the size of indata */
4088         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4089                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4090                 return -1;
4091         }
4092         if (indata.dsize != 
4093                 ( offsetof(struct ctdb_control_ip_iface, iface)
4094                 + pub->len ) ){
4095
4096                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4097                         "but should be %u bytes\n", 
4098                          (unsigned)indata.dsize, 
4099                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4100                 return -1;
4101         }
4102
4103         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4104
4105         /* walk over all public addresses until we find a match */
4106         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4107                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4108                         if (vnn->pnn == ctdb->pnn) {
4109                                 struct delete_ip_callback_state *state;
4110                                 struct ctdb_public_ip *ip;
4111                                 TDB_DATA data;
4112                                 int ret;
4113
4114                                 vnn->delete_pending = true;
4115
4116                                 state = talloc(ctdb,
4117                                                struct delete_ip_callback_state);
4118                                 CTDB_NO_MEMORY(ctdb, state);
4119                                 state->c = c;
4120
4121                                 ip = talloc(state, struct ctdb_public_ip);
4122                                 if (ip == NULL) {
4123                                         DEBUG(DEBUG_ERR,
4124                                               (__location__ " Out of memory\n"));
4125                                         talloc_free(state);
4126                                         return -1;
4127                                 }
4128                                 ip->pnn = -1;
4129                                 ip->addr = pub->addr;
4130
4131                                 data.dsize = sizeof(struct ctdb_public_ip);
4132                                 data.dptr = (unsigned char *)ip;
4133
4134                                 ret = ctdb_daemon_send_control(ctdb,
4135                                                                ctdb_get_pnn(ctdb),
4136                                                                0,
4137                                                                CTDB_CONTROL_RELEASE_IP,
4138                                                                0, 0,
4139                                                                data,
4140                                                                delete_ip_callback,
4141                                                                state);
4142                                 if (ret == -1) {
4143                                         DEBUG(DEBUG_ERR,
4144                                               (__location__ "Unable to send "
4145                                                "CTDB_CONTROL_RELEASE_IP\n"));
4146                                         talloc_free(state);
4147                                         return -1;
4148                                 }
4149
4150                                 state->c = talloc_steal(state, c);
4151                                 *async_reply = true;
4152                         } else {
4153                                 /* This IP is not hosted on the
4154                                  * current node so just delete it
4155                                  * now. */
4156                                 do_delete_ip(ctdb, vnn);
4157                         }
4158
4159                         return 0;
4160                 }
4161         }
4162
4163         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4164                          ctdb_addr_to_str(&pub->addr)));
4165         return -1;
4166 }
4167
4168
/* Private data handed to ctdb_ipreallocated_callback(). */
struct ipreallocated_callback_state {
        /* the control request that is answered when the event finishes */
        struct ctdb_req_control *c;
};
4172
4173 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4174                                         int status, void *p)
4175 {
4176         struct ipreallocated_callback_state *state =
4177                 talloc_get_type(p, struct ipreallocated_callback_state);
4178
4179         if (status != 0) {
4180                 DEBUG(DEBUG_ERR,
4181                       (" \"ipreallocated\" event script failed (status %d)\n",
4182                        status));
4183                 if (status == -ETIME) {
4184                         ctdb_ban_self(ctdb);
4185                 }
4186         }
4187
4188         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4189         talloc_free(state);
4190 }
4191
4192 /* A control to run the ipreallocated event */
4193 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4194                                    struct ctdb_req_control *c,
4195                                    bool *async_reply)
4196 {
4197         int ret;
4198         struct ipreallocated_callback_state *state;
4199
4200         state = talloc(ctdb, struct ipreallocated_callback_state);
4201         CTDB_NO_MEMORY(ctdb, state);
4202
4203         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4204
4205         ret = ctdb_event_script_callback(ctdb, state,
4206                                          ctdb_ipreallocated_callback, state,
4207                                          CTDB_EVENT_IPREALLOCATED,
4208                                          "%s", "");
4209
4210         if (ret != 0) {
4211                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4212                 talloc_free(state);
4213                 return -1;
4214         }
4215
4216         /* tell the control that we will be reply asynchronously */
4217         state->c    = talloc_steal(state, c);
4218         *async_reply = true;
4219
4220         return 0;
4221 }
4222
4223
4224 /* This function is called from the recovery daemon to verify that a remote
4225    node has the expected ip allocation.
4226    This is verified against ctdb->ip_tree
4227 */
4228 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4229                                 struct ctdb_all_public_ips *ips,
4230                                 uint32_t pnn)
4231 {
4232         struct ctdb_public_ip_list *tmp_ip; 
4233         int i;
4234
4235         if (ctdb->ip_tree == NULL) {
4236                 /* dont know the expected allocation yet, assume remote node
4237                    is correct. */
4238                 return 0;
4239         }
4240
4241         if (ips == NULL) {
4242                 return 0;
4243         }
4244
4245         for (i=0; i<ips->num; i++) {
4246                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4247                 if (tmp_ip == NULL) {
4248                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4249                         return -1;
4250                 }
4251
4252                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4253                         continue;
4254                 }
4255
4256                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4257                         DEBUG(DEBUG_ERR,
4258                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4259                                pnn,
4260                                ctdb_addr_to_str(&ips->ips[i].addr),
4261                                ips->ips[i].pnn, tmp_ip->pnn));
4262                         return -1;
4263                 }
4264         }
4265
4266         return 0;
4267 }
4268
4269 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4270 {
4271         struct ctdb_public_ip_list *tmp_ip;
4272
4273         /* IP tree is never built if DisableIPFailover is set */
4274         if (ctdb->tunable.disable_ip_failover != 0) {
4275                 return 0;
4276         }
4277
4278         if (ctdb->ip_tree == NULL) {
4279                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4280                 return -1;
4281         }
4282
4283         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4284         if (tmp_ip == NULL) {
4285                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4286                 return -1;
4287         }
4288
4289         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4290         tmp_ip->pnn = ip->pnn;
4291
4292         return 0;
4293 }
4294
4295 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4296 {
4297         TALLOC_FREE(ctdb->ip_tree);
4298 }
4299
/*
  State of one in-progress "reload public IPs" request.  Freeing the handle
  runs ctdb_reloadips_destructor().
 */
struct ctdb_reloadips_handle {
        /* daemon context */
        struct ctdb_context *ctdb;
        /* control request to answer when the handle is freed, or NULL */
        struct ctdb_req_control *c;
        /* status sent in that answer */
        int status;
        /* pipe from the child: the child writes its result to fd[1],
           the daemon reads it from fd[0] */
        int fd[2];
        /* pid of the forked child doing the reload */
        pid_t child;
        /* fd event watching fd[0] */
        struct tevent_fd *fde;
};
4308
4309 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4310 {
4311         if (h == h->ctdb->reload_ips) {
4312                 h->ctdb->reload_ips = NULL;
4313         }
4314         if (h->c != NULL) {
4315                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4316                 h->c = NULL;
4317         }
4318         ctdb_kill(h->ctdb, h->child, SIGKILL);
4319         return 0;
4320 }
4321
4322 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4323                                          struct tevent_timer *te,
4324                                          struct timeval t, void *private_data)
4325 {
4326         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4327
4328         talloc_free(h);
4329 }
4330
4331 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4332                                          struct tevent_fd *fde,
4333                                          uint16_t flags, void *private_data)
4334 {
4335         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4336
4337         char res;
4338         int ret;
4339
4340         ret = sys_read(h->fd[0], &res, 1);
4341         if (ret < 1 || res != 0) {
4342                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4343                 res = 1;
4344         }
4345         h->status = res;
4346
4347         talloc_free(h);
4348 }
4349
4350 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4351 {
4352         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4353         struct ctdb_all_public_ips *ips;
4354         struct ctdb_vnn *vnn;
4355         struct client_async_data *async_data;
4356         struct timeval timeout;
4357         TDB_DATA data;
4358         struct ctdb_client_control_state *state;
4359         bool first_add;
4360         int i, ret;
4361
4362         CTDB_NO_MEMORY(ctdb, mem_ctx);
4363
4364         /* Read IPs from local node */
4365         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4366                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4367         if (ret != 0) {
4368                 DEBUG(DEBUG_ERR,
4369                       ("Unable to fetch public IPs from local node\n"));
4370                 talloc_free(mem_ctx);
4371                 return -1;
4372         }
4373
4374         /* Read IPs file - this is safe since this is a child process */
4375         ctdb->vnn = NULL;
4376         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4377                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4378                 talloc_free(mem_ctx);
4379                 return -1;
4380         }
4381
4382         async_data = talloc_zero(mem_ctx, struct client_async_data);
4383         CTDB_NO_MEMORY(ctdb, async_data);
4384
4385         /* Compare IPs between node and file for IPs to be deleted */
4386         for (i = 0; i < ips->num; i++) {
4387                 /* */
4388                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4389                         if (ctdb_same_ip(&vnn->public_address,
4390                                          &ips->ips[i].addr)) {
4391                                 /* IP is still in file */
4392                                 break;
4393                         }
4394                 }
4395
4396                 if (vnn == NULL) {
4397                         /* Delete IP ips->ips[i] */
4398                         struct ctdb_control_ip_iface *pub;
4399
4400                         DEBUG(DEBUG_NOTICE,
4401                               ("IP %s no longer configured, deleting it\n",
4402                                ctdb_addr_to_str(&ips->ips[i].addr)));
4403
4404                         pub = talloc_zero(mem_ctx,
4405                                           struct ctdb_control_ip_iface);
4406                         CTDB_NO_MEMORY(ctdb, pub);
4407
4408                         pub->addr  = ips->ips[i].addr;
4409                         pub->mask  = 0;
4410                         pub->len   = 0;
4411
4412                         timeout = TAKEOVER_TIMEOUT();
4413
4414                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4415                                               iface) + pub->len;
4416                         data.dptr = (uint8_t *)pub;
4417
4418                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4419                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4420                                                   0, data, async_data,
4421                                                   &timeout, NULL);
4422                         if (state == NULL) {
4423                                 DEBUG(DEBUG_ERR,
4424                                       (__location__
4425                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4426                                 goto failed;
4427                         }
4428
4429                         ctdb_client_async_add(async_data, state);
4430                 }
4431         }
4432
4433         /* Compare IPs between node and file for IPs to be added */
4434         first_add = true;
4435         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4436                 for (i = 0; i < ips->num; i++) {
4437                         if (ctdb_same_ip(&vnn->public_address,
4438                                          &ips->ips[i].addr)) {
4439                                 /* IP already on node */
4440                                 break;
4441                         }
4442                 }
4443                 if (i == ips->num) {
4444                         /* Add IP ips->ips[i] */
4445                         struct ctdb_control_ip_iface *pub;
4446                         const char *ifaces = NULL;
4447                         uint32_t len;
4448                         int iface = 0;
4449
4450                         DEBUG(DEBUG_NOTICE,
4451                               ("New IP %s configured, adding it\n",
4452                                ctdb_addr_to_str(&vnn->public_address)));
4453                         if (first_add) {
4454                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4455
4456                                 data.dsize = sizeof(pnn);
4457                                 data.dptr  = (uint8_t *)&pnn;
4458
4459                                 ret = ctdb_client_send_message(
4460                                         ctdb,
4461                                         CTDB_BROADCAST_CONNECTED,
4462                                         CTDB_SRVID_REBALANCE_NODE,
4463                                         data);
4464                                 if (ret != 0) {
4465                                         DEBUG(DEBUG_WARNING,
4466                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4467                                 }
4468
4469                                 first_add = false;
4470                         }
4471
4472                         ifaces = vnn->ifaces[0];
4473                         iface = 1;
4474                         while (vnn->ifaces[iface] != NULL) {
4475                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4476                                                          vnn->ifaces[iface]);
4477                                 iface++;
4478                         }
4479
4480                         len   = strlen(ifaces) + 1;
4481                         pub = talloc_zero_size(mem_ctx,
4482                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4483                         CTDB_NO_MEMORY(ctdb, pub);
4484
4485                         pub->addr  = vnn->public_address;
4486                         pub->mask  = vnn->public_netmask_bits;
4487                         pub->len   = len;
4488                         memcpy(&pub->iface[0], ifaces, pub->len);
4489
4490                         timeout = TAKEOVER_TIMEOUT();
4491
4492                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4493                                               iface) + pub->len;
4494                         data.dptr = (uint8_t *)pub;
4495
4496                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4497                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4498                                                   0, data, async_data,
4499                                                   &timeout, NULL);
4500                         if (state == NULL) {
4501                                 DEBUG(DEBUG_ERR,
4502                                       (__location__
4503                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4504                                 goto failed;
4505                         }
4506
4507                         ctdb_client_async_add(async_data, state);
4508                 }
4509         }
4510
4511         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4512                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4513                 goto failed;
4514         }
4515
4516         talloc_free(mem_ctx);
4517         return 0;
4518
4519 failed:
4520         talloc_free(mem_ctx);
4521         return -1;
4522 }
4523
4524 /* This control is sent to force the node to re-read the public addresses file
4525    and drop any addresses we should nnot longer host, and add new addresses
4526    that we are now able to host
4527 */
4528 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4529 {
4530         struct ctdb_reloadips_handle *h;
4531         pid_t parent = getpid();
4532
4533         if (ctdb->reload_ips != NULL) {
4534                 talloc_free(ctdb->reload_ips);
4535                 ctdb->reload_ips = NULL;
4536         }
4537
4538         h = talloc(ctdb, struct ctdb_reloadips_handle);
4539         CTDB_NO_MEMORY(ctdb, h);
4540         h->ctdb     = ctdb;
4541         h->c        = NULL;
4542         h->status   = -1;
4543         
4544         if (pipe(h->fd) == -1) {
4545                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4546                 talloc_free(h);
4547                 return -1;
4548         }
4549
4550         h->child = ctdb_fork(ctdb);
4551         if (h->child == (pid_t)-1) {
4552                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4553                 close(h->fd[0]);
4554                 close(h->fd[1]);
4555                 talloc_free(h);
4556                 return -1;
4557         }
4558
4559         /* child process */
4560         if (h->child == 0) {
4561                 signed char res = 0;
4562
4563                 close(h->fd[0]);
4564                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4565
4566                 ctdb_set_process_name("ctdb_reloadips");
4567                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4568                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4569                         res = -1;
4570                 } else {
4571                         res = ctdb_reloadips_child(ctdb);
4572                         if (res != 0) {
4573                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4574                         }
4575                 }
4576
4577                 sys_write(h->fd[1], &res, 1);
4578                 /* make sure we die when our parent dies */
4579                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4580                         sleep(5);
4581                 }
4582                 _exit(0);
4583         }
4584
4585         h->c             = talloc_steal(h, c);
4586
4587         close(h->fd[1]);
4588         set_close_on_exec(h->fd[0]);
4589
4590         talloc_set_destructor(h, ctdb_reloadips_destructor);
4591
4592
4593         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4594                                ctdb_reloadips_child_handler, (void *)h);
4595         tevent_fd_set_auto_close(h->fde);
4596
4597         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4598                          ctdb_reloadips_timeout_event, h);
4599
4600         /* we reply later */
4601         *async_reply = true;
4602         return 0;
4603 }