7c18542d0bed43fd04a48bcf73939d4511eb6d37
[mat/samba.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value derived from the takeover_timeout tunable.  The macro
 * expands to an expression that reads a variable named 'ctdb', so one
 * must be in scope wherever it is used.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP scheduling: CTDB_ARP_INTERVAL is the first argument
 * given to timeval_current_ofs() when the next announcement round is
 * queued, CTDB_ARP_REPEAT is the number of rounds after which the
 * announcement state is freed.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT 3
35
36 /* Flags used in IP allocation algorithms. */
37 struct ctdb_ipflags {
38         bool noiptakeover;
39         bool noiphost;
40         enum ctdb_runstate runstate;
41 };
42
/*
 * A local network interface that public addresses can be assigned
 * to.  Entries are linked into the list rooted at ctdb->ifaces.
 */
struct ctdb_iface {
        /* list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name; lookups compare it with strcmp() */
        const char *name;
        /* only interfaces with link_up set are picked for addresses */
        bool link_up;
        /* number of public addresses currently assigned to this interface */
        uint32_t references;
};
49
50 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
51 {
52         if (vnn->iface) {
53                 return vnn->iface->name;
54         }
55
56         return "__none__";
57 }
58
59 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
60 {
61         struct ctdb_iface *i;
62
63         /* Verify that we dont have an entry for this ip yet */
64         for (i=ctdb->ifaces;i;i=i->next) {
65                 if (strcmp(i->name, iface) == 0) {
66                         return 0;
67                 }
68         }
69
70         /* create a new structure for this interface */
71         i = talloc_zero(ctdb, struct ctdb_iface);
72         CTDB_NO_MEMORY_FATAL(ctdb, i);
73         i->name = talloc_strdup(i, iface);
74         CTDB_NO_MEMORY(ctdb, i->name);
75         /*
76          * If link_up defaults to true then IPs can be allocated to a
77          * node during the first recovery.  However, then an interface
78          * could have its link marked down during the startup event,
79          * causing the IP to move almost immediately.  If link_up
80          * defaults to false then, during normal operation, IPs added
81          * to a new interface can't be assigned until a monitor cycle
82          * has occurred and marked the new interfaces up.  This makes
83          * IP allocation unpredictable.  The following is a neat
84          * compromise: early in startup link_up defaults to false, so
85          * IPs can't be assigned, and after startup IPs can be
86          * assigned immediately.
87          */
88         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
89
90         DLIST_ADD(ctdb->ifaces, i);
91
92         return 0;
93 }
94
95 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
96                                         const char *name)
97 {
98         int n;
99
100         for (n = 0; vnn->ifaces[n] != NULL; n++) {
101                 if (strcmp(name, vnn->ifaces[n]) == 0) {
102                         return true;
103                 }
104         }
105
106         return false;
107 }
108
109 /* If any interfaces now have no possible IPs then delete them.  This
110  * implementation is naive (i.e. simple) rather than clever
111  * (i.e. complex).  Given that this is run on delip and that operation
112  * is rare, this doesn't need to be efficient - it needs to be
113  * foolproof.  One alternative is reference counting, where the logic
114  * is distributed and can, therefore, be broken in multiple places.
115  * Another alternative is to build a red-black tree of interfaces that
116  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
117  * once) and then walking ctdb->ifaces once and deleting those not in
118  * the tree.  Let's go to one of those if the naive implementation
119  * causes problems...  :-)
120  */
121 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
122                                         struct ctdb_vnn *vnn)
123 {
124         struct ctdb_iface *i, *next;
125
126         /* For each interface, check if there's an IP using it. */
127         for (i = ctdb->ifaces; i != NULL; i = next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130                 next = i->next;
131
132                 /* Only consider interfaces named in the given VNN. */
133                 if (!vnn_has_interface_with_name(vnn, i->name)) {
134                         continue;
135                 }
136
137                 /* Is the "single IP" on this interface? */
138                 if ((ctdb->single_ip_vnn != NULL) &&
139                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
140                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
141                         /* Found, next interface please... */
142                         continue;
143                 }
144                 /* Search for a vnn with this interface. */
145                 found = false;
146                 for (tv=ctdb->vnn; tv; tv=tv->next) {
147                         if (vnn_has_interface_with_name(tv, i->name)) {
148                                 found = true;
149                                 break;
150                         }
151                 }
152
153                 if (!found) {
154                         /* None of the VNNs are using this interface. */
155                         DLIST_REMOVE(ctdb->ifaces, i);
156                         talloc_free(i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->delete_pending) {
264                 return false;
265         }
266
267         if (vnn->iface && vnn->iface->link_up) {
268                 return true;
269         }
270
271         for (i=0; vnn->ifaces[i]; i++) {
272                 struct ctdb_iface *cur;
273
274                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
275                 if (cur == NULL) {
276                         continue;
277                 }
278
279                 if (cur->link_up) {
280                         return true;
281                 }
282         }
283
284         return false;
285 }
286
287 struct ctdb_takeover_arp {
288         struct ctdb_context *ctdb;
289         uint32_t count;
290         ctdb_sock_addr addr;
291         struct ctdb_tcp_array *tcparray;
292         struct ctdb_vnn *vnn;
293 };
294
295
296 /*
297   lists of tcp endpoints
298  */
299 struct ctdb_tcp_list {
300         struct ctdb_tcp_list *prev, *next;
301         struct ctdb_tcp_connection connection;
302 };
303
304 /*
305   list of clients to kill on IP release
306  */
307 struct ctdb_client_ip {
308         struct ctdb_client_ip *prev, *next;
309         struct ctdb_context *ctdb;
310         ctdb_sock_addr addr;
311         uint32_t client_id;
312 };
313
314
315 /*
316   send a gratuitous arp
317  */
318 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
319                                   struct timeval t, void *private_data)
320 {
321         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
322                                                         struct ctdb_takeover_arp);
323         int i, ret;
324         struct ctdb_tcp_array *tcparray;
325         const char *iface = ctdb_vnn_iface_string(arp->vnn);
326
327         ret = ctdb_sys_send_arp(&arp->addr, iface);
328         if (ret != 0) {
329                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
330                                   iface, strerror(errno)));
331         }
332
333         tcparray = arp->tcparray;
334         if (tcparray) {
335                 for (i=0;i<tcparray->num;i++) {
336                         struct ctdb_tcp_connection *tcon;
337
338                         tcon = &tcparray->connections[i];
339                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
340                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
341                                 ctdb_addr_to_str(&tcon->src_addr),
342                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
343                         ret = ctdb_sys_send_tcp(
344                                 &tcon->src_addr, 
345                                 &tcon->dst_addr,
346                                 0, 0, 0);
347                         if (ret != 0) {
348                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
349                                         ctdb_addr_to_str(&tcon->src_addr)));
350                         }
351                 }
352         }
353
354         arp->count++;
355
356         if (arp->count == CTDB_ARP_REPEAT) {
357                 talloc_free(arp);
358                 return;
359         }
360
361         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
362                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
363                         ctdb_control_send_arp, arp);
364 }
365
366 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
367                                        struct ctdb_vnn *vnn)
368 {
369         struct ctdb_takeover_arp *arp;
370         struct ctdb_tcp_array *tcparray;
371
372         if (!vnn->takeover_ctx) {
373                 vnn->takeover_ctx = talloc_new(vnn);
374                 if (!vnn->takeover_ctx) {
375                         return -1;
376                 }
377         }
378
379         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
380         if (!arp) {
381                 return -1;
382         }
383
384         arp->ctdb = ctdb;
385         arp->addr = vnn->public_address;
386         arp->vnn  = vnn;
387
388         tcparray = vnn->tcp_array;
389         if (tcparray) {
390                 /* add all of the known tcp connections for this IP to the
391                    list of tcp connections to send tickle acks for */
392                 arp->tcparray = talloc_steal(arp, tcparray);
393
394                 vnn->tcp_array = NULL;
395                 vnn->tcp_update_needed = true;
396         }
397
398         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
399                         timeval_zero(), ctdb_control_send_arp, arp);
400
401         return 0;
402 }
403
404 struct takeover_callback_state {
405         struct ctdb_req_control *c;
406         ctdb_sock_addr *addr;
407         struct ctdb_vnn *vnn;
408 };
409
/*
  state for an in-progress takeip: the control request to answer
  when the event script finishes, and the vnn being taken
 */
struct ctdb_do_takeip_state {
        struct ctdb_req_control *c;
        struct ctdb_vnn *vnn;
};
414
415 /*
416   called when takeip event finishes
417  */
418 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
419                                     void *private_data)
420 {
421         struct ctdb_do_takeip_state *state =
422                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
423         int32_t ret;
424         TDB_DATA data;
425
426         if (status != 0) {
427                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
428         
429                 if (status == -ETIME) {
430                         ctdb_ban_self(ctdb);
431                 }
432                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
433                                  ctdb_addr_to_str(&state->vnn->public_address),
434                                  ctdb_vnn_iface_string(state->vnn)));
435                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
436
437                 node->flags |= NODE_FLAGS_UNHEALTHY;
438                 talloc_free(state);
439                 return;
440         }
441
442         if (ctdb->do_checkpublicip) {
443
444         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
445         if (ret != 0) {
446                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
447                 talloc_free(state);
448                 return;
449         }
450
451         }
452
453         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
454         data.dsize = strlen((char *)data.dptr) + 1;
455         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
456
457         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
458
459
460         /* the control succeeded */
461         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
462         talloc_free(state);
463         return;
464 }
465
466 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
467 {
468         state->vnn->update_in_flight = false;
469         return 0;
470 }
471
472 /*
473   take over an ip address
474  */
475 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
476                               struct ctdb_req_control *c,
477                               struct ctdb_vnn *vnn)
478 {
479         int ret;
480         struct ctdb_do_takeip_state *state;
481
482         if (vnn->update_in_flight) {
483                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
484                                     "update for this IP already in flight\n",
485                                     ctdb_addr_to_str(&vnn->public_address),
486                                     vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         ret = ctdb_vnn_assign_iface(ctdb, vnn);
491         if (ret != 0) {
492                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
493                                  "assign a usable interface\n",
494                                  ctdb_addr_to_str(&vnn->public_address),
495                                  vnn->public_netmask_bits));
496                 return -1;
497         }
498
499         state = talloc(vnn, struct ctdb_do_takeip_state);
500         CTDB_NO_MEMORY(ctdb, state);
501
502         state->c = talloc_steal(ctdb, c);
503         state->vnn   = vnn;
504
505         vnn->update_in_flight = true;
506         talloc_set_destructor(state, ctdb_takeip_destructor);
507
508         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
509                             ctdb_addr_to_str(&vnn->public_address),
510                             vnn->public_netmask_bits,
511                             ctdb_vnn_iface_string(vnn)));
512
513         ret = ctdb_event_script_callback(ctdb,
514                                          state,
515                                          ctdb_do_takeip_callback,
516                                          state,
517                                          CTDB_EVENT_TAKE_IP,
518                                          "%s %s %u",
519                                          ctdb_vnn_iface_string(vnn),
520                                          ctdb_addr_to_str(&vnn->public_address),
521                                          vnn->public_netmask_bits);
522
523         if (ret != 0) {
524                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
525                         ctdb_addr_to_str(&vnn->public_address),
526                         ctdb_vnn_iface_string(vnn)));
527                 talloc_free(state);
528                 return -1;
529         }
530
531         return 0;
532 }
533
/*
  state for an in-progress updateip: the control request to answer,
  the interface the address is moving away from, and the vnn
 */
struct ctdb_do_updateip_state {
        struct ctdb_req_control *c;
        struct ctdb_iface *old;
        struct ctdb_vnn *vnn;
};
539
540 /*
541   called when updateip event finishes
542  */
543 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
544                                       void *private_data)
545 {
546         struct ctdb_do_updateip_state *state =
547                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
548         int32_t ret;
549
550         if (status != 0) {
551                 if (status == -ETIME) {
552                         ctdb_ban_self(ctdb);
553                 }
554                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
555                         ctdb_addr_to_str(&state->vnn->public_address),
556                         state->old->name,
557                         ctdb_vnn_iface_string(state->vnn)));
558
559                 /*
560                  * All we can do is reset the old interface
561                  * and let the next run fix it
562                  */
563                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
564                 state->vnn->iface = state->old;
565                 state->vnn->iface->references++;
566
567                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
568                 talloc_free(state);
569                 return;
570         }
571
572         if (ctdb->do_checkpublicip) {
573
574         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
575         if (ret != 0) {
576                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
577                 talloc_free(state);
578                 return;
579         }
580
581         }
582
583         /* the control succeeded */
584         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
585         talloc_free(state);
586         return;
587 }
588
589 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
590 {
591         state->vnn->update_in_flight = false;
592         return 0;
593 }
594
595 /*
596   update (move) an ip address
597  */
598 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
599                                 struct ctdb_req_control *c,
600                                 struct ctdb_vnn *vnn)
601 {
602         int ret;
603         struct ctdb_do_updateip_state *state;
604         struct ctdb_iface *old = vnn->iface;
605         const char *new_name;
606
607         if (vnn->update_in_flight) {
608                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
609                                     "update for this IP already in flight\n",
610                                     ctdb_addr_to_str(&vnn->public_address),
611                                     vnn->public_netmask_bits));
612                 return -1;
613         }
614
615         ctdb_vnn_unassign_iface(ctdb, vnn);
616         ret = ctdb_vnn_assign_iface(ctdb, vnn);
617         if (ret != 0) {
618                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
619                                  "assin a usable interface (old iface '%s')\n",
620                                  ctdb_addr_to_str(&vnn->public_address),
621                                  vnn->public_netmask_bits,
622                                  old->name));
623                 return -1;
624         }
625
626         new_name = ctdb_vnn_iface_string(vnn);
627         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
628                 /* A benign update from one interface onto itself.
629                  * no need to run the eventscripts in this case, just return
630                  * success.
631                  */
632                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
633                 return 0;
634         }
635
636         state = talloc(vnn, struct ctdb_do_updateip_state);
637         CTDB_NO_MEMORY(ctdb, state);
638
639         state->c = talloc_steal(ctdb, c);
640         state->old = old;
641         state->vnn = vnn;
642
643         vnn->update_in_flight = true;
644         talloc_set_destructor(state, ctdb_updateip_destructor);
645
646         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
647                             "interface %s to %s\n",
648                             ctdb_addr_to_str(&vnn->public_address),
649                             vnn->public_netmask_bits,
650                             old->name,
651                             new_name));
652
653         ret = ctdb_event_script_callback(ctdb,
654                                          state,
655                                          ctdb_do_updateip_callback,
656                                          state,
657                                          CTDB_EVENT_UPDATE_IP,
658                                          "%s %s %s %u",
659                                          state->old->name,
660                                          new_name,
661                                          ctdb_addr_to_str(&vnn->public_address),
662                                          vnn->public_netmask_bits);
663         if (ret != 0) {
664                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
665                                  ctdb_addr_to_str(&vnn->public_address),
666                                  old->name, new_name));
667                 talloc_free(state);
668                 return -1;
669         }
670
671         return 0;
672 }
673
674 /*
675   Find the vnn of the node that has a public ip address
676   returns -1 if the address is not known as a public address
677  */
678 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
679 {
680         struct ctdb_vnn *vnn;
681
682         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
683                 if (ctdb_same_ip(&vnn->public_address, addr)) {
684                         return vnn;
685                 }
686         }
687
688         return NULL;
689 }
690
691 /*
692   take over an ip address
693  */
694 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
695                                  struct ctdb_req_control *c,
696                                  TDB_DATA indata,
697                                  bool *async_reply)
698 {
699         int ret;
700         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
701         struct ctdb_vnn *vnn;
702         bool have_ip = false;
703         bool do_updateip = false;
704         bool do_takeip = false;
705         struct ctdb_iface *best_iface = NULL;
706
707         if (pip->pnn != ctdb->pnn) {
708                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
709                                  "with pnn %d, but we're node %d\n",
710                                  ctdb_addr_to_str(&pip->addr),
711                                  pip->pnn, ctdb->pnn));
712                 return -1;
713         }
714
715         /* update out vnn list */
716         vnn = find_public_ip_vnn(ctdb, &pip->addr);
717         if (vnn == NULL) {
718                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
719                         ctdb_addr_to_str(&pip->addr)));
720                 return 0;
721         }
722
723         if (ctdb->do_checkpublicip) {
724                 have_ip = ctdb_sys_have_ip(&pip->addr);
725         }
726         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
727         if (best_iface == NULL) {
728                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
729                                  "a usable interface (old %s, have_ip %d)\n",
730                                  ctdb_addr_to_str(&vnn->public_address),
731                                  vnn->public_netmask_bits,
732                                  ctdb_vnn_iface_string(vnn),
733                                  have_ip));
734                 return -1;
735         }
736
737         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
738                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
739                 have_ip = false;
740         }
741
742
743         if (vnn->iface == NULL && have_ip) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
746                                  ctdb_addr_to_str(&vnn->public_address)));
747                 return 0;
748         }
749
750         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
751                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752                                   "and we have it on iface[%s], but it was assigned to node %d"
753                                   "and we are node %d, banning ourself\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
756                 ctdb_ban_self(ctdb);
757                 return -1;
758         }
759
760         if (vnn->pnn == -1 && have_ip) {
761                 vnn->pnn = ctdb->pnn;
762                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
763                                   "and we already have it on iface[%s], update local daemon\n",
764                                  ctdb_addr_to_str(&vnn->public_address),
765                                   ctdb_vnn_iface_string(vnn)));
766                 return 0;
767         }
768
769         if (vnn->iface) {
770                 if (vnn->iface != best_iface) {
771                         if (!vnn->iface->link_up) {
772                                 do_updateip = true;
773                         } else if (vnn->iface->references > (best_iface->references + 1)) {
774                                 /* only move when the rebalance gains something */
775                                         do_updateip = true;
776                         }
777                 }
778         }
779
780         if (!have_ip) {
781                 if (do_updateip) {
782                         ctdb_vnn_unassign_iface(ctdb, vnn);
783                         do_updateip = false;
784                 }
785                 do_takeip = true;
786         }
787
788         if (do_takeip) {
789                 ret = ctdb_do_takeip(ctdb, c, vnn);
790                 if (ret != 0) {
791                         return -1;
792                 }
793         } else if (do_updateip) {
794                 ret = ctdb_do_updateip(ctdb, c, vnn);
795                 if (ret != 0) {
796                         return -1;
797                 }
798         } else {
799                 /*
800                  * The interface is up and the kernel known the ip
801                  * => do nothing
802                  */
803                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
804                         ctdb_addr_to_str(&pip->addr),
805                         vnn->public_netmask_bits,
806                         ctdb_vnn_iface_string(vnn)));
807                 return 0;
808         }
809
810         /* tell ctdb_control.c that we will be replying asynchronously */
811         *async_reply = true;
812
813         return 0;
814 }
815
816 /*
817   kill any clients that are registered with a IP that is being released
818  */
819 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
820 {
821         struct ctdb_client_ip *ip;
822
823         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
824                 ctdb_addr_to_str(addr)));
825
826         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
827                 ctdb_sock_addr tmp_addr;
828
829                 tmp_addr = ip->addr;
830                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
831                         ip->client_id,
832                         ctdb_addr_to_str(&ip->addr)));
833
834                 if (ctdb_same_ip(&tmp_addr, addr)) {
835                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
836                                                                      ip->client_id, 
837                                                                      struct ctdb_client);
838                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
839                                 ip->client_id,
840                                 ctdb_addr_to_str(&ip->addr),
841                                 client->pid));
842
843                         if (client->pid != 0) {
844                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
845                                         (unsigned)client->pid,
846                                         ctdb_addr_to_str(addr),
847                                         ip->client_id));
848                                 kill(client->pid, SIGKILL);
849                         }
850                 }
851         }
852 }
853
/*
  Remove a public address (vnn) from this node for good.

  The order of the calls matters: the vnn is first taken off ctdb->vnn,
  then detached from its interface, then the orphaned-interface cleanup
  is run with the vnn still valid, and only then is the vnn freed.
  The caller must not use vnn afterwards.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        talloc_free(vnn);
}
861
862 /*
863   called when releaseip event finishes
864  */
865 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
866                                 void *private_data)
867 {
868         struct takeover_callback_state *state = 
869                 talloc_get_type(private_data, struct takeover_callback_state);
870         TDB_DATA data;
871
872         if (status == -ETIME) {
873                 ctdb_ban_self(ctdb);
874         }
875
876         if (ctdb->do_checkpublicip) {
877                 if  (ctdb_sys_have_ip(state->addr)) {
878                         DEBUG(DEBUG_ERR,
879                               ("IP %s still hosted during release IP callback, failing\n",
880                                ctdb_addr_to_str(state->addr)));
881                         ctdb_request_control_reply(ctdb, state->c,
882                                                    NULL, -1, NULL);
883                         talloc_free(state);
884                         return;
885                 }
886         }
887
888         /* send a message to all clients of this node telling them
889            that the cluster has been reconfigured and they should
890            release any sockets on this IP */
891         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
892         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
893         data.dsize = strlen((char *)data.dptr)+1;
894
895         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
896
897         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
898
899         /* kill clients that have registered with this IP */
900         release_kill_clients(ctdb, state->addr);
901
902         ctdb_vnn_unassign_iface(ctdb, state->vnn);
903
904         /* Process the IP if it has been marked for deletion */
905         if (state->vnn->delete_pending) {
906                 do_delete_ip(ctdb, state->vnn);
907                 state->vnn = NULL;
908         }
909
910         /* the control succeeded */
911         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
912         talloc_free(state);
913 }
914
915 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
916 {
917         if (state->vnn != NULL) {
918                 state->vnn->update_in_flight = false;
919         }
920         return 0;
921 }
922
923 /*
924   release an ip address
925  */
926 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
927                                 struct ctdb_req_control *c,
928                                 TDB_DATA indata, 
929                                 bool *async_reply)
930 {
931         int ret;
932         struct takeover_callback_state *state;
933         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
934         struct ctdb_vnn *vnn;
935         char *iface;
936
937         /* update our vnn list */
938         vnn = find_public_ip_vnn(ctdb, &pip->addr);
939         if (vnn == NULL) {
940                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
941                         ctdb_addr_to_str(&pip->addr)));
942                 return 0;
943         }
944         vnn->pnn = pip->pnn;
945
946         /* stop any previous arps */
947         talloc_free(vnn->takeover_ctx);
948         vnn->takeover_ctx = NULL;
949
950         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
951          * lazy multicast to drop an IP from any node that isn't the
952          * intended new node.  The following causes makes ctdbd ignore
953          * a release for any address it doesn't host.
954          */
955         if (ctdb->do_checkpublicip) {
956                 if (!ctdb_sys_have_ip(&pip->addr)) {
957                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
958                                 ctdb_addr_to_str(&pip->addr),
959                                 vnn->public_netmask_bits,
960                                 ctdb_vnn_iface_string(vnn)));
961                         ctdb_vnn_unassign_iface(ctdb, vnn);
962                         return 0;
963                 }
964         } else {
965                 if (vnn->iface == NULL) {
966                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
967                                            ctdb_addr_to_str(&pip->addr),
968                                            vnn->public_netmask_bits));
969                         return 0;
970                 }
971         }
972
973         /* There is a potential race between take_ip and us because we
974          * update the VNN via a callback that run when the
975          * eventscripts have been run.  Avoid the race by allowing one
976          * update to be in flight at a time.
977          */
978         if (vnn->update_in_flight) {
979                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
980                                     "update for this IP already in flight\n",
981                                     ctdb_addr_to_str(&vnn->public_address),
982                                     vnn->public_netmask_bits));
983                 return -1;
984         }
985
986         iface = strdup(ctdb_vnn_iface_string(vnn));
987
988         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
989                 ctdb_addr_to_str(&pip->addr),
990                 vnn->public_netmask_bits,
991                 iface,
992                 pip->pnn));
993
994         state = talloc(ctdb, struct takeover_callback_state);
995         if (state == NULL) {
996                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
997                                __FILE__, __LINE__);
998                 free(iface);
999                 return -1;
1000         }
1001
1002         state->c = talloc_steal(state, c);
1003         state->addr = talloc(state, ctdb_sock_addr);       
1004         if (state->addr == NULL) {
1005                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1006                                __FILE__, __LINE__);
1007                 free(iface);
1008                 talloc_free(state);
1009                 return -1;
1010         }
1011         *state->addr = pip->addr;
1012         state->vnn   = vnn;
1013
1014         vnn->update_in_flight = true;
1015         talloc_set_destructor(state, ctdb_releaseip_destructor);
1016
1017         ret = ctdb_event_script_callback(ctdb, 
1018                                          state, release_ip_callback, state,
1019                                          CTDB_EVENT_RELEASE_IP,
1020                                          "%s %s %u",
1021                                          iface,
1022                                          ctdb_addr_to_str(&pip->addr),
1023                                          vnn->public_netmask_bits);
1024         free(iface);
1025         if (ret != 0) {
1026                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1027                         ctdb_addr_to_str(&pip->addr),
1028                         ctdb_vnn_iface_string(vnn)));
1029                 talloc_free(state);
1030                 return -1;
1031         }
1032
1033         /* tell the control that we will be reply asynchronously */
1034         *async_reply = true;
1035         return 0;
1036 }
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the public address lists from a file
1112 */
1113 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1114 {
1115         char **lines;
1116         int nlines;
1117         int i;
1118
1119         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1120         if (lines == NULL) {
1121                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1122                 return -1;
1123         }
1124         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1125                 nlines--;
1126         }
1127
1128         for (i=0;i<nlines;i++) {
1129                 unsigned mask;
1130                 ctdb_sock_addr addr;
1131                 const char *addrstr;
1132                 const char *ifaces;
1133                 char *tok, *line;
1134
1135                 line = lines[i];
1136                 while ((*line == ' ') || (*line == '\t')) {
1137                         line++;
1138                 }
1139                 if (*line == '#') {
1140                         continue;
1141                 }
1142                 if (strcmp(line, "") == 0) {
1143                         continue;
1144                 }
1145                 tok = strtok(line, " \t");
1146                 addrstr = tok;
1147                 tok = strtok(NULL, " \t");
1148                 if (tok == NULL) {
1149                         if (NULL == ctdb->default_public_interface) {
1150                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1151                                          i+1));
1152                                 talloc_free(lines);
1153                                 return -1;
1154                         }
1155                         ifaces = ctdb->default_public_interface;
1156                 } else {
1157                         ifaces = tok;
1158                 }
1159
1160                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1161                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1162                         talloc_free(lines);
1163                         return -1;
1164                 }
1165                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1166                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1167                         talloc_free(lines);
1168                         return -1;
1169                 }
1170         }
1171
1172
1173         talloc_free(lines);
1174         return 0;
1175 }
1176
1177 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1178                               const char *iface,
1179                               const char *ip)
1180 {
1181         struct ctdb_vnn *svnn;
1182         struct ctdb_iface *cur = NULL;
1183         bool ok;
1184         int ret;
1185
1186         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1187         CTDB_NO_MEMORY(ctdb, svnn);
1188
1189         svnn->ifaces = talloc_array(svnn, const char *, 2);
1190         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1191         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1192         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1193         svnn->ifaces[1] = NULL;
1194
1195         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1196         if (!ok) {
1197                 talloc_free(svnn);
1198                 return -1;
1199         }
1200
1201         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1202         if (ret != 0) {
1203                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1204                                    "for single_ip[%s]\n",
1205                                    svnn->ifaces[0],
1206                                    ctdb_addr_to_str(&svnn->public_address)));
1207                 talloc_free(svnn);
1208                 return -1;
1209         }
1210
1211         /* assume the single public ip interface is initially "good" */
1212         cur = ctdb_find_iface(ctdb, iface);
1213         if (cur == NULL) {
1214                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1215                 return -1;
1216         }
1217         cur->link_up = true;
1218
1219         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1220         if (ret != 0) {
1221                 talloc_free(svnn);
1222                 return -1;
1223         }
1224
1225         ctdb->single_ip_vnn = svnn;
1226         return 0;
1227 }
1228
/* One entry of the singly linked list of public addresses that the
 * allocation code in this file works on. */
struct ctdb_public_ip_list {
        /* next entry, NULL at the end of the list */
        struct ctdb_public_ip_list *next;
        /* node the address is assigned to; the code in this file
         * stores -1 here for an address that has no node */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1234
1235 /* Given a physical node, return the number of
1236    public addresses that is currently assigned to this node.
1237 */
1238 static int node_ip_coverage(struct ctdb_context *ctdb, 
1239         int32_t pnn,
1240         struct ctdb_public_ip_list *ips)
1241 {
1242         int num=0;
1243
1244         for (;ips;ips=ips->next) {
1245                 if (ips->pnn == pnn) {
1246                         num++;
1247                 }
1248         }
1249         return num;
1250 }
1251
1252
1253 /* Can the given node host the given IP: is the public IP known to the
1254  * node and is NOIPHOST unset?
1255 */
1256 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1257                              struct ctdb_ipflags ipflags,
1258                              struct ctdb_public_ip_list *ip)
1259 {
1260         struct ctdb_all_public_ips *public_ips;
1261         int i;
1262
1263         if (ipflags.noiphost) {
1264                 return false;
1265         }
1266
1267         public_ips = ctdb->nodes[pnn]->available_public_ips;
1268
1269         if (public_ips == NULL) {
1270                 return false;
1271         }
1272
1273         for (i=0; i<public_ips->num; i++) {
1274                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1275                         /* yes, this node can serve this public ip */
1276                         return true;
1277                 }
1278         }
1279
1280         return false;
1281 }
1282
1283 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1284                                  struct ctdb_ipflags ipflags,
1285                                  struct ctdb_public_ip_list *ip)
1286 {
1287         if (ipflags.noiptakeover) {
1288                 return false;
1289         }
1290
1291         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1292 }
1293
1294 /* search the node lists list for a node to takeover this ip.
1295    pick the node that currently are serving the least number of ips
1296    so that the ips get spread out evenly.
1297 */
1298 static int find_takeover_node(struct ctdb_context *ctdb, 
1299                 struct ctdb_ipflags *ipflags,
1300                 struct ctdb_public_ip_list *ip,
1301                 struct ctdb_public_ip_list *all_ips)
1302 {
1303         int pnn, min=0, num;
1304         int i, numnodes;
1305
1306         numnodes = talloc_array_length(ipflags);
1307         pnn    = -1;
1308         for (i=0; i<numnodes; i++) {
1309                 /* verify that this node can serve this ip */
1310                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1311                         /* no it couldnt   so skip to the next node */
1312                         continue;
1313                 }
1314
1315                 num = node_ip_coverage(ctdb, i, all_ips);
1316                 /* was this the first node we checked ? */
1317                 if (pnn == -1) {
1318                         pnn = i;
1319                         min  = num;
1320                 } else {
1321                         if (num < min) {
1322                                 pnn = i;
1323                                 min  = num;
1324                         }
1325                 }
1326         }       
1327         if (pnn == -1) {
1328                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1329                         ctdb_addr_to_str(&ip->addr)));
1330
1331                 return -1;
1332         }
1333
1334         ip->pnn = pnn;
1335         return 0;
1336 }
1337
1338 #define IP_KEYLEN       4
1339 static uint32_t *ip_key(ctdb_sock_addr *ip)
1340 {
1341         static uint32_t key[IP_KEYLEN];
1342
1343         bzero(key, sizeof(key));
1344
1345         switch (ip->sa.sa_family) {
1346         case AF_INET:
1347                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1348                 break;
1349         case AF_INET6: {
1350                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1351                 key[0]  = htonl(s6_a32[0]);
1352                 key[1]  = htonl(s6_a32[1]);
1353                 key[2]  = htonl(s6_a32[2]);
1354                 key[3]  = htonl(s6_a32[3]);
1355                 break;
1356         }
1357         default:
1358                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1359                 return key;
1360         }
1361
1362         return key;
1363 }
1364
1365 static void *add_ip_callback(void *parm, void *data)
1366 {
1367         struct ctdb_public_ip_list *this_ip = parm; 
1368         struct ctdb_public_ip_list *prev_ip = data; 
1369
1370         if (prev_ip == NULL) {
1371                 return parm;
1372         }
1373         if (this_ip->pnn == -1) {
1374                 this_ip->pnn = prev_ip->pnn;
1375         }
1376
1377         return parm;
1378 }
1379
1380 static int getips_count_callback(void *param, void *data)
1381 {
1382         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1383         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1384
1385         new_ip->next = *ip_list;
1386         *ip_list     = new_ip;
1387         return 0;
1388 }
1389
1390 static struct ctdb_public_ip_list *
1391 create_merged_ip_list(struct ctdb_context *ctdb)
1392 {
1393         int i, j;
1394         struct ctdb_public_ip_list *ip_list;
1395         struct ctdb_all_public_ips *public_ips;
1396
1397         if (ctdb->ip_tree != NULL) {
1398                 talloc_free(ctdb->ip_tree);
1399                 ctdb->ip_tree = NULL;
1400         }
1401         ctdb->ip_tree = trbt_create(ctdb, 0);
1402
1403         for (i=0;i<ctdb->num_nodes;i++) {
1404                 public_ips = ctdb->nodes[i]->known_public_ips;
1405
1406                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1407                         continue;
1408                 }
1409
1410                 /* there were no public ips for this node */
1411                 if (public_ips == NULL) {
1412                         continue;
1413                 }               
1414
1415                 for (j=0;j<public_ips->num;j++) {
1416                         struct ctdb_public_ip_list *tmp_ip; 
1417
1418                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1419                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1420                         /* Do not use information about IP addresses hosted
1421                          * on other nodes, it may not be accurate */
1422                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1423                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1424                         } else {
1425                                 tmp_ip->pnn = -1;
1426                         }
1427                         tmp_ip->addr = public_ips->ips[j].addr;
1428                         tmp_ip->next = NULL;
1429
1430                         trbt_insertarray32_callback(ctdb->ip_tree,
1431                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1432                                 add_ip_callback,
1433                                 tmp_ip);
1434                 }
1435         }
1436
1437         ip_list = NULL;
1438         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1439
1440         return ip_list;
1441 }
1442
1443 /* 
1444  * This is the length of the longtest common prefix between the IPs.
1445  * It is calculated by XOR-ing the 2 IPs together and counting the
1446  * number of leading zeroes.  The implementation means that all
1447  * addresses end up being 128 bits long.
1448  *
1449  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1450  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1451  * lots of nodes and IP addresses?
1452  */
1453 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1454 {
1455         uint32_t ip1_k[IP_KEYLEN];
1456         uint32_t *t;
1457         int i;
1458         uint32_t x;
1459
1460         uint32_t distance = 0;
1461
1462         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1463         t = ip_key(ip2);
1464         for (i=0; i<IP_KEYLEN; i++) {
1465                 x = ip1_k[i] ^ t[i];
1466                 if (x == 0) {
1467                         distance += 32;
1468                 } else {
1469                         /* Count number of leading zeroes. 
1470                          * FIXME? This could be optimised...
1471                          */
1472                         while ((x & (1 << 31)) == 0) {
1473                                 x <<= 1;
1474                                 distance += 1;
1475                         }
1476                 }
1477         }
1478
1479         return distance;
1480 }
1481
1482 /* Calculate the IP distance for the given IP relative to IPs on the
1483    given node.  The ips argument is generally the all_ips variable
1484    used in the main part of the algorithm.
1485  */
1486 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1487                                   struct ctdb_public_ip_list *ips,
1488                                   int pnn)
1489 {
1490         struct ctdb_public_ip_list *t;
1491         uint32_t d;
1492
1493         uint32_t sum = 0;
1494
1495         for (t=ips; t != NULL; t=t->next) {
1496                 if (t->pnn != pnn) {
1497                         continue;
1498                 }
1499
1500                 /* Optimisation: We never calculate the distance
1501                  * between an address and itself.  This allows us to
1502                  * calculate the effect of removing an address from a
1503                  * node by simply calculating the distance between
1504                  * that address and all of the exitsing addresses.
1505                  * Moreover, we assume that we're only ever dealing
1506                  * with addresses from all_ips so we can identify an
1507                  * address via a pointer rather than doing a more
1508                  * expensive address comparison. */
1509                 if (&(t->addr) == ip) {
1510                         continue;
1511                 }
1512
1513                 d = ip_distance(ip, &(t->addr));
1514                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1515         }
1516
1517         return sum;
1518 }
1519
1520 /* Return the LCP2 imbalance metric for addresses currently assigned
1521    to the given node.
1522  */
1523 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1524 {
1525         struct ctdb_public_ip_list *t;
1526
1527         uint32_t imbalance = 0;
1528
1529         for (t=all_ips; t!=NULL; t=t->next) {
1530                 if (t->pnn != pnn) {
1531                         continue;
1532                 }
1533                 /* Pass the rest of the IPs rather than the whole
1534                    all_ips input list.
1535                 */
1536                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1537         }
1538
1539         return imbalance;
1540 }
1541
1542 /* Allocate any unassigned IPs just by looping through the IPs and
1543  * finding the best node for each.
1544  */
1545 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1546                                       struct ctdb_ipflags *ipflags,
1547                                       struct ctdb_public_ip_list *all_ips)
1548 {
1549         struct ctdb_public_ip_list *tmp_ip;
1550
1551         /* loop over all ip's and find a physical node to cover for 
1552            each unassigned ip.
1553         */
1554         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1555                 if (tmp_ip->pnn == -1) {
1556                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1557                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1558                                         ctdb_addr_to_str(&tmp_ip->addr)));
1559                         }
1560                 }
1561         }
1562 }
1563
1564 /* Basic non-deterministic rebalancing algorithm.
1565  */
1566 static void basic_failback(struct ctdb_context *ctdb,
1567                            struct ctdb_ipflags *ipflags,
1568                            struct ctdb_public_ip_list *all_ips,
1569                            int num_ips)
1570 {
1571         int i, numnodes;
1572         int maxnode, maxnum, minnode, minnum, num, retries;
1573         struct ctdb_public_ip_list *tmp_ip;
1574
1575         numnodes = talloc_array_length(ipflags);
1576         retries = 0;
1577
1578 try_again:
1579         maxnum=0;
1580         minnum=0;
1581
1582         /* for each ip address, loop over all nodes that can serve
1583            this ip and make sure that the difference between the node
1584            serving the most and the node serving the least ip's are
1585            not greater than 1.
1586         */
1587         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1588                 if (tmp_ip->pnn == -1) {
1589                         continue;
1590                 }
1591
1592                 /* Get the highest and lowest number of ips's served by any 
1593                    valid node which can serve this ip.
1594                 */
1595                 maxnode = -1;
1596                 minnode = -1;
1597                 for (i=0; i<numnodes; i++) {
1598                         /* only check nodes that can actually serve this ip */
1599                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1600                                 /* no it couldnt   so skip to the next node */
1601                                 continue;
1602                         }
1603
1604                         num = node_ip_coverage(ctdb, i, all_ips);
1605                         if (maxnode == -1) {
1606                                 maxnode = i;
1607                                 maxnum  = num;
1608                         } else {
1609                                 if (num > maxnum) {
1610                                         maxnode = i;
1611                                         maxnum  = num;
1612                                 }
1613                         }
1614                         if (minnode == -1) {
1615                                 minnode = i;
1616                                 minnum  = num;
1617                         } else {
1618                                 if (num < minnum) {
1619                                         minnode = i;
1620                                         minnum  = num;
1621                                 }
1622                         }
1623                 }
1624                 if (maxnode == -1) {
1625                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1626                                 ctdb_addr_to_str(&tmp_ip->addr)));
1627
1628                         continue;
1629                 }
1630
1631                 /* if the spread between the smallest and largest coverage by
1632                    a node is >=2 we steal one of the ips from the node with
1633                    most coverage to even things out a bit.
1634                    try to do this a limited number of times since we dont
1635                    want to spend too much time balancing the ip coverage.
1636                 */
1637                 if ( (maxnum > minnum+1)
1638                      && (retries < (num_ips + 5)) ){
1639                         struct ctdb_public_ip_list *tmp;
1640
1641                         /* Reassign one of maxnode's VNNs */
1642                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1643                                 if (tmp->pnn == maxnode) {
1644                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1645                                         retries++;
1646                                         goto try_again;;
1647                                 }
1648                         }
1649                 }
1650         }
1651 }
1652
1653 static void lcp2_init(struct ctdb_context *tmp_ctx,
1654                       struct ctdb_ipflags *ipflags,
1655                       struct ctdb_public_ip_list *all_ips,
1656                       uint32_t *force_rebalance_nodes,
1657                       uint32_t **lcp2_imbalances,
1658                       bool **rebalance_candidates)
1659 {
1660         int i, numnodes;
1661         struct ctdb_public_ip_list *tmp_ip;
1662
1663         numnodes = talloc_array_length(ipflags);
1664
1665         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1666         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1667         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1668         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1669
1670         for (i=0; i<numnodes; i++) {
1671                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1672                 /* First step: assume all nodes are candidates */
1673                 (*rebalance_candidates)[i] = true;
1674         }
1675
1676         /* 2nd step: if a node has IPs assigned then it must have been
1677          * healthy before, so we remove it from consideration.  This
1678          * is overkill but is all we have because we don't maintain
1679          * state between takeover runs.  An alternative would be to
1680          * keep state and invalidate it every time the recovery master
1681          * changes.
1682          */
1683         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1684                 if (tmp_ip->pnn != -1) {
1685                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1686                 }
1687         }
1688
1689         /* 3rd step: if a node is forced to re-balance then
1690            we allow failback onto the node */
1691         if (force_rebalance_nodes == NULL) {
1692                 return;
1693         }
1694         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1695                 uint32_t pnn = force_rebalance_nodes[i];
1696                 if (pnn >= numnodes) {
1697                         DEBUG(DEBUG_ERR,
1698                               (__location__ "unknown node %u\n", pnn));
1699                         continue;
1700                 }
1701
1702                 DEBUG(DEBUG_NOTICE,
1703                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1704                 (*rebalance_candidates)[pnn] = true;
1705         }
1706 }
1707
1708 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1709  * the IP/node combination that will cost the least.
1710  */
1711 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1712                                      struct ctdb_ipflags *ipflags,
1713                                      struct ctdb_public_ip_list *all_ips,
1714                                      uint32_t *lcp2_imbalances)
1715 {
1716         struct ctdb_public_ip_list *tmp_ip;
1717         int dstnode, numnodes;
1718
1719         int minnode;
1720         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1721         struct ctdb_public_ip_list *minip;
1722
1723         bool should_loop = true;
1724         bool have_unassigned = true;
1725
1726         numnodes = talloc_array_length(ipflags);
1727
1728         while (have_unassigned && should_loop) {
1729                 should_loop = false;
1730
1731                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1732                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1733
1734                 minnode = -1;
1735                 mindsum = 0;
1736                 minip = NULL;
1737
1738                 /* loop over each unassigned ip. */
1739                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1740                         if (tmp_ip->pnn != -1) {
1741                                 continue;
1742                         }
1743
1744                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1745                                 /* only check nodes that can actually takeover this ip */
1746                                 if (!can_node_takeover_ip(ctdb, dstnode,
1747                                                           ipflags[dstnode],
1748                                                           tmp_ip)) {
1749                                         /* no it couldnt   so skip to the next node */
1750                                         continue;
1751                                 }
1752
1753                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1754                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1755                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1756                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1757                                                    dstnode,
1758                                                    dstimbl - lcp2_imbalances[dstnode]));
1759
1760
1761                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1762                                         minnode = dstnode;
1763                                         minimbl = dstimbl;
1764                                         mindsum = dstdsum;
1765                                         minip = tmp_ip;
1766                                         should_loop = true;
1767                                 }
1768                         }
1769                 }
1770
1771                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1772
1773                 /* If we found one then assign it to the given node. */
1774                 if (minnode != -1) {
1775                         minip->pnn = minnode;
1776                         lcp2_imbalances[minnode] = minimbl;
1777                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1778                                           ctdb_addr_to_str(&(minip->addr)),
1779                                           minnode,
1780                                           mindsum));
1781                 }
1782
1783                 /* There might be a better way but at least this is clear. */
1784                 have_unassigned = false;
1785                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1786                         if (tmp_ip->pnn == -1) {
1787                                 have_unassigned = true;
1788                         }
1789                 }
1790         }
1791
1792         /* We know if we have an unassigned addresses so we might as
1793          * well optimise.
1794          */
1795         if (have_unassigned) {
1796                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1797                         if (tmp_ip->pnn == -1) {
1798                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1799                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1800                         }
1801                 }
1802         }
1803 }
1804
1805 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1806  * to move IPs from, determines the best IP/destination node
1807  * combination to move from the source node.
1808  */
1809 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1810                                     struct ctdb_ipflags *ipflags,
1811                                     struct ctdb_public_ip_list *all_ips,
1812                                     int srcnode,
1813                                     uint32_t *lcp2_imbalances,
1814                                     bool *rebalance_candidates)
1815 {
1816         int dstnode, mindstnode, numnodes;
1817         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1818         uint32_t minsrcimbl, mindstimbl;
1819         struct ctdb_public_ip_list *minip;
1820         struct ctdb_public_ip_list *tmp_ip;
1821
1822         /* Find an IP and destination node that best reduces imbalance. */
1823         srcimbl = 0;
1824         minip = NULL;
1825         minsrcimbl = 0;
1826         mindstnode = -1;
1827         mindstimbl = 0;
1828
1829         numnodes = talloc_array_length(ipflags);
1830
1831         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1832         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1833                            srcnode, lcp2_imbalances[srcnode]));
1834
1835         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1836                 /* Only consider addresses on srcnode. */
1837                 if (tmp_ip->pnn != srcnode) {
1838                         continue;
1839                 }
1840
1841                 /* What is this IP address costing the source node? */
1842                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1843                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1844
1845                 /* Consider this IP address would cost each potential
1846                  * destination node.  Destination nodes are limited to
1847                  * those that are newly healthy, since we don't want
1848                  * to do gratuitous failover of IPs just to make minor
1849                  * balance improvements.
1850                  */
1851                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1852                         if (!rebalance_candidates[dstnode]) {
1853                                 continue;
1854                         }
1855
1856                         /* only check nodes that can actually takeover this ip */
1857                         if (!can_node_takeover_ip(ctdb, dstnode,
1858                                                   ipflags[dstnode], tmp_ip)) {
1859                                 /* no it couldnt   so skip to the next node */
1860                                 continue;
1861                         }
1862
1863                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1864                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1865                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1866                                            srcnode, -srcdsum,
1867                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1868                                            dstnode, dstdsum));
1869
1870                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1871                             (dstdsum < srcdsum) &&                      \
1872                             ((mindstnode == -1) ||                              \
1873                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1874
1875                                 minip = tmp_ip;
1876                                 minsrcimbl = srcimbl;
1877                                 mindstnode = dstnode;
1878                                 mindstimbl = dstimbl;
1879                         }
1880                 }
1881         }
1882         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1883
1884         if (mindstnode != -1) {
1885                 /* We found a move that makes things better... */
1886                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1887                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1888                                   ctdb_addr_to_str(&(minip->addr)),
1889                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1890
1891
1892                 lcp2_imbalances[srcnode] = minsrcimbl;
1893                 lcp2_imbalances[mindstnode] = mindstimbl;
1894                 minip->pnn = mindstnode;
1895
1896                 return true;
1897         }
1898
1899         return false;
1900         
1901 }
1902
/* A node number paired with its LCP2 imbalance, for sorting */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.
 *
 * Orders by imbalance, largest first: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.  pnn is not
 * considered.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        /* Difference of two 0/1 comparisons; avoids any subtraction
         * of the unsigned imbalances themselves
         */
        return (lhs->imbalance < rhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
1921
1922 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1923  * node with the highest LCP2 imbalance, and then determines the best
1924  * IP/destination node combination to move from the source node.
1925  */
1926 static void lcp2_failback(struct ctdb_context *ctdb,
1927                           struct ctdb_ipflags *ipflags,
1928                           struct ctdb_public_ip_list *all_ips,
1929                           uint32_t *lcp2_imbalances,
1930                           bool *rebalance_candidates)
1931 {
1932         int i, numnodes;
1933         struct lcp2_imbalance_pnn * lips;
1934         bool again;
1935
1936         numnodes = talloc_array_length(ipflags);
1937
1938 try_again:
1939         /* Put the imbalances and nodes into an array, sort them and
1940          * iterate through candidates.  Usually the 1st one will be
1941          * used, so this doesn't cost much...
1942          */
1943         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1944         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1945         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1946         for (i=0; i<numnodes; i++) {
1947                 lips[i].imbalance = lcp2_imbalances[i];
1948                 lips[i].pnn = i;
1949                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1950         }
1951         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1952               lcp2_cmp_imbalance_pnn);
1953
1954         again = false;
1955         for (i=0; i<numnodes; i++) {
1956                 /* This means that all nodes had 0 or 1 addresses, so
1957                  * can't be imbalanced.
1958                  */
1959                 if (lips[i].imbalance == 0) {
1960                         break;
1961                 }
1962
1963                 if (lcp2_failback_candidate(ctdb,
1964                                             ipflags,
1965                                             all_ips,
1966                                             lips[i].pnn,
1967                                             lcp2_imbalances,
1968                                             rebalance_candidates)) {
1969                         again = true;
1970                         break;
1971                 }
1972         }
1973
1974         talloc_free(lips);
1975         if (again) {
1976                 goto try_again;
1977         }
1978 }
1979
1980 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1981                                     struct ctdb_ipflags *ipflags,
1982                                     struct ctdb_public_ip_list *all_ips)
1983 {
1984         struct ctdb_public_ip_list *tmp_ip;
1985
1986         /* verify that the assigned nodes can serve that public ip
1987            and set it to -1 if not
1988         */
1989         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1990                 if (tmp_ip->pnn == -1) {
1991                         continue;
1992                 }
1993                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1994                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1995                         /* this node can not serve this ip. */
1996                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1997                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1998                                            tmp_ip->pnn));
1999                         tmp_ip->pnn = -1;
2000                 }
2001         }
2002 }
2003
2004 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2005                                        struct ctdb_ipflags *ipflags,
2006                                        struct ctdb_public_ip_list *all_ips)
2007 {
2008         struct ctdb_public_ip_list *tmp_ip;
2009         int i, numnodes;
2010
2011         numnodes = talloc_array_length(ipflags);
2012
2013         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2014        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2015         *  always be allocated the same way for a specific set of
2016         *  available/unavailable nodes.
2017         */
2018
2019         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2020                 tmp_ip->pnn = i % numnodes;
2021         }
2022
2023         /* IP failback doesn't make sense with deterministic
2024          * IPs, since the modulo step above implicitly fails
2025          * back IPs to their "home" node.
2026          */
2027         if (1 == ctdb->tunable.no_ip_failback) {
2028                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2029         }
2030
2031         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2032
2033         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2034
2035         /* No failback here! */
2036 }
2037
2038 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2039                                           struct ctdb_ipflags *ipflags,
2040                                           struct ctdb_public_ip_list *all_ips)
2041 {
2042         /* This should be pushed down into basic_failback. */
2043         struct ctdb_public_ip_list *tmp_ip;
2044         int num_ips = 0;
2045         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2046                 num_ips++;
2047         }
2048
2049         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2050
2051         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2052
2053         /* If we don't want IPs to fail back then don't rebalance IPs. */
2054         if (1 == ctdb->tunable.no_ip_failback) {
2055                 return;
2056         }
2057
2058         /* Now, try to make sure the ip adresses are evenly distributed
2059            across the nodes.
2060         */
2061         basic_failback(ctdb, ipflags, all_ips, num_ips);
2062 }
2063
2064 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2065                           struct ctdb_ipflags *ipflags,
2066                           struct ctdb_public_ip_list *all_ips,
2067                           uint32_t *force_rebalance_nodes)
2068 {
2069         uint32_t *lcp2_imbalances;
2070         bool *rebalance_candidates;
2071         int numnodes, num_rebalance_candidates, i;
2072
2073         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2074
2075         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2076
2077         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2078                   &lcp2_imbalances, &rebalance_candidates);
2079
2080         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2081
2082         /* If we don't want IPs to fail back then don't rebalance IPs. */
2083         if (1 == ctdb->tunable.no_ip_failback) {
2084                 goto finished;
2085         }
2086
2087         /* It is only worth continuing if we have suitable target
2088          * nodes to transfer IPs to.  This check is much cheaper than
2089          * continuing on...
2090          */
2091         numnodes = talloc_array_length(ipflags);
2092         num_rebalance_candidates = 0;
2093         for (i=0; i<numnodes; i++) {
2094                 if (rebalance_candidates[i]) {
2095                         num_rebalance_candidates++;
2096                 }
2097         }
2098         if (num_rebalance_candidates == 0) {
2099                 goto finished;
2100         }
2101
2102         /* Now, try to make sure the ip adresses are evenly distributed
2103            across the nodes.
2104         */
2105         lcp2_failback(ctdb, ipflags, all_ips,
2106                       lcp2_imbalances, rebalance_candidates);
2107
2108 finished:
2109         talloc_free(tmp_ctx);
2110 }
2111
2112 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2113 {
2114         int i;
2115
2116         for (i=0;i<nodemap->num;i++) {
2117                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2118                         /* Found one completely healthy node */
2119                         return false;
2120                 }
2121         }
2122
2123         return true;
2124 }
2125
2126 /* The calculation part of the IP allocation algorithm. */
2127 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2128                                    struct ctdb_ipflags *ipflags,
2129                                    struct ctdb_public_ip_list **all_ips_p,
2130                                    uint32_t *force_rebalance_nodes)
2131 {
2132         /* since nodes only know about those public addresses that
2133            can be served by that particular node, no single node has
2134            a full list of all public addresses that exist in the cluster.
2135            Walk over all node structures and create a merged list of
2136            all public addresses that exist in the cluster.
2137
2138            keep the tree of ips around as ctdb->ip_tree
2139         */
2140         *all_ips_p = create_merged_ip_list(ctdb);
2141
2142         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2143                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2144         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2145                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2146         } else {
2147                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2148         }
2149
2150         /* at this point ->pnn is the node which will own each IP
2151            or -1 if there is no node that can cover this ip
2152         */
2153
2154         return;
2155 }
2156
2157 struct get_tunable_callback_data {
2158         const char *tunable;
2159         uint32_t *out;
2160         bool fatal;
2161 };
2162
2163 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2164                                  int32_t res, TDB_DATA outdata,
2165                                  void *callback)
2166 {
2167         struct get_tunable_callback_data *cd =
2168                 (struct get_tunable_callback_data *)callback;
2169         int size;
2170
2171         if (res != 0) {
2172                 /* Already handled in fail callback */
2173                 return;
2174         }
2175
2176         if (outdata.dsize != sizeof(uint32_t)) {
2177                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2178                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2179                                  (int)outdata.dsize));
2180                 cd->fatal = true;
2181                 return;
2182         }
2183
2184         size = talloc_array_length(cd->out);
2185         if (pnn >= size) {
2186                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2187                                  cd->tunable, pnn, size));
2188                 return;
2189         }
2190
2191                 
2192         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2193 }
2194
2195 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2196                                        int32_t res, TDB_DATA outdata,
2197                                        void *callback)
2198 {
2199         struct get_tunable_callback_data *cd =
2200                 (struct get_tunable_callback_data *)callback;
2201
2202         switch (res) {
2203         case -ETIME:
2204                 DEBUG(DEBUG_ERR,
2205                       ("Timed out getting tunable \"%s\" from node %d\n",
2206                        cd->tunable, pnn));
2207                 cd->fatal = true;
2208                 break;
2209         case -EINVAL:
2210         case -1:
2211                 DEBUG(DEBUG_WARNING,
2212                       ("Tunable \"%s\" not implemented on node %d\n",
2213                        cd->tunable, pnn));
2214                 break;
2215         default:
2216                 DEBUG(DEBUG_ERR,
2217                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2218                        cd->tunable, pnn));
2219                 cd->fatal = true;
2220         }
2221 }
2222
2223 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2224                                         TALLOC_CTX *tmp_ctx,
2225                                         struct ctdb_node_map *nodemap,
2226                                         const char *tunable,
2227                                         uint32_t default_value)
2228 {
2229         TDB_DATA data;
2230         struct ctdb_control_get_tunable *t;
2231         uint32_t *nodes;
2232         uint32_t *tvals;
2233         struct get_tunable_callback_data callback_data;
2234         int i;
2235
2236         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2237         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2238         for (i=0; i<nodemap->num; i++) {
2239                 tvals[i] = default_value;
2240         }
2241                 
2242         callback_data.out = tvals;
2243         callback_data.tunable = tunable;
2244         callback_data.fatal = false;
2245
2246         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2247         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2248         t = (struct ctdb_control_get_tunable *)data.dptr;
2249         t->length = strlen(tunable)+1;
2250         memcpy(t->name, tunable, t->length);
2251         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2252         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2253                                       nodes, 0, TAKEOVER_TIMEOUT(),
2254                                       false, data,
2255                                       get_tunable_callback,
2256                                       get_tunable_fail_callback,
2257                                       &callback_data) != 0) {
2258                 if (callback_data.fatal) {
2259                         talloc_free(tvals);
2260                         tvals = NULL;
2261                 }
2262         }
2263         talloc_free(nodes);
2264         talloc_free(data.dptr);
2265
2266         return tvals;
2267 }
2268
2269 struct get_runstate_callback_data {
2270         enum ctdb_runstate *out;
2271         bool fatal;
2272 };
2273
2274 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2275                                   int32_t res, TDB_DATA outdata,
2276                                   void *callback_data)
2277 {
2278         struct get_runstate_callback_data *cd =
2279                 (struct get_runstate_callback_data *)callback_data;
2280         int size;
2281
2282         if (res != 0) {
2283                 /* Already handled in fail callback */
2284                 return;
2285         }
2286
2287         if (outdata.dsize != sizeof(uint32_t)) {
2288                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2289                                  pnn, (int)sizeof(uint32_t),
2290                                  (int)outdata.dsize));
2291                 cd->fatal = true;
2292                 return;
2293         }
2294
2295         size = talloc_array_length(cd->out);
2296         if (pnn >= size) {
2297                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2298                                  pnn, size));
2299                 return;
2300         }
2301
2302         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2303 }
2304
2305 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2306                                        int32_t res, TDB_DATA outdata,
2307                                        void *callback)
2308 {
2309         struct get_runstate_callback_data *cd =
2310                 (struct get_runstate_callback_data *)callback;
2311
2312         switch (res) {
2313         case -ETIME:
2314                 DEBUG(DEBUG_ERR,
2315                       ("Timed out getting runstate from node %d\n", pnn));
2316                 cd->fatal = true;
2317                 break;
2318         default:
2319                 DEBUG(DEBUG_WARNING,
2320                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2321                        pnn));
2322         }
2323 }
2324
2325 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2326                                                     TALLOC_CTX *tmp_ctx,
2327                                                     struct ctdb_node_map *nodemap,
2328                                                     enum ctdb_runstate default_value)
2329 {
2330         uint32_t *nodes;
2331         enum ctdb_runstate *rs;
2332         struct get_runstate_callback_data callback_data;
2333         int i;
2334
2335         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2336         CTDB_NO_MEMORY_NULL(ctdb, rs);
2337         for (i=0; i<nodemap->num; i++) {
2338                 rs[i] = default_value;
2339         }
2340
2341         callback_data.out = rs;
2342         callback_data.fatal = false;
2343
2344         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2345         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2346                                       nodes, 0, TAKEOVER_TIMEOUT(),
2347                                       true, tdb_null,
2348                                       get_runstate_callback,
2349                                       get_runstate_fail_callback,
2350                                       &callback_data) != 0) {
2351                 if (callback_data.fatal) {
2352                         free(rs);
2353                         rs = NULL;
2354                 }
2355         }
2356         talloc_free(nodes);
2357
2358         return rs;
2359 }
2360
2361 /* Set internal flags for IP allocation:
2362  *   Clear ip flags
 *   Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2364  *   Set NOIPHOST ip flag for each INACTIVE node
2365  *   if all nodes are disabled:
2366  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2367  *   else
2368  *     Set NOIPHOST ip flags for disabled nodes
2369  */
2370 static struct ctdb_ipflags *
2371 set_ipflags_internal(struct ctdb_context *ctdb,
2372                      TALLOC_CTX *tmp_ctx,
2373                      struct ctdb_node_map *nodemap,
2374                      uint32_t *tval_noiptakeover,
2375                      uint32_t *tval_noiphostonalldisabled,
2376                      enum ctdb_runstate *runstate)
2377 {
2378         int i;
2379         struct ctdb_ipflags *ipflags;
2380
2381         /* Clear IP flags - implicit due to talloc_zero */
2382         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2383         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2384
2385         for (i=0;i<nodemap->num;i++) {
2386                 /* Can not take IPs on node with NoIPTakeover set */
2387                 if (tval_noiptakeover[i] != 0) {
2388                         ipflags[i].noiptakeover = true;
2389                 }
2390
2391                 /* Can not host IPs on node not in RUNNING state */
2392                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2393                         ipflags[i].noiphost = true;
2394                         continue;
2395                 }
2396                 /* Can not host IPs on INACTIVE node */
2397                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2398                         ipflags[i].noiphost = true;
2399                 }
2400                 /* Remember the runstate */
2401                 ipflags[i].runstate = runstate[i];
2402         }
2403
2404         if (all_nodes_are_disabled(nodemap)) {
2405                 /* If all nodes are disabled, can not host IPs on node
2406                  * with NoIPHostOnAllDisabled set
2407                  */
2408                 for (i=0;i<nodemap->num;i++) {
2409                         if (tval_noiphostonalldisabled[i] != 0) {
2410                                 ipflags[i].noiphost = true;
2411                         }
2412                 }
2413         } else {
2414                 /* If some nodes are not disabled, then can not host
2415                  * IPs on DISABLED node
2416                  */
2417                 for (i=0;i<nodemap->num;i++) {
2418                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2419                                 ipflags[i].noiphost = true;
2420                         }
2421                 }
2422         }
2423
2424         return ipflags;
2425 }
2426
2427 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2428                                         TALLOC_CTX *tmp_ctx,
2429                                         struct ctdb_node_map *nodemap)
2430 {
2431         uint32_t *tval_noiptakeover;
2432         uint32_t *tval_noiphostonalldisabled;
2433         struct ctdb_ipflags *ipflags;
2434         enum ctdb_runstate *runstate;
2435
2436
2437         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2438                                                    "NoIPTakeover", 0);
2439         if (tval_noiptakeover == NULL) {
2440                 return NULL;
2441         }
2442
2443         tval_noiphostonalldisabled =
2444                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2445                                        "NoIPHostOnAllDisabled", 0);
2446         if (tval_noiphostonalldisabled == NULL) {
2447                 /* Caller frees tmp_ctx */
2448                 return NULL;
2449         }
2450
2451         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2452          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2453          * reasonable behaviour on a mixed cluster during upgrade.
2454          */
2455         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2456                                            CTDB_RUNSTATE_RUNNING);
2457         if (runstate == NULL) {
2458                 /* Caller frees tmp_ctx */
2459                 return NULL;
2460         }
2461
2462         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2463                                        tval_noiptakeover,
2464                                        tval_noiphostonalldisabled,
2465                                        runstate);
2466
2467         talloc_free(tval_noiptakeover);
2468         talloc_free(tval_noiphostonalldisabled);
2469         talloc_free(runstate);
2470
2471         return ipflags;
2472 }
2473
2474 struct iprealloc_callback_data {
2475         bool *retry_nodes;
2476         int retry_count;
2477         client_async_callback fail_callback;
2478         void *fail_callback_data;
2479         struct ctdb_node_map *nodemap;
2480 };
2481
2482 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2483                                         int32_t res, TDB_DATA outdata,
2484                                         void *callback)
2485 {
2486         int numnodes;
2487         struct iprealloc_callback_data *cd =
2488                 (struct iprealloc_callback_data *)callback;
2489
2490         numnodes = talloc_array_length(cd->retry_nodes);
2491         if (pnn > numnodes) {
2492                 DEBUG(DEBUG_ERR,
2493                       ("ipreallocated failure from node %d, "
2494                        "but only %d nodes in nodemap\n",
2495                        pnn, numnodes));
2496                 return;
2497         }
2498
2499         /* Can't run the "ipreallocated" event on a INACTIVE node */
2500         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2501                 DEBUG(DEBUG_WARNING,
2502                       ("ipreallocated failed on inactive node %d, ignoring\n",
2503                        pnn));
2504                 return;
2505         }
2506
2507         switch (res) {
2508         case -ETIME:
2509                 /* If the control timed out then that's a real error,
2510                  * so call the real fail callback
2511                  */
2512                 if (cd->fail_callback) {
2513                         cd->fail_callback(ctdb, pnn, res, outdata,
2514                                           cd->fail_callback_data);
2515                 } else {
2516                         DEBUG(DEBUG_WARNING,
2517                               ("iprealloc timed out but no callback registered\n"));
2518                 }
2519                 break;
2520         default:
2521                 /* If not a timeout then either the ipreallocated
2522                  * eventscript (or some setup) failed.  This might
2523                  * have failed because the IPREALLOCATED control isn't
2524                  * implemented - right now there is no way of knowing
2525                  * because the error codes are all folded down to -1.
2526                  * Consider retrying using EVENTSCRIPT control...
2527                  */
2528                 DEBUG(DEBUG_WARNING,
2529                       ("ipreallocated failure from node %d, flagging retry\n",
2530                        pnn));
2531                 cd->retry_nodes[pnn] = true;
2532                 cd->retry_count++;
2533         }
2534 }
2535
2536 struct takeover_callback_data {
2537         bool *node_failed;
2538         client_async_callback fail_callback;
2539         void *fail_callback_data;
2540         struct ctdb_node_map *nodemap;
2541 };
2542
2543 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2544                                        uint32_t node_pnn, int32_t res,
2545                                        TDB_DATA outdata, void *callback_data)
2546 {
2547         struct takeover_callback_data *cd =
2548                 talloc_get_type_abort(callback_data,
2549                                       struct takeover_callback_data);
2550         int i;
2551
2552         for (i = 0; i < cd->nodemap->num; i++) {
2553                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2554                         break;
2555                 }
2556         }
2557
2558         if (i == cd->nodemap->num) {
2559                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2560                 return;
2561         }
2562
2563         if (!cd->node_failed[i]) {
2564                 cd->node_failed[i] = true;
2565                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2566                                   cd->fail_callback_data);
2567         }
2568 }
2569
2570 /*
2571   make any IP alias changes for public addresses that are necessary 
2572  */
2573 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2574                       uint32_t *force_rebalance_nodes,
2575                       client_async_callback fail_callback, void *callback_data)
2576 {
2577         int i, j, ret;
2578         struct ctdb_public_ip ip;
2579         uint32_t *nodes;
2580         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2581         TDB_DATA data;
2582         struct timeval timeout;
2583         struct client_async_data *async_data;
2584         struct ctdb_client_control_state *state;
2585         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2586         struct ctdb_ipflags *ipflags;
2587         struct takeover_callback_data *takeover_data;
2588         struct iprealloc_callback_data iprealloc_data;
2589         bool *retry_data;
2590         bool can_host_ips;
2591
2592         /*
2593          * ip failover is completely disabled, just send out the 
2594          * ipreallocated event.
2595          */
2596         if (ctdb->tunable.disable_ip_failover != 0) {
2597                 goto ipreallocated;
2598         }
2599
2600         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2601         if (ipflags == NULL) {
2602                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2603                 talloc_free(tmp_ctx);
2604                 return -1;
2605         }
2606
2607         /* Short-circuit IP allocation if no nodes are in the RUNNING
2608          * runstate yet, since no nodes will be able to host IPs */
2609         can_host_ips = false;
2610         for (i=0; i<nodemap->num; i++) {
2611                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2612                         can_host_ips = true;
2613                 }
2614         }
2615         if (!can_host_ips) {
2616                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2617                 return 0;
2618         }
2619
2620         /* Do the IP reassignment calculations */
2621         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2622
2623         /* Now tell all nodes to release any public IPs should not
2624          * host.  This will be a NOOP on nodes that don't currently
2625          * hold the given IP.
2626          */
2627         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2628         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2629
2630         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2631                                                        bool, nodemap->num);
2632         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2633         takeover_data->fail_callback = fail_callback;
2634         takeover_data->fail_callback_data = callback_data;
2635         takeover_data->nodemap = nodemap;
2636
2637         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2638         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2639
2640         async_data->fail_callback = takeover_run_fail_callback;
2641         async_data->callback_data = takeover_data;
2642
2643         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2644
2645         /* Send a RELEASE_IP to all nodes that should not be hosting
2646          * each IP.  For each IP, all but one of these will be
2647          * redundant.  However, the redundant ones are used to tell
2648          * nodes which node should be hosting the IP so that commands
2649          * like "ctdb ip" can display a particular nodes idea of who
2650          * is hosting what. */
2651         for (i=0;i<nodemap->num;i++) {
2652                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2653                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2654                         continue;
2655                 }
2656
2657                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2658                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2659                                 /* This node should be serving this
2660                                    vnn so dont tell it to release the ip
2661                                 */
2662                                 continue;
2663                         }
2664                         ip.pnn  = tmp_ip->pnn;
2665                         ip.addr = tmp_ip->addr;
2666
2667                         timeout = TAKEOVER_TIMEOUT();
2668                         data.dsize = sizeof(ip);
2669                         data.dptr  = (uint8_t *)&ip;
2670                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2671                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2672                                                   data, async_data,
2673                                                   &timeout, NULL);
2674                         if (state == NULL) {
2675                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2676                                 talloc_free(tmp_ctx);
2677                                 return -1;
2678                         }
2679
2680                         ctdb_client_async_add(async_data, state);
2681                 }
2682         }
2683         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2684                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2685                 talloc_free(tmp_ctx);
2686                 return -1;
2687         }
2688         talloc_free(async_data);
2689
2690
2691         /* For each IP, send a TAKOVER_IP to the node that should be
2692          * hosting it.  Many of these will often be redundant (since
2693          * the allocation won't have changed) but they can be useful
2694          * to recover from inconsistencies. */
2695         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2696         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2697
2698         async_data->fail_callback = fail_callback;
2699         async_data->callback_data = callback_data;
2700
2701         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2702                 if (tmp_ip->pnn == -1) {
2703                         /* this IP won't be taken over */
2704                         continue;
2705                 }
2706
2707                 ip.pnn  = tmp_ip->pnn;
2708                 ip.addr = tmp_ip->addr;
2709
2710                 timeout = TAKEOVER_TIMEOUT();
2711                 data.dsize = sizeof(ip);
2712                 data.dptr  = (uint8_t *)&ip;
2713                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2714                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2715                                           data, async_data, &timeout, NULL);
2716                 if (state == NULL) {
2717                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2718                         talloc_free(tmp_ctx);
2719                         return -1;
2720                 }
2721
2722                 ctdb_client_async_add(async_data, state);
2723         }
2724         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2725                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2726                 talloc_free(tmp_ctx);
2727                 return -1;
2728         }
2729
2730 ipreallocated:
2731         /*
2732          * Tell all nodes to run eventscripts to process the
2733          * "ipreallocated" event.  This can do a lot of things,
2734          * including restarting services to reconfigure them if public
2735          * IPs have moved.  Once upon a time this event only used to
2736          * update natgw.
2737          */
2738         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2739         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2740         iprealloc_data.retry_nodes = retry_data;
2741         iprealloc_data.retry_count = 0;
2742         iprealloc_data.fail_callback = fail_callback;
2743         iprealloc_data.fail_callback_data = callback_data;
2744         iprealloc_data.nodemap = nodemap;
2745
2746         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2747         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2748                                         nodes, 0, TAKEOVER_TIMEOUT(),
2749                                         false, tdb_null,
2750                                         NULL, iprealloc_fail_callback,
2751                                         &iprealloc_data);
2752         if (ret != 0) {
2753                 /* If the control failed then we should retry to any
2754                  * nodes flagged by iprealloc_fail_callback using the
2755                  * EVENTSCRIPT control.  This is a best-effort at
2756                  * backward compatiblity when running a mixed cluster
2757                  * where some nodes have not yet been upgraded to
2758                  * support the IPREALLOCATED control.
2759                  */
2760                 DEBUG(DEBUG_WARNING,
2761                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2762
2763                 nodes = talloc_array(tmp_ctx, uint32_t,
2764                                      iprealloc_data.retry_count);
2765                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2766
2767                 j = 0;
2768                 for (i=0; i<nodemap->num; i++) {
2769                         if (iprealloc_data.retry_nodes[i]) {
2770                                 nodes[j] = i;
2771                                 j++;
2772                         }
2773                 }
2774
2775                 data.dptr  = discard_const("ipreallocated");
2776                 data.dsize = strlen((char *)data.dptr) + 1; 
2777                 ret = ctdb_client_async_control(ctdb,
2778                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2779                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2780                                                 false, data,
2781                                                 NULL, fail_callback,
2782                                                 callback_data);
2783                 if (ret != 0) {
2784                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2785                 }
2786         }
2787
2788         talloc_free(tmp_ctx);
2789         return ret;
2790 }
2791
2792
2793 /*
2794   destroy a ctdb_client_ip structure
2795  */
2796 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2797 {
2798         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2799                 ctdb_addr_to_str(&ip->addr),
2800                 ntohs(ip->addr.ip.sin_port),
2801                 ip->client_id));
2802
2803         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2804         return 0;
2805 }
2806
2807 /*
2808   called by a client to inform us of a TCP connection that it is managing
2809   that should tickled with an ACK when IP takeover is done
2810  */
2811 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2812                                 TDB_DATA indata)
2813 {
2814         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2815         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2816         struct ctdb_tcp_list *tcp;
2817         struct ctdb_tcp_connection t;
2818         int ret;
2819         TDB_DATA data;
2820         struct ctdb_client_ip *ip;
2821         struct ctdb_vnn *vnn;
2822         ctdb_sock_addr addr;
2823
2824         /* If we don't have public IPs, tickles are useless */
2825         if (ctdb->vnn == NULL) {
2826                 return 0;
2827         }
2828
2829         tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2830
2831         addr = tcp_sock->src;
2832         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2833         addr = tcp_sock->dest;
2834         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2835
2836         ZERO_STRUCT(addr);
2837         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2838         vnn = find_public_ip_vnn(ctdb, &addr);
2839         if (vnn == NULL) {
2840                 switch (addr.sa.sa_family) {
2841                 case AF_INET:
2842                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2843                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2844                                         ctdb_addr_to_str(&addr)));
2845                         }
2846                         break;
2847                 case AF_INET6:
2848                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2849                                 ctdb_addr_to_str(&addr)));
2850                         break;
2851                 default:
2852                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2853                 }
2854
2855                 return 0;
2856         }
2857
2858         if (vnn->pnn != ctdb->pnn) {
2859                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2860                         ctdb_addr_to_str(&addr),
2861                         client_id, client->pid));
2862                 /* failing this call will tell smbd to die */
2863                 return -1;
2864         }
2865
2866         ip = talloc(client, struct ctdb_client_ip);
2867         CTDB_NO_MEMORY(ctdb, ip);
2868
2869         ip->ctdb      = ctdb;
2870         ip->addr      = addr;
2871         ip->client_id = client_id;
2872         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2873         DLIST_ADD(ctdb->client_ip_list, ip);
2874
2875         tcp = talloc(client, struct ctdb_tcp_list);
2876         CTDB_NO_MEMORY(ctdb, tcp);
2877
2878         tcp->connection.src_addr = tcp_sock->src;
2879         tcp->connection.dst_addr = tcp_sock->dest;
2880
2881         DLIST_ADD(client->tcp_list, tcp);
2882
2883         t.src_addr = tcp_sock->src;
2884         t.dst_addr = tcp_sock->dest;
2885
2886         data.dptr = (uint8_t *)&t;
2887         data.dsize = sizeof(t);
2888
2889         switch (addr.sa.sa_family) {
2890         case AF_INET:
2891                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2892                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2893                         ctdb_addr_to_str(&tcp_sock->src),
2894                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2895                 break;
2896         case AF_INET6:
2897                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2898                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2899                         ctdb_addr_to_str(&tcp_sock->src),
2900                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2901                 break;
2902         default:
2903                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2904         }
2905
2906
2907         /* tell all nodes about this tcp connection */
2908         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2909                                        CTDB_CONTROL_TCP_ADD,
2910                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2911         if (ret != 0) {
2912                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2913                 return -1;
2914         }
2915
2916         return 0;
2917 }
2918
2919 /*
2920   find a tcp address on a list
2921  */
2922 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2923                                            struct ctdb_tcp_connection *tcp)
2924 {
2925         int i;
2926
2927         if (array == NULL) {
2928                 return NULL;
2929         }
2930
2931         for (i=0;i<array->num;i++) {
2932                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2933                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2934                         return &array->connections[i];
2935                 }
2936         }
2937         return NULL;
2938 }
2939
2940
2941
2942 /*
2943   called by a daemon to inform us of a TCP connection that one of its
2944   clients managing that should tickled with an ACK when IP takeover is
2945   done
2946  */
2947 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2948 {
2949         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2950         struct ctdb_tcp_array *tcparray;
2951         struct ctdb_tcp_connection tcp;
2952         struct ctdb_vnn *vnn;
2953
2954         /* If we don't have public IPs, tickles are useless */
2955         if (ctdb->vnn == NULL) {
2956                 return 0;
2957         }
2958
2959         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2960         if (vnn == NULL) {
2961                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2962                         ctdb_addr_to_str(&p->dst_addr)));
2963
2964                 return -1;
2965         }
2966
2967
2968         tcparray = vnn->tcp_array;
2969
2970         /* If this is the first tickle */
2971         if (tcparray == NULL) {
2972                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2973                 CTDB_NO_MEMORY(ctdb, tcparray);
2974                 vnn->tcp_array = tcparray;
2975
2976                 tcparray->num = 0;
2977                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2978                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2979
2980                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2981                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2982                 tcparray->num++;
2983
2984                 if (tcp_update_needed) {
2985                         vnn->tcp_update_needed = true;
2986                 }
2987                 return 0;
2988         }
2989
2990
2991         /* Do we already have this tickle ?*/
2992         tcp.src_addr = p->src_addr;
2993         tcp.dst_addr = p->dst_addr;
2994         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2995                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2996                         ctdb_addr_to_str(&tcp.dst_addr),
2997                         ntohs(tcp.dst_addr.ip.sin_port),
2998                         vnn->pnn));
2999                 return 0;
3000         }
3001
3002         /* A new tickle, we must add it to the array */
3003         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3004                                         struct ctdb_tcp_connection,
3005                                         tcparray->num+1);
3006         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3007
3008         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3009         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3010         tcparray->num++;
3011
3012         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3013                 ctdb_addr_to_str(&tcp.dst_addr),
3014                 ntohs(tcp.dst_addr.ip.sin_port),
3015                 vnn->pnn));
3016
3017         if (tcp_update_needed) {
3018                 vnn->tcp_update_needed = true;
3019         }
3020
3021         return 0;
3022 }
3023
3024
3025 /*
3026   called by a daemon to inform us of a TCP connection that one of its
3027   clients managing that should tickled with an ACK when IP takeover is
3028   done
3029  */
3030 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3031 {
3032         struct ctdb_tcp_connection *tcpp;
3033         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3034
3035         if (vnn == NULL) {
3036                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3037                         ctdb_addr_to_str(&conn->dst_addr)));
3038                 return;
3039         }
3040
3041         /* if the array is empty we cant remove it
3042            and we dont need to do anything
3043          */
3044         if (vnn->tcp_array == NULL) {
3045                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3046                         ctdb_addr_to_str(&conn->dst_addr),
3047                         ntohs(conn->dst_addr.ip.sin_port)));
3048                 return;
3049         }
3050
3051
3052         /* See if we know this connection
3053            if we dont know this connection  then we dont need to do anything
3054          */
3055         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3056         if (tcpp == NULL) {
3057                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3058                         ctdb_addr_to_str(&conn->dst_addr),
3059                         ntohs(conn->dst_addr.ip.sin_port)));
3060                 return;
3061         }
3062
3063
3064         /* We need to remove this entry from the array.
3065            Instead of allocating a new array and copying data to it
3066            we cheat and just copy the last entry in the existing array
3067            to the entry that is to be removed and just shring the 
3068            ->num field
3069          */
3070         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3071         vnn->tcp_array->num--;
3072
3073         /* If we deleted the last entry we also need to remove the entire array
3074          */
3075         if (vnn->tcp_array->num == 0) {
3076                 talloc_free(vnn->tcp_array);
3077                 vnn->tcp_array = NULL;
3078         }               
3079
3080         vnn->tcp_update_needed = true;
3081
3082         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3083                 ctdb_addr_to_str(&conn->src_addr),
3084                 ntohs(conn->src_addr.ip.sin_port)));
3085 }
3086
3087
3088 /*
3089   called by a daemon to inform us of a TCP connection that one of its
3090   clients used are no longer needed in the tickle database
3091  */
3092 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3093 {
3094         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3095
3096         /* If we don't have public IPs, tickles are useless */
3097         if (ctdb->vnn == NULL) {
3098                 return 0;
3099         }
3100
3101         ctdb_remove_tcp_connection(ctdb, conn);
3102
3103         return 0;
3104 }
3105
3106
3107 /*
3108   Called when another daemon starts - causes all tickles for all
3109   public addresses we are serving to be sent to the new node on the
3110   next check.  This actually causes the next scheduled call to
3111   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3112   doesn't require careful error handling.
3113  */
3114 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3115 {
3116         struct ctdb_vnn *vnn;
3117
3118         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3119                            (unsigned long) pnn));
3120
3121         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3122                 vnn->tcp_update_needed = true;
3123         }
3124
3125         return 0;
3126 }
3127
3128
3129 /*
3130   called when a client structure goes away - hook to remove
3131   elements from the tcp_list in all daemons
3132  */
3133 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3134 {
3135         while (client->tcp_list) {
3136                 struct ctdb_tcp_list *tcp = client->tcp_list;
3137                 DLIST_REMOVE(client->tcp_list, tcp);
3138                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3139         }
3140 }
3141
3142
3143 /*
3144   release all IPs on shutdown
3145  */
3146 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3147 {
3148         struct ctdb_vnn *vnn;
3149         int count = 0;
3150
3151         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3152                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3153                         ctdb_vnn_unassign_iface(ctdb, vnn);
3154                         continue;
3155                 }
3156                 if (!vnn->iface) {
3157                         continue;
3158                 }
3159
3160                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3161                                     ctdb_addr_to_str(&vnn->public_address),
3162                                     vnn->public_netmask_bits,
3163                                     ctdb_vnn_iface_string(vnn)));
3164
3165                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3166                                   ctdb_vnn_iface_string(vnn),
3167                                   ctdb_addr_to_str(&vnn->public_address),
3168                                   vnn->public_netmask_bits);
3169                 release_kill_clients(ctdb, &vnn->public_address);
3170                 ctdb_vnn_unassign_iface(ctdb, vnn);
3171                 count++;
3172         }
3173
3174         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3175 }
3176
3177
3178 /*
3179   get list of public IPs
3180  */
3181 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3182                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3183 {
3184         int i, num, len;
3185         struct ctdb_all_public_ips *ips;
3186         struct ctdb_vnn *vnn;
3187         bool only_available = false;
3188
3189         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3190                 only_available = true;
3191         }
3192
3193         /* count how many public ip structures we have */
3194         num = 0;
3195         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3196                 num++;
3197         }
3198
3199         len = offsetof(struct ctdb_all_public_ips, ips) + 
3200                 num*sizeof(struct ctdb_public_ip);
3201         ips = talloc_zero_size(outdata, len);
3202         CTDB_NO_MEMORY(ctdb, ips);
3203
3204         i = 0;
3205         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3206                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3207                         continue;
3208                 }
3209                 ips->ips[i].pnn  = vnn->pnn;
3210                 ips->ips[i].addr = vnn->public_address;
3211                 i++;
3212         }
3213         ips->num = i;
3214         len = offsetof(struct ctdb_all_public_ips, ips) +
3215                 i*sizeof(struct ctdb_public_ip);
3216
3217         outdata->dsize = len;
3218         outdata->dptr  = (uint8_t *)ips;
3219
3220         return 0;
3221 }
3222
3223
3224 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3225                                         struct ctdb_req_control *c,
3226                                         TDB_DATA indata,
3227                                         TDB_DATA *outdata)
3228 {
3229         int i, num, len;
3230         ctdb_sock_addr *addr;
3231         struct ctdb_control_public_ip_info *info;
3232         struct ctdb_vnn *vnn;
3233
3234         addr = (ctdb_sock_addr *)indata.dptr;
3235
3236         vnn = find_public_ip_vnn(ctdb, addr);
3237         if (vnn == NULL) {
3238                 /* if it is not a public ip   it could be our 'single ip' */
3239                 if (ctdb->single_ip_vnn) {
3240                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3241                                 vnn = ctdb->single_ip_vnn;
3242                         }
3243                 }
3244         }
3245         if (vnn == NULL) {
3246                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3247                                  "'%s'not a public address\n",
3248                                  ctdb_addr_to_str(addr)));
3249                 return -1;
3250         }
3251
3252         /* count how many public ip structures we have */
3253         num = 0;
3254         for (;vnn->ifaces[num];) {
3255                 num++;
3256         }
3257
3258         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3259                 num*sizeof(struct ctdb_control_iface_info);
3260         info = talloc_zero_size(outdata, len);
3261         CTDB_NO_MEMORY(ctdb, info);
3262
3263         info->ip.addr = vnn->public_address;
3264         info->ip.pnn = vnn->pnn;
3265         info->active_idx = 0xFFFFFFFF;
3266
3267         for (i=0; vnn->ifaces[i]; i++) {
3268                 struct ctdb_iface *cur;
3269
3270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3271                 if (cur == NULL) {
3272                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3273                                            vnn->ifaces[i]));
3274                         return -1;
3275                 }
3276                 if (vnn->iface == cur) {
3277                         info->active_idx = i;
3278                 }
3279                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3280                 info->ifaces[i].link_state = cur->link_up;
3281                 info->ifaces[i].references = cur->references;
3282         }
3283         info->num = i;
3284         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3285                 i*sizeof(struct ctdb_control_iface_info);
3286
3287         outdata->dsize = len;
3288         outdata->dptr  = (uint8_t *)info;
3289
3290         return 0;
3291 }
3292
3293 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3294                                 struct ctdb_req_control *c,
3295                                 TDB_DATA *outdata)
3296 {
3297         int i, num, len;
3298         struct ctdb_control_get_ifaces *ifaces;
3299         struct ctdb_iface *cur;
3300
3301         /* count how many public ip structures we have */
3302         num = 0;
3303         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3304                 num++;
3305         }
3306
3307         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3308                 num*sizeof(struct ctdb_control_iface_info);
3309         ifaces = talloc_zero_size(outdata, len);
3310         CTDB_NO_MEMORY(ctdb, ifaces);
3311
3312         i = 0;
3313         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3314                 strcpy(ifaces->ifaces[i].name, cur->name);
3315                 ifaces->ifaces[i].link_state = cur->link_up;
3316                 ifaces->ifaces[i].references = cur->references;
3317                 i++;
3318         }
3319         ifaces->num = i;
3320         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3321                 i*sizeof(struct ctdb_control_iface_info);
3322
3323         outdata->dsize = len;
3324         outdata->dptr  = (uint8_t *)ifaces;
3325
3326         return 0;
3327 }
3328
3329 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3330                                     struct ctdb_req_control *c,
3331                                     TDB_DATA indata)
3332 {
3333         struct ctdb_control_iface_info *info;
3334         struct ctdb_iface *iface;
3335         bool link_up = false;
3336
3337         info = (struct ctdb_control_iface_info *)indata.dptr;
3338
3339         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3340                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3341                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3342                                   len, len, info->name));
3343                 return -1;
3344         }
3345
3346         switch (info->link_state) {
3347         case 0:
3348                 link_up = false;
3349                 break;
3350         case 1:
3351                 link_up = true;
3352                 break;
3353         default:
3354                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3355                                   (unsigned int)info->link_state));
3356                 return -1;
3357         }
3358
3359         if (info->references != 0) {
3360                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3361                                   (unsigned int)info->references));
3362                 return -1;
3363         }
3364
3365         iface = ctdb_find_iface(ctdb, info->name);
3366         if (iface == NULL) {
3367                 return -1;
3368         }
3369
3370         if (link_up == iface->link_up) {
3371                 return 0;
3372         }
3373
3374         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3375               ("iface[%s] has changed it's link status %s => %s\n",
3376                iface->name,
3377                iface->link_up?"up":"down",
3378                link_up?"up":"down"));
3379
3380         iface->link_up = link_up;
3381         return 0;
3382 }
3383
3384
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill

   One instance is attached to a vnn (vnn->killtcp) while there are
   connections left to reset.
*/
struct ctdb_kill_tcp {
        struct ctdb_vnn *vnn;           /* the vnn this structure is attached to */
        struct ctdb_context *ctdb;      /* owning ctdb context */
        int capture_fd;                 /* capture socket, -1 until it has been opened */
        struct fd_event *fde;           /* read event registered on capture_fd */
        trbt_tree_t *connections;       /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        void *private_data;             /* opaque state set by ctdb_sys_open_capture_socket() and
                                           passed to ctdb_sys_read_tcp_packet() */
};
3397
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        ctdb_sock_addr src_addr;        /* canonicalized source address of the connection */
        ctdb_sock_addr dst_addr;        /* canonicalized destination address of the connection */
        int count;                      /* number of periodic tickle attempts made so far */
        struct ctdb_kill_tcp *killtcp;  /* the killtcp structure this connection belongs to */
};
3407
3408 /* this function is used to create a key to represent this socketpair
3409    in the killtcp tree.
3410    this key is used to insert and lookup matching socketpairs that are
3411    to be tickled and RST
3412 */
3413 #define KILLTCP_KEYLEN  10
3414 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3415 {
3416         static uint32_t key[KILLTCP_KEYLEN];
3417
3418         bzero(key, sizeof(key));
3419
3420         if (src->sa.sa_family != dst->sa.sa_family) {
3421                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3422                 return key;
3423         }
3424         
3425         switch (src->sa.sa_family) {
3426         case AF_INET:
3427                 key[0]  = dst->ip.sin_addr.s_addr;
3428                 key[1]  = src->ip.sin_addr.s_addr;
3429                 key[2]  = dst->ip.sin_port;
3430                 key[3]  = src->ip.sin_port;
3431                 break;
3432         case AF_INET6: {
3433                 uint32_t *dst6_addr32 =
3434                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3435                 uint32_t *src6_addr32 =
3436                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3437                 key[0]  = dst6_addr32[3];
3438                 key[1]  = src6_addr32[3];
3439                 key[2]  = dst6_addr32[2];
3440                 key[3]  = src6_addr32[2];
3441                 key[4]  = dst6_addr32[1];
3442                 key[5]  = src6_addr32[1];
3443                 key[6]  = dst6_addr32[0];
3444                 key[7]  = src6_addr32[0];
3445                 key[8]  = dst->ip6.sin6_port;
3446                 key[9]  = src->ip6.sin6_port;
3447                 break;
3448         }
3449         default:
3450                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3451                 return key;
3452         }
3453
3454         return key;
3455 }
3456
3457 /*
3458   called when we get a read event on the raw socket
3459  */
3460 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3461                                 uint16_t flags, void *private_data)
3462 {
3463         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3464         struct ctdb_killtcp_con *con;
3465         ctdb_sock_addr src, dst;
3466         uint32_t ack_seq, seq;
3467
3468         if (!(flags & EVENT_FD_READ)) {
3469                 return;
3470         }
3471
3472         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3473                                 killtcp->private_data,
3474                                 &src, &dst,
3475                                 &ack_seq, &seq) != 0) {
3476                 /* probably a non-tcp ACK packet */
3477                 return;
3478         }
3479
3480         /* check if we have this guy in our list of connections
3481            to kill
3482         */
3483         con = trbt_lookuparray32(killtcp->connections, 
3484                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3485         if (con == NULL) {
3486                 /* no this was some other packet we can just ignore */
3487                 return;
3488         }
3489
3490         /* This one has been tickled !
3491            now reset him and remove him from the list.
3492          */
3493         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3494                 ntohs(con->dst_addr.ip.sin_port),
3495                 ctdb_addr_to_str(&con->src_addr),
3496                 ntohs(con->src_addr.ip.sin_port)));
3497
3498         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3499         talloc_free(con);
3500 }
3501
3502
3503 /* when traversing the list of all tcp connections to send tickle acks to
3504    (so that we can capture the ack coming back and kill the connection
3505     by a RST)
3506    this callback is called for each connection we are currently trying to kill
3507 */
3508 static int tickle_connection_traverse(void *param, void *data)
3509 {
3510         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3511
3512         /* have tried too many times, just give up */
3513         if (con->count >= 5) {
3514                 /* can't delete in traverse: reparent to delete_cons */
3515                 talloc_steal(param, con);
3516                 return 0;
3517         }
3518
3519         /* othervise, try tickling it again */
3520         con->count++;
3521         ctdb_sys_send_tcp(
3522                 (ctdb_sock_addr *)&con->dst_addr,
3523                 (ctdb_sock_addr *)&con->src_addr,
3524                 0, 0, 0);
3525         return 0;
3526 }
3527
3528
3529 /* 
3530    called every second until all sentenced connections have been reset
3531  */
3532 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3533                                               struct timeval t, void *private_data)
3534 {
3535         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3536         void *delete_cons = talloc_new(NULL);
3537
3538         /* loop over all connections sending tickle ACKs */
3539         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3540
3541         /* now we've finished traverse, it's safe to do deletion. */
3542         talloc_free(delete_cons);
3543
3544         /* If there are no more connections to kill we can remove the
3545            entire killtcp structure
3546          */
3547         if ( (killtcp->connections == NULL) || 
3548              (killtcp->connections->root == NULL) ) {
3549                 talloc_free(killtcp);
3550                 return;
3551         }
3552
3553         /* try tickling them again in a seconds time
3554          */
3555         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3556                         ctdb_tickle_sentenced_connections, killtcp);
3557 }
3558
3559 /*
3560   destroy the killtcp structure
3561  */
3562 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3563 {
3564         struct ctdb_vnn *tmpvnn;
3565
3566         /* verify that this vnn is still active */
3567         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3568                 if (tmpvnn == killtcp->vnn) {
3569                         break;
3570                 }
3571         }
3572
3573         if (tmpvnn == NULL) {
3574                 return 0;
3575         }
3576
3577         if (killtcp->vnn->killtcp != killtcp) {
3578                 return 0;
3579         }
3580
3581         killtcp->vnn->killtcp = NULL;
3582
3583         return 0;
3584 }
3585
3586
/* insert callback for the killtcp tree: the new connection structure
   (parm) always replaces whatever was stored under the key (data).

   the old structure, if any, is deliberately not freed here; it is
   talloc_stolen by the same node in the tree and goes away when the
   new data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;     /* the previous entry is intentionally ignored */

        return replacement;
}
3598
3599 /*
3600   add a tcp socket to the list of connections we want to RST
3601  */
3602 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3603                                        ctdb_sock_addr *s,
3604                                        ctdb_sock_addr *d)
3605 {
3606         ctdb_sock_addr src, dst;
3607         struct ctdb_kill_tcp *killtcp;
3608         struct ctdb_killtcp_con *con;
3609         struct ctdb_vnn *vnn;
3610
3611         ctdb_canonicalize_ip(s, &src);
3612         ctdb_canonicalize_ip(d, &dst);
3613
3614         vnn = find_public_ip_vnn(ctdb, &dst);
3615         if (vnn == NULL) {
3616                 vnn = find_public_ip_vnn(ctdb, &src);
3617         }
3618         if (vnn == NULL) {
3619                 /* if it is not a public ip   it could be our 'single ip' */
3620                 if (ctdb->single_ip_vnn) {
3621                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3622                                 vnn = ctdb->single_ip_vnn;
3623                         }
3624                 }
3625         }
3626         if (vnn == NULL) {
3627                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3628                 return -1;
3629         }
3630
3631         killtcp = vnn->killtcp;
3632         
3633         /* If this is the first connection to kill we must allocate
3634            a new structure
3635          */
3636         if (killtcp == NULL) {
3637                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3638                 CTDB_NO_MEMORY(ctdb, killtcp);
3639
3640                 killtcp->vnn         = vnn;
3641                 killtcp->ctdb        = ctdb;
3642                 killtcp->capture_fd  = -1;
3643                 killtcp->connections = trbt_create(killtcp, 0);
3644
3645                 vnn->killtcp         = killtcp;
3646                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3647         }
3648
3649
3650
3651         /* create a structure that describes this connection we want to
3652            RST and store it in killtcp->connections
3653         */
3654         con = talloc(killtcp, struct ctdb_killtcp_con);
3655         CTDB_NO_MEMORY(ctdb, con);
3656         con->src_addr = src;
3657         con->dst_addr = dst;
3658         con->count    = 0;
3659         con->killtcp  = killtcp;
3660
3661
3662         trbt_insertarray32_callback(killtcp->connections,
3663                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3664                         add_killtcp_callback, con);
3665
3666         /* 
3667            If we dont have a socket to listen on yet we must create it
3668          */
3669         if (killtcp->capture_fd == -1) {
3670                 const char *iface = ctdb_vnn_iface_string(vnn);
3671                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3672                 if (killtcp->capture_fd == -1) {
3673                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3674                                           "socket on iface '%s' for killtcp (%s)\n",
3675                                           iface, strerror(errno)));
3676                         goto failed;
3677                 }
3678         }
3679
3680
3681         if (killtcp->fde == NULL) {
3682                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3683                                             EVENT_FD_READ,
3684                                             capture_tcp_handler, killtcp);
3685                 tevent_fd_set_auto_close(killtcp->fde);
3686
3687                 /* We also need to set up some events to tickle all these connections
3688                    until they are all reset
3689                 */
3690                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3691                                 ctdb_tickle_sentenced_connections, killtcp);
3692         }
3693
3694         /* tickle him once now */
3695         ctdb_sys_send_tcp(
3696                 &con->dst_addr,
3697                 &con->src_addr,
3698                 0, 0, 0);
3699
3700         return 0;
3701
3702 failed:
3703         talloc_free(vnn->killtcp);
3704         vnn->killtcp = NULL;
3705         return -1;
3706 }
3707
3708 /*
3709   kill a TCP connection.
3710  */
3711 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3712 {
3713         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3714
3715         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3716 }
3717
3718 /*
3719   called by a daemon to inform us of the entire list of TCP tickles for
3720   a particular public address.
3721   this control should only be sent by the node that is currently serving
3722   that public address.
3723  */
3724 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3725 {
3726         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3727         struct ctdb_tcp_array *tcparray;
3728         struct ctdb_vnn *vnn;
3729
3730         /* We must at least have tickles.num or else we cant verify the size
3731            of the received data blob
3732          */
3733         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3734                                         tickles.connections)) {
3735                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3736                 return -1;
3737         }
3738
3739         /* verify that the size of data matches what we expect */
3740         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3741                                 tickles.connections)
3742                          + sizeof(struct ctdb_tcp_connection)
3743                                  * list->tickles.num) {
3744                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3745                 return -1;
3746         }
3747
3748         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3749                            ctdb_addr_to_str(&list->addr)));
3750
3751         vnn = find_public_ip_vnn(ctdb, &list->addr);
3752         if (vnn == NULL) {
3753                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3754                         ctdb_addr_to_str(&list->addr)));
3755
3756                 return 1;
3757         }
3758
3759         /* remove any old ticklelist we might have */
3760         talloc_free(vnn->tcp_array);
3761         vnn->tcp_array = NULL;
3762
3763         tcparray = talloc(vnn, struct ctdb_tcp_array);
3764         CTDB_NO_MEMORY(ctdb, tcparray);
3765
3766         tcparray->num = list->tickles.num;
3767
3768         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3769         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3770
3771         memcpy(tcparray->connections, &list->tickles.connections[0],
3772                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3773
3774         /* We now have a new fresh tickle list array for this vnn */
3775         vnn->tcp_array = tcparray;
3776
3777         return 0;
3778 }
3779
3780 /*
3781   called to return the full list of tickles for the puclic address associated 
3782   with the provided vnn
3783  */
3784 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3785 {
3786         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3787         struct ctdb_control_tcp_tickle_list *list;
3788         struct ctdb_tcp_array *tcparray;
3789         int num;
3790         struct ctdb_vnn *vnn;
3791
3792         vnn = find_public_ip_vnn(ctdb, addr);
3793         if (vnn == NULL) {
3794                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3795                         ctdb_addr_to_str(addr)));
3796
3797                 return 1;
3798         }
3799
3800         tcparray = vnn->tcp_array;
3801         if (tcparray) {
3802                 num = tcparray->num;
3803         } else {
3804                 num = 0;
3805         }
3806
3807         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3808                                 tickles.connections)
3809                         + sizeof(struct ctdb_tcp_connection) * num;
3810
3811         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3812         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3813         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3814
3815         list->addr = *addr;
3816         list->tickles.num = num;
3817         if (num) {
3818                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3819                         sizeof(struct ctdb_tcp_connection) * num);
3820         }
3821
3822         return 0;
3823 }
3824
3825
3826 /*
3827   set the list of all tcp tickles for a public address
3828  */
3829 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3830                                             ctdb_sock_addr *addr,
3831                                             struct ctdb_tcp_array *tcparray)
3832 {
3833         int ret, num;
3834         TDB_DATA data;
3835         struct ctdb_control_tcp_tickle_list *list;
3836
3837         if (tcparray) {
3838                 num = tcparray->num;
3839         } else {
3840                 num = 0;
3841         }
3842
3843         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3844                                 tickles.connections) +
3845                         sizeof(struct ctdb_tcp_connection) * num;
3846         data.dptr = talloc_size(ctdb, data.dsize);
3847         CTDB_NO_MEMORY(ctdb, data.dptr);
3848
3849         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3850         list->addr = *addr;
3851         list->tickles.num = num;
3852         if (tcparray) {
3853                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3854         }
3855
3856         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3857                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3858                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3859         if (ret != 0) {
3860                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3861                 return -1;
3862         }
3863
3864         talloc_free(data.dptr);
3865
3866         return ret;
3867 }
3868
3869
3870 /*
3871   perform tickle updates if required
3872  */
3873 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3874                                 struct timed_event *te, 
3875                                 struct timeval t, void *private_data)
3876 {
3877         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3878         int ret;
3879         struct ctdb_vnn *vnn;
3880
3881         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3882                 /* we only send out updates for public addresses that 
3883                    we have taken over
3884                  */
3885                 if (ctdb->pnn != vnn->pnn) {
3886                         continue;
3887                 }
3888                 /* We only send out the updates if we need to */
3889                 if (!vnn->tcp_update_needed) {
3890                         continue;
3891                 }
3892                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3893                                                        &vnn->public_address,
3894                                                        vnn->tcp_array);
3895                 if (ret != 0) {
3896                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3897                                 ctdb_addr_to_str(&vnn->public_address)));
3898                 } else {
3899                         DEBUG(DEBUG_INFO,
3900                               ("Sent tickle update for public address %s\n",
3901                                ctdb_addr_to_str(&vnn->public_address)));
3902                         vnn->tcp_update_needed = false;
3903                 }
3904         }
3905
3906         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3907                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3908                              ctdb_update_tcp_tickles, ctdb);
3909 }               
3910         
3911
3912 /*
3913   start periodic update of tcp tickles
3914  */
3915 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3916 {
3917         ctdb->tickle_update_context = talloc_new(ctdb);
3918
3919         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3920                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3921                              ctdb_update_tcp_tickles, ctdb);
3922 }
3923
3924
3925
3926
/* state for a series of gratuitous ARPs driven by send_gratious_arp() */
struct control_gratious_arp {
        struct ctdb_context *ctdb;      /* context whose event loop schedules the resends */
        ctdb_sock_addr addr;            /* address passed to ctdb_sys_send_arp() */
        const char *iface;              /* interface name passed to ctdb_sys_send_arp() */
        int count;                      /* send attempts made so far; the series ends at CTDB_ARP_REPEAT */
};
3933
3934 /*
3935   send a control_gratuitous arp
3936  */
3937 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3938                                   struct timeval t, void *private_data)
3939 {
3940         int ret;
3941         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3942                                                         struct control_gratious_arp);
3943
3944         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3945         if (ret != 0) {
3946                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3947                                  arp->iface, strerror(errno)));
3948         }
3949
3950
3951         arp->count++;
3952         if (arp->count == CTDB_ARP_REPEAT) {
3953                 talloc_free(arp);
3954                 return;
3955         }
3956
3957         event_add_timed(arp->ctdb->ev, arp, 
3958                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3959                         send_gratious_arp, arp);
3960 }
3961
3962
3963 /*
3964   send a gratious arp 
3965  */
3966 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3967 {
3968         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3969         struct control_gratious_arp *arp;
3970
3971         /* verify the size of indata */
3972         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3973                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3974                                  (unsigned)indata.dsize, 
3975                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3976                 return -1;
3977         }
3978         if (indata.dsize != 
3979                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3980                 + gratious_arp->len ) ){
3981
3982                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3983                         "but should be %u bytes\n", 
3984                          (unsigned)indata.dsize, 
3985                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3986                 return -1;
3987         }
3988
3989
3990         arp = talloc(ctdb, struct control_gratious_arp);
3991         CTDB_NO_MEMORY(ctdb, arp);
3992
3993         arp->ctdb  = ctdb;
3994         arp->addr   = gratious_arp->addr;
3995         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3996         CTDB_NO_MEMORY(ctdb, arp->iface);
3997         arp->count = 0;
3998         
3999         event_add_timed(arp->ctdb->ev, arp, 
4000                         timeval_zero(), send_gratious_arp, arp);
4001
4002         return 0;
4003 }
4004
4005 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4006 {
4007         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4008         int ret;
4009
4010         /* verify the size of indata */
4011         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4012                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4013                 return -1;
4014         }
4015         if (indata.dsize != 
4016                 ( offsetof(struct ctdb_control_ip_iface, iface)
4017                 + pub->len ) ){
4018
4019                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4020                         "but should be %u bytes\n", 
4021                          (unsigned)indata.dsize, 
4022                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4023                 return -1;
4024         }
4025
4026         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4027
4028         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4029
4030         if (ret != 0) {
4031                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4032                 return -1;
4033         }
4034
4035         return 0;
4036 }
4037
/* Private data handed to delete_ip_callback() */
struct delete_ip_callback_state {
        /* the delete request that is answered from the callback */
        struct ctdb_req_control *c;
};
4041
4042 /*
4043   called when releaseip event finishes for del_public_address
4044  */
4045 static void delete_ip_callback(struct ctdb_context *ctdb,
4046                                int32_t status, TDB_DATA data,
4047                                const char *errormsg,
4048                                void *private_data)
4049 {
4050         struct delete_ip_callback_state *state =
4051                 talloc_get_type(private_data, struct delete_ip_callback_state);
4052
4053         /* If release failed then fail. */
4054         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4055         talloc_free(private_data);
4056 }
4057
4058 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4059                                         struct ctdb_req_control *c,
4060                                         TDB_DATA indata, bool *async_reply)
4061 {
4062         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4063         struct ctdb_vnn *vnn;
4064
4065         /* verify the size of indata */
4066         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4067                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4068                 return -1;
4069         }
4070         if (indata.dsize != 
4071                 ( offsetof(struct ctdb_control_ip_iface, iface)
4072                 + pub->len ) ){
4073
4074                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4075                         "but should be %u bytes\n", 
4076                          (unsigned)indata.dsize, 
4077                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4078                 return -1;
4079         }
4080
4081         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4082
4083         /* walk over all public addresses until we find a match */
4084         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4085                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4086                         if (vnn->pnn == ctdb->pnn) {
4087                                 struct delete_ip_callback_state *state;
4088                                 struct ctdb_public_ip *ip;
4089                                 TDB_DATA data;
4090                                 int ret;
4091
4092                                 vnn->delete_pending = true;
4093
4094                                 state = talloc(ctdb,
4095                                                struct delete_ip_callback_state);
4096                                 CTDB_NO_MEMORY(ctdb, state);
4097                                 state->c = c;
4098
4099                                 ip = talloc(state, struct ctdb_public_ip);
4100                                 if (ip == NULL) {
4101                                         DEBUG(DEBUG_ERR,
4102                                               (__location__ " Out of memory\n"));
4103                                         talloc_free(state);
4104                                         return -1;
4105                                 }
4106                                 ip->pnn = -1;
4107                                 ip->addr = pub->addr;
4108
4109                                 data.dsize = sizeof(struct ctdb_public_ip);
4110                                 data.dptr = (unsigned char *)ip;
4111
4112                                 ret = ctdb_daemon_send_control(ctdb,
4113                                                                ctdb_get_pnn(ctdb),
4114                                                                0,
4115                                                                CTDB_CONTROL_RELEASE_IP,
4116                                                                0, 0,
4117                                                                data,
4118                                                                delete_ip_callback,
4119                                                                state);
4120                                 if (ret == -1) {
4121                                         DEBUG(DEBUG_ERR,
4122                                               (__location__ "Unable to send "
4123                                                "CTDB_CONTROL_RELEASE_IP\n"));
4124                                         talloc_free(state);
4125                                         return -1;
4126                                 }
4127
4128                                 state->c = talloc_steal(state, c);
4129                                 *async_reply = true;
4130                         } else {
4131                                 /* This IP is not hosted on the
4132                                  * current node so just delete it
4133                                  * now. */
4134                                 do_delete_ip(ctdb, vnn);
4135                         }
4136
4137                         return 0;
4138                 }
4139         }
4140
4141         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4142                          ctdb_addr_to_str(&pub->addr)));
4143         return -1;
4144 }
4145
4146
/* Private data handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
        /* the control that is answered once the event script has finished */
        struct ctdb_req_control *c;
};
4150
4151 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4152                                         int status, void *p)
4153 {
4154         struct ipreallocated_callback_state *state =
4155                 talloc_get_type(p, struct ipreallocated_callback_state);
4156
4157         if (status != 0) {
4158                 DEBUG(DEBUG_ERR,
4159                       (" \"ipreallocated\" event script failed (status %d)\n",
4160                        status));
4161                 if (status == -ETIME) {
4162                         ctdb_ban_self(ctdb);
4163                 }
4164         }
4165
4166         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4167         talloc_free(state);
4168 }
4169
4170 /* A control to run the ipreallocated event */
4171 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4172                                    struct ctdb_req_control *c,
4173                                    bool *async_reply)
4174 {
4175         int ret;
4176         struct ipreallocated_callback_state *state;
4177
4178         state = talloc(ctdb, struct ipreallocated_callback_state);
4179         CTDB_NO_MEMORY(ctdb, state);
4180
4181         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4182
4183         ret = ctdb_event_script_callback(ctdb, state,
4184                                          ctdb_ipreallocated_callback, state,
4185                                          CTDB_EVENT_IPREALLOCATED,
4186                                          "%s", "");
4187
4188         if (ret != 0) {
4189                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4190                 talloc_free(state);
4191                 return -1;
4192         }
4193
4194         /* tell the control that we will be reply asynchronously */
4195         state->c    = talloc_steal(state, c);
4196         *async_reply = true;
4197
4198         return 0;
4199 }
4200
4201
4202 /* This function is called from the recovery daemon to verify that a remote
4203    node has the expected ip allocation.
4204    This is verified against ctdb->ip_tree
4205 */
4206 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4207                                 struct ctdb_all_public_ips *ips,
4208                                 uint32_t pnn)
4209 {
4210         struct ctdb_public_ip_list *tmp_ip; 
4211         int i;
4212
4213         if (ctdb->ip_tree == NULL) {
4214                 /* dont know the expected allocation yet, assume remote node
4215                    is correct. */
4216                 return 0;
4217         }
4218
4219         if (ips == NULL) {
4220                 return 0;
4221         }
4222
4223         for (i=0; i<ips->num; i++) {
4224                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4225                 if (tmp_ip == NULL) {
4226                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4227                         return -1;
4228                 }
4229
4230                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4231                         continue;
4232                 }
4233
4234                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4235                         DEBUG(DEBUG_ERR,
4236                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4237                                pnn,
4238                                ctdb_addr_to_str(&ips->ips[i].addr),
4239                                ips->ips[i].pnn, tmp_ip->pnn));
4240                         return -1;
4241                 }
4242         }
4243
4244         return 0;
4245 }
4246
4247 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4248 {
4249         struct ctdb_public_ip_list *tmp_ip; 
4250
4251         if (ctdb->ip_tree == NULL) {
4252                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4253                 return -1;
4254         }
4255
4256         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4257         if (tmp_ip == NULL) {
4258                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4259                 return -1;
4260         }
4261
4262         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4263         tmp_ip->pnn = ip->pnn;
4264
4265         return 0;
4266 }
4267
4268
/*
  State of one "reload public IPs" operation that is carried out by a
  forked child process.
 */
struct ctdb_reloadips_handle {
        /* daemon context this operation belongs to */
        struct ctdb_context *ctdb;
        /* control to answer when the handle is destroyed, or NULL */
        struct ctdb_req_control *c;
        /* status sent in the reply; starts as -1 and is overwritten with
         * the result read from the child */
        int status;
        /* pipe from child to parent: [0] is read by the parent,
         * [1] is written by the child */
        int fd[2];
        /* pid of the forked child; killed when the handle is destroyed */
        pid_t child;
        /* fd event watching fd[0] for the child's result byte */
        struct fd_event *fde;
};
4277
4278 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4279 {
4280         if (h == h->ctdb->reload_ips) {
4281                 h->ctdb->reload_ips = NULL;
4282         }
4283         if (h->c != NULL) {
4284                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4285                 h->c = NULL;
4286         }
4287         ctdb_kill(h->ctdb, h->child, SIGKILL);
4288         return 0;
4289 }
4290
4291 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4292                                 struct timed_event *te,
4293                                 struct timeval t, void *private_data)
4294 {
4295         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4296
4297         talloc_free(h);
4298 }       
4299
4300 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4301                              uint16_t flags, void *private_data)
4302 {
4303         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4304
4305         char res;
4306         int ret;
4307
4308         ret = sys_read(h->fd[0], &res, 1);
4309         if (ret < 1 || res != 0) {
4310                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4311                 res = 1;
4312         }
4313         h->status = res;
4314
4315         talloc_free(h);
4316 }
4317
4318 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4319 {
4320         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4321         struct ctdb_all_public_ips *ips;
4322         struct ctdb_vnn *vnn;
4323         struct client_async_data *async_data;
4324         struct timeval timeout;
4325         TDB_DATA data;
4326         struct ctdb_client_control_state *state;
4327         bool first_add;
4328         int i, ret;
4329
4330         CTDB_NO_MEMORY(ctdb, mem_ctx);
4331
4332         /* Read IPs from local node */
4333         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4334                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4335         if (ret != 0) {
4336                 DEBUG(DEBUG_ERR,
4337                       ("Unable to fetch public IPs from local node\n"));
4338                 talloc_free(mem_ctx);
4339                 return -1;
4340         }
4341
4342         /* Read IPs file - this is safe since this is a child process */
4343         ctdb->vnn = NULL;
4344         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4345                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4346                 talloc_free(mem_ctx);
4347                 return -1;
4348         }
4349
4350         async_data = talloc_zero(mem_ctx, struct client_async_data);
4351         CTDB_NO_MEMORY(ctdb, async_data);
4352
4353         /* Compare IPs between node and file for IPs to be deleted */
4354         for (i = 0; i < ips->num; i++) {
4355                 /* */
4356                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4357                         if (ctdb_same_ip(&vnn->public_address,
4358                                          &ips->ips[i].addr)) {
4359                                 /* IP is still in file */
4360                                 break;
4361                         }
4362                 }
4363
4364                 if (vnn == NULL) {
4365                         /* Delete IP ips->ips[i] */
4366                         struct ctdb_control_ip_iface *pub;
4367
4368                         DEBUG(DEBUG_NOTICE,
4369                               ("IP %s no longer configured, deleting it\n",
4370                                ctdb_addr_to_str(&ips->ips[i].addr)));
4371
4372                         pub = talloc_zero(mem_ctx,
4373                                           struct ctdb_control_ip_iface);
4374                         CTDB_NO_MEMORY(ctdb, pub);
4375
4376                         pub->addr  = ips->ips[i].addr;
4377                         pub->mask  = 0;
4378                         pub->len   = 0;
4379
4380                         timeout = TAKEOVER_TIMEOUT();
4381
4382                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4383                                               iface) + pub->len;
4384                         data.dptr = (uint8_t *)pub;
4385
4386                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4387                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4388                                                   0, data, async_data,
4389                                                   &timeout, NULL);
4390                         if (state == NULL) {
4391                                 DEBUG(DEBUG_ERR,
4392                                       (__location__
4393                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4394                                 goto failed;
4395                         }
4396
4397                         ctdb_client_async_add(async_data, state);
4398                 }
4399         }
4400
4401         /* Compare IPs between node and file for IPs to be added */
4402         first_add = true;
4403         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4404                 for (i = 0; i < ips->num; i++) {
4405                         if (ctdb_same_ip(&vnn->public_address,
4406                                          &ips->ips[i].addr)) {
4407                                 /* IP already on node */
4408                                 break;
4409                         }
4410                 }
4411                 if (i == ips->num) {
4412                         /* Add IP ips->ips[i] */
4413                         struct ctdb_control_ip_iface *pub;
4414                         const char *ifaces = NULL;
4415                         uint32_t len;
4416                         int iface = 0;
4417
4418                         DEBUG(DEBUG_NOTICE,
4419                               ("New IP %s configured, adding it\n",
4420                                ctdb_addr_to_str(&vnn->public_address)));
4421                         if (first_add) {
4422                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4423
4424                                 data.dsize = sizeof(pnn);
4425                                 data.dptr  = (uint8_t *)&pnn;
4426
4427                                 ret = ctdb_client_send_message(
4428                                         ctdb,
4429                                         CTDB_BROADCAST_CONNECTED,
4430                                         CTDB_SRVID_REBALANCE_NODE,
4431                                         data);
4432                                 if (ret != 0) {
4433                                         DEBUG(DEBUG_WARNING,
4434                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4435                                 }
4436
4437                                 first_add = false;
4438                         }
4439
4440                         ifaces = vnn->ifaces[0];
4441                         iface = 1;
4442                         while (vnn->ifaces[iface] != NULL) {
4443                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4444                                                          vnn->ifaces[iface]);
4445                                 iface++;
4446                         }
4447
4448                         len   = strlen(ifaces) + 1;
4449                         pub = talloc_zero_size(mem_ctx,
4450                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4451                         CTDB_NO_MEMORY(ctdb, pub);
4452
4453                         pub->addr  = vnn->public_address;
4454                         pub->mask  = vnn->public_netmask_bits;
4455                         pub->len   = len;
4456                         memcpy(&pub->iface[0], ifaces, pub->len);
4457
4458                         timeout = TAKEOVER_TIMEOUT();
4459
4460                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4461                                               iface) + pub->len;
4462                         data.dptr = (uint8_t *)pub;
4463
4464                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4465                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4466                                                   0, data, async_data,
4467                                                   &timeout, NULL);
4468                         if (state == NULL) {
4469                                 DEBUG(DEBUG_ERR,
4470                                       (__location__
4471                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4472                                 goto failed;
4473                         }
4474
4475                         ctdb_client_async_add(async_data, state);
4476                 }
4477         }
4478
4479         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4480                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4481                 goto failed;
4482         }
4483
4484         talloc_free(mem_ctx);
4485         return 0;
4486
4487 failed:
4488         talloc_free(mem_ctx);
4489         return -1;
4490 }
4491
4492 /* This control is sent to force the node to re-read the public addresses file
4493    and drop any addresses we should nnot longer host, and add new addresses
4494    that we are now able to host
4495 */
4496 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4497 {
4498         struct ctdb_reloadips_handle *h;
4499         pid_t parent = getpid();
4500
4501         if (ctdb->reload_ips != NULL) {
4502                 talloc_free(ctdb->reload_ips);
4503                 ctdb->reload_ips = NULL;
4504         }
4505
4506         h = talloc(ctdb, struct ctdb_reloadips_handle);
4507         CTDB_NO_MEMORY(ctdb, h);
4508         h->ctdb     = ctdb;
4509         h->c        = NULL;
4510         h->status   = -1;
4511         
4512         if (pipe(h->fd) == -1) {
4513                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4514                 talloc_free(h);
4515                 return -1;
4516         }
4517
4518         h->child = ctdb_fork(ctdb);
4519         if (h->child == (pid_t)-1) {
4520                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4521                 close(h->fd[0]);
4522                 close(h->fd[1]);
4523                 talloc_free(h);
4524                 return -1;
4525         }
4526
4527         /* child process */
4528         if (h->child == 0) {
4529                 signed char res = 0;
4530
4531                 close(h->fd[0]);
4532                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4533
4534                 ctdb_set_process_name("ctdb_reloadips");
4535                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4536                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4537                         res = -1;
4538                 } else {
4539                         res = ctdb_reloadips_child(ctdb);
4540                         if (res != 0) {
4541                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4542                         }
4543                 }
4544
4545                 sys_write(h->fd[1], &res, 1);
4546                 /* make sure we die when our parent dies */
4547                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4548                         sleep(5);
4549                 }
4550                 _exit(0);
4551         }
4552
4553         h->c             = talloc_steal(h, c);
4554
4555         close(h->fd[1]);
4556         set_close_on_exec(h->fd[0]);
4557
4558         talloc_set_destructor(h, ctdb_reloadips_destructor);
4559
4560
4561         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4562                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4563                         (void *)h);
4564         tevent_fd_set_auto_close(h->fde);
4565
4566         event_add_timed(ctdb->ev, h,
4567                         timeval_current_ofs(120, 0),
4568                         ctdb_reloadips_timeout_event, h);
4569
4570         /* we reply later */
4571         *async_reply = true;
4572         return 0;
4573 }