ctdb-daemon: Stop using tevent compatibility definitions
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29 #include "common/reqid.h"
30 #include "common/system.h"
31
32
/*
 * Timeout value for takeover operations, built from the
 * takeover_timeout tunable.  The expansion refers to a variable named
 * `ctdb`, so it can only be used where one is in scope.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* First argument given to timeval_current_ofs() when the gratuitous
 * ARP timer is re-armed (presumably seconds -- confirm against the
 * timeval helper). */
#define CTDB_ARP_INTERVAL 1

/* The ARP/tickle handler stops rescheduling itself once it has run
 * this many times. */
#define CTDB_ARP_REPEAT   3
37
38 /* Flags used in IP allocation algorithms. */
39 struct ctdb_ipflags {
40         bool noiptakeover;
41         bool noiphost;
42         enum ctdb_runstate runstate;
43 };
44
/*
 * One network interface known to the daemon.  Entries are kept on a
 * doubly linked list (ctdb->ifaces).
 */
struct ctdb_iface {
        /* list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name, e.g. as listed for a public address */
        const char *name;
        /* current link state; new entries start out as up */
        bool link_up;
        /* number of public addresses currently assigned to this iface */
        uint32_t references;
};
51
52 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
53 {
54         if (vnn->iface) {
55                 return vnn->iface->name;
56         }
57
58         return "__none__";
59 }
60
61 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
62 {
63         struct ctdb_iface *i;
64
65         /* Verify that we dont have an entry for this ip yet */
66         for (i=ctdb->ifaces;i;i=i->next) {
67                 if (strcmp(i->name, iface) == 0) {
68                         return 0;
69                 }
70         }
71
72         /* create a new structure for this interface */
73         i = talloc_zero(ctdb, struct ctdb_iface);
74         CTDB_NO_MEMORY_FATAL(ctdb, i);
75         i->name = talloc_strdup(i, iface);
76         CTDB_NO_MEMORY(ctdb, i->name);
77
78         i->link_up = true;
79
80         DLIST_ADD(ctdb->ifaces, i);
81
82         return 0;
83 }
84
85 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
86                                         const char *name)
87 {
88         int n;
89
90         for (n = 0; vnn->ifaces[n] != NULL; n++) {
91                 if (strcmp(name, vnn->ifaces[n]) == 0) {
92                         return true;
93                 }
94         }
95
96         return false;
97 }
98
99 /* If any interfaces now have no possible IPs then delete them.  This
100  * implementation is naive (i.e. simple) rather than clever
101  * (i.e. complex).  Given that this is run on delip and that operation
102  * is rare, this doesn't need to be efficient - it needs to be
103  * foolproof.  One alternative is reference counting, where the logic
104  * is distributed and can, therefore, be broken in multiple places.
105  * Another alternative is to build a red-black tree of interfaces that
106  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
107  * once) and then walking ctdb->ifaces once and deleting those not in
108  * the tree.  Let's go to one of those if the naive implementation
109  * causes problems...  :-)
110  */
111 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
112                                         struct ctdb_vnn *vnn)
113 {
114         struct ctdb_iface *i, *next;
115
116         /* For each interface, check if there's an IP using it. */
117         for (i = ctdb->ifaces; i != NULL; i = next) {
118                 struct ctdb_vnn *tv;
119                 bool found;
120                 next = i->next;
121
122                 /* Only consider interfaces named in the given VNN. */
123                 if (!vnn_has_interface_with_name(vnn, i->name)) {
124                         continue;
125                 }
126
127                 /* Is the "single IP" on this interface? */
128                 if ((ctdb->single_ip_vnn != NULL) &&
129                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
130                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
131                         /* Found, next interface please... */
132                         continue;
133                 }
134                 /* Search for a vnn with this interface. */
135                 found = false;
136                 for (tv=ctdb->vnn; tv; tv=tv->next) {
137                         if (vnn_has_interface_with_name(tv, i->name)) {
138                                 found = true;
139                                 break;
140                         }
141                 }
142
143                 if (!found) {
144                         /* None of the VNNs are using this interface. */
145                         DLIST_REMOVE(ctdb->ifaces, i);
146                         talloc_free(i);
147                 }
148         }
149 }
150
151
152 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
153                                           const char *iface)
154 {
155         struct ctdb_iface *i;
156
157         for (i=ctdb->ifaces;i;i=i->next) {
158                 if (strcmp(i->name, iface) == 0) {
159                         return i;
160                 }
161         }
162
163         return NULL;
164 }
165
166 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
167                                               struct ctdb_vnn *vnn)
168 {
169         int i;
170         struct ctdb_iface *cur = NULL;
171         struct ctdb_iface *best = NULL;
172
173         for (i=0; vnn->ifaces[i]; i++) {
174
175                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
176                 if (cur == NULL) {
177                         continue;
178                 }
179
180                 if (!cur->link_up) {
181                         continue;
182                 }
183
184                 if (best == NULL) {
185                         best = cur;
186                         continue;
187                 }
188
189                 if (cur->references < best->references) {
190                         best = cur;
191                         continue;
192                 }
193         }
194
195         return best;
196 }
197
198 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
199                                      struct ctdb_vnn *vnn)
200 {
201         struct ctdb_iface *best = NULL;
202
203         if (vnn->iface) {
204                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
205                                    "still assigned to iface '%s'\n",
206                                    ctdb_addr_to_str(&vnn->public_address),
207                                    ctdb_vnn_iface_string(vnn)));
208                 return 0;
209         }
210
211         best = ctdb_vnn_best_iface(ctdb, vnn);
212         if (best == NULL) {
213                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
214                                   "cannot assign to iface any iface\n",
215                                   ctdb_addr_to_str(&vnn->public_address)));
216                 return -1;
217         }
218
219         vnn->iface = best;
220         best->references++;
221         vnn->pnn = ctdb->pnn;
222
223         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
224                            "now assigned to iface '%s' refs[%d]\n",
225                            ctdb_addr_to_str(&vnn->public_address),
226                            ctdb_vnn_iface_string(vnn),
227                            best->references));
228         return 0;
229 }
230
231 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
232                                     struct ctdb_vnn *vnn)
233 {
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now unassigned (old iface '%s' refs[%d])\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            vnn->iface?vnn->iface->references:0));
239         if (vnn->iface) {
240                 vnn->iface->references--;
241         }
242         vnn->iface = NULL;
243         if (vnn->pnn == ctdb->pnn) {
244                 vnn->pnn = -1;
245         }
246 }
247
248 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
249                                struct ctdb_vnn *vnn)
250 {
251         int i;
252
253         if (vnn->delete_pending) {
254                 return false;
255         }
256
257         if (vnn->iface && vnn->iface->link_up) {
258                 return true;
259         }
260
261         for (i=0; vnn->ifaces[i]; i++) {
262                 struct ctdb_iface *cur;
263
264                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
265                 if (cur == NULL) {
266                         continue;
267                 }
268
269                 if (cur->link_up) {
270                         return true;
271                 }
272         }
273
274         return false;
275 }
276
277 struct ctdb_takeover_arp {
278         struct ctdb_context *ctdb;
279         uint32_t count;
280         ctdb_sock_addr addr;
281         struct ctdb_tcp_array *tcparray;
282         struct ctdb_vnn *vnn;
283 };
284
285
286 /*
287   lists of tcp endpoints
288  */
289 struct ctdb_tcp_list {
290         struct ctdb_tcp_list *prev, *next;
291         struct ctdb_tcp_connection connection;
292 };
293
294 /*
295   list of clients to kill on IP release
296  */
297 struct ctdb_client_ip {
298         struct ctdb_client_ip *prev, *next;
299         struct ctdb_context *ctdb;
300         ctdb_sock_addr addr;
301         uint32_t client_id;
302 };
303
304
305 /*
306   send a gratuitous arp
307  */
308 static void ctdb_control_send_arp(struct tevent_context *ev,
309                                   struct tevent_timer *te,
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
353                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
354                          ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
390                          timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/*
 * Context handed to the takeip event callback.
 */
struct ctdb_do_takeip_state {
        /* the control request to reply to once the event finishes */
        struct ctdb_req_control *c;
        /* the VNN being taken over */
        struct ctdb_vnn *vnn;
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          CTDB_EVENT_TAKE_IP,
509                                          "%s %s %u",
510                                          ctdb_vnn_iface_string(vnn),
511                                          ctdb_addr_to_str(&vnn->public_address),
512                                          vnn->public_netmask_bits);
513
514         if (ret != 0) {
515                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
516                         ctdb_addr_to_str(&vnn->public_address),
517                         ctdb_vnn_iface_string(vnn)));
518                 talloc_free(state);
519                 return -1;
520         }
521
522         return 0;
523 }
524
/*
 * Context handed to the updateip event callback.
 */
struct ctdb_do_updateip_state {
        /* the control request to reply to once the event finishes */
        struct ctdb_req_control *c;
        /* interface the address was on before the move */
        struct ctdb_iface *old;
        /* the VNN being moved */
        struct ctdb_vnn *vnn;
};
530
531 /*
532   called when updateip event finishes
533  */
534 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
535                                       void *private_data)
536 {
537         struct ctdb_do_updateip_state *state =
538                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
539         int32_t ret;
540
541         if (status != 0) {
542                 if (status == -ETIME) {
543                         ctdb_ban_self(ctdb);
544                 }
545                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
546                         ctdb_addr_to_str(&state->vnn->public_address),
547                         state->old->name,
548                         ctdb_vnn_iface_string(state->vnn)));
549
550                 /*
551                  * All we can do is reset the old interface
552                  * and let the next run fix it
553                  */
554                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
555                 state->vnn->iface = state->old;
556                 state->vnn->iface->references++;
557
558                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
559                 talloc_free(state);
560                 return;
561         }
562
563         if (ctdb->do_checkpublicip) {
564
565         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
566         if (ret != 0) {
567                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
568                 talloc_free(state);
569                 return;
570         }
571
572         }
573
574         /* the control succeeded */
575         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
576         talloc_free(state);
577         return;
578 }
579
580 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
581 {
582         state->vnn->update_in_flight = false;
583         return 0;
584 }
585
586 /*
587   update (move) an ip address
588  */
589 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
590                                 struct ctdb_req_control *c,
591                                 struct ctdb_vnn *vnn)
592 {
593         int ret;
594         struct ctdb_do_updateip_state *state;
595         struct ctdb_iface *old = vnn->iface;
596         const char *new_name;
597
598         if (vnn->update_in_flight) {
599                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
600                                     "update for this IP already in flight\n",
601                                     ctdb_addr_to_str(&vnn->public_address),
602                                     vnn->public_netmask_bits));
603                 return -1;
604         }
605
606         ctdb_vnn_unassign_iface(ctdb, vnn);
607         ret = ctdb_vnn_assign_iface(ctdb, vnn);
608         if (ret != 0) {
609                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
610                                  "assin a usable interface (old iface '%s')\n",
611                                  ctdb_addr_to_str(&vnn->public_address),
612                                  vnn->public_netmask_bits,
613                                  old->name));
614                 return -1;
615         }
616
617         new_name = ctdb_vnn_iface_string(vnn);
618         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
619                 /* A benign update from one interface onto itself.
620                  * no need to run the eventscripts in this case, just return
621                  * success.
622                  */
623                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
624                 return 0;
625         }
626
627         state = talloc(vnn, struct ctdb_do_updateip_state);
628         CTDB_NO_MEMORY(ctdb, state);
629
630         state->c = talloc_steal(ctdb, c);
631         state->old = old;
632         state->vnn = vnn;
633
634         vnn->update_in_flight = true;
635         talloc_set_destructor(state, ctdb_updateip_destructor);
636
637         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
638                             "interface %s to %s\n",
639                             ctdb_addr_to_str(&vnn->public_address),
640                             vnn->public_netmask_bits,
641                             old->name,
642                             new_name));
643
644         ret = ctdb_event_script_callback(ctdb,
645                                          state,
646                                          ctdb_do_updateip_callback,
647                                          state,
648                                          CTDB_EVENT_UPDATE_IP,
649                                          "%s %s %s %u",
650                                          state->old->name,
651                                          new_name,
652                                          ctdb_addr_to_str(&vnn->public_address),
653                                          vnn->public_netmask_bits);
654         if (ret != 0) {
655                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
656                                  ctdb_addr_to_str(&vnn->public_address),
657                                  old->name, new_name));
658                 talloc_free(state);
659                 return -1;
660         }
661
662         return 0;
663 }
664
665 /*
666   Find the vnn of the node that has a public ip address
667   returns -1 if the address is not known as a public address
668  */
669 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
670 {
671         struct ctdb_vnn *vnn;
672
673         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
674                 if (ctdb_same_ip(&vnn->public_address, addr)) {
675                         return vnn;
676                 }
677         }
678
679         return NULL;
680 }
681
682 /*
683   take over an ip address
684  */
685 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
686                                  struct ctdb_req_control *c,
687                                  TDB_DATA indata,
688                                  bool *async_reply)
689 {
690         int ret;
691         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
692         struct ctdb_vnn *vnn;
693         bool have_ip = false;
694         bool do_updateip = false;
695         bool do_takeip = false;
696         struct ctdb_iface *best_iface = NULL;
697
698         if (pip->pnn != ctdb->pnn) {
699                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
700                                  "with pnn %d, but we're node %d\n",
701                                  ctdb_addr_to_str(&pip->addr),
702                                  pip->pnn, ctdb->pnn));
703                 return -1;
704         }
705
706         /* update out vnn list */
707         vnn = find_public_ip_vnn(ctdb, &pip->addr);
708         if (vnn == NULL) {
709                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
710                         ctdb_addr_to_str(&pip->addr)));
711                 return 0;
712         }
713
714         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
715                 have_ip = ctdb_sys_have_ip(&pip->addr);
716         }
717         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
718         if (best_iface == NULL) {
719                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
720                                  "a usable interface (old %s, have_ip %d)\n",
721                                  ctdb_addr_to_str(&vnn->public_address),
722                                  vnn->public_netmask_bits,
723                                  ctdb_vnn_iface_string(vnn),
724                                  have_ip));
725                 return -1;
726         }
727
728         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
729                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
730                 have_ip = false;
731         }
732
733
734         if (vnn->iface == NULL && have_ip) {
735                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
736                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
737                                  ctdb_addr_to_str(&vnn->public_address)));
738                 return 0;
739         }
740
741         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "and we have it on iface[%s], but it was assigned to node %d"
744                                   "and we are node %d, banning ourself\n",
745                                  ctdb_addr_to_str(&vnn->public_address),
746                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
747                 ctdb_ban_self(ctdb);
748                 return -1;
749         }
750
751         if (vnn->pnn == -1 && have_ip) {
752                 vnn->pnn = ctdb->pnn;
753                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
754                                   "and we already have it on iface[%s], update local daemon\n",
755                                  ctdb_addr_to_str(&vnn->public_address),
756                                   ctdb_vnn_iface_string(vnn)));
757                 return 0;
758         }
759
760         if (vnn->iface) {
761                 if (vnn->iface != best_iface) {
762                         if (!vnn->iface->link_up) {
763                                 do_updateip = true;
764                         } else if (vnn->iface->references > (best_iface->references + 1)) {
765                                 /* only move when the rebalance gains something */
766                                         do_updateip = true;
767                         }
768                 }
769         }
770
771         if (!have_ip) {
772                 if (do_updateip) {
773                         ctdb_vnn_unassign_iface(ctdb, vnn);
774                         do_updateip = false;
775                 }
776                 do_takeip = true;
777         }
778
779         if (do_takeip) {
780                 ret = ctdb_do_takeip(ctdb, c, vnn);
781                 if (ret != 0) {
782                         return -1;
783                 }
784         } else if (do_updateip) {
785                 ret = ctdb_do_updateip(ctdb, c, vnn);
786                 if (ret != 0) {
787                         return -1;
788                 }
789         } else {
790                 /*
791                  * The interface is up and the kernel known the ip
792                  * => do nothing
793                  */
794                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
795                         ctdb_addr_to_str(&pip->addr),
796                         vnn->public_netmask_bits,
797                         ctdb_vnn_iface_string(vnn)));
798                 return 0;
799         }
800
801         /* tell ctdb_control.c that we will be replying asynchronously */
802         *async_reply = true;
803
804         return 0;
805 }
806
807 /*
808   kill any clients that are registered with a IP that is being released
809  */
810 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
811 {
812         struct ctdb_client_ip *ip;
813
814         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
815                 ctdb_addr_to_str(addr)));
816
817         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
818                 ctdb_sock_addr tmp_addr;
819
820                 tmp_addr = ip->addr;
821                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
822                         ip->client_id,
823                         ctdb_addr_to_str(&ip->addr)));
824
825                 if (ctdb_same_ip(&tmp_addr, addr)) {
826                         struct ctdb_client *client = reqid_find(ctdb->idr,
827                                                                 ip->client_id,
828                                                                 struct ctdb_client);
829                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
830                                 ip->client_id,
831                                 ctdb_addr_to_str(&ip->addr),
832                                 client->pid));
833
834                         if (client->pid != 0) {
835                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
836                                         (unsigned)client->pid,
837                                         ctdb_addr_to_str(addr),
838                                         ip->client_id));
839                                 kill(client->pid, SIGKILL);
840                         }
841                 }
842         }
843 }
844
845 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
846 {
847         DLIST_REMOVE(ctdb->vnn, vnn);
848         ctdb_vnn_unassign_iface(ctdb, vnn);
849         ctdb_remove_orphaned_ifaces(ctdb, vnn);
850         talloc_free(vnn);
851 }
852
853 /*
854   called when releaseip event finishes
855  */
856 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
857                                 void *private_data)
858 {
859         struct takeover_callback_state *state = 
860                 talloc_get_type(private_data, struct takeover_callback_state);
861         TDB_DATA data;
862
863         if (status == -ETIME) {
864                 ctdb_ban_self(ctdb);
865         }
866
867         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
868                 if  (ctdb_sys_have_ip(state->addr)) {
869                         DEBUG(DEBUG_ERR,
870                               ("IP %s still hosted during release IP callback, failing\n",
871                                ctdb_addr_to_str(state->addr)));
872                         ctdb_request_control_reply(ctdb, state->c,
873                                                    NULL, -1, NULL);
874                         talloc_free(state);
875                         return;
876                 }
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* Process the IP if it has been marked for deletion */
896         if (state->vnn->delete_pending) {
897                 do_delete_ip(ctdb, state->vnn);
898                 state->vnn = NULL;
899         }
900
901         /* the control succeeded */
902         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
903         talloc_free(state);
904 }
905
906 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
907 {
908         if (state->vnn != NULL) {
909                 state->vnn->update_in_flight = false;
910         }
911         return 0;
912 }
913
914 /*
915   release an ip address
916  */
917 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
918                                 struct ctdb_req_control *c,
919                                 TDB_DATA indata, 
920                                 bool *async_reply)
921 {
922         int ret;
923         struct takeover_callback_state *state;
924         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
925         struct ctdb_vnn *vnn;
926         char *iface;
927
928         /* update our vnn list */
929         vnn = find_public_ip_vnn(ctdb, &pip->addr);
930         if (vnn == NULL) {
931                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
932                         ctdb_addr_to_str(&pip->addr)));
933                 return 0;
934         }
935         vnn->pnn = pip->pnn;
936
937         /* stop any previous arps */
938         talloc_free(vnn->takeover_ctx);
939         vnn->takeover_ctx = NULL;
940
941         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
942          * lazy multicast to drop an IP from any node that isn't the
943          * intended new node.  The following causes makes ctdbd ignore
944          * a release for any address it doesn't host.
945          */
946         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
947                 if (!ctdb_sys_have_ip(&pip->addr)) {
948                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
949                                 ctdb_addr_to_str(&pip->addr),
950                                 vnn->public_netmask_bits,
951                                 ctdb_vnn_iface_string(vnn)));
952                         ctdb_vnn_unassign_iface(ctdb, vnn);
953                         return 0;
954                 }
955         } else {
956                 if (vnn->iface == NULL) {
957                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
958                                            ctdb_addr_to_str(&pip->addr),
959                                            vnn->public_netmask_bits));
960                         return 0;
961                 }
962         }
963
964         /* There is a potential race between take_ip and us because we
965          * update the VNN via a callback that run when the
966          * eventscripts have been run.  Avoid the race by allowing one
967          * update to be in flight at a time.
968          */
969         if (vnn->update_in_flight) {
970                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
971                                     "update for this IP already in flight\n",
972                                     ctdb_addr_to_str(&vnn->public_address),
973                                     vnn->public_netmask_bits));
974                 return -1;
975         }
976
977         iface = strdup(ctdb_vnn_iface_string(vnn));
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         if (state == NULL) {
987                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
988                                __FILE__, __LINE__);
989                 free(iface);
990                 return -1;
991         }
992
993         state->c = talloc_steal(state, c);
994         state->addr = talloc(state, ctdb_sock_addr);       
995         if (state->addr == NULL) {
996                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
997                                __FILE__, __LINE__);
998                 free(iface);
999                 talloc_free(state);
1000                 return -1;
1001         }
1002         *state->addr = pip->addr;
1003         state->vnn   = vnn;
1004
1005         vnn->update_in_flight = true;
1006         talloc_set_destructor(state, ctdb_releaseip_destructor);
1007
1008         ret = ctdb_event_script_callback(ctdb, 
1009                                          state, release_ip_callback, state,
1010                                          CTDB_EVENT_RELEASE_IP,
1011                                          "%s %s %u",
1012                                          iface,
1013                                          ctdb_addr_to_str(&pip->addr),
1014                                          vnn->public_netmask_bits);
1015         free(iface);
1016         if (ret != 0) {
1017                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1018                         ctdb_addr_to_str(&pip->addr),
1019                         ctdb_vnn_iface_string(vnn)));
1020                 talloc_free(state);
1021                 return -1;
1022         }
1023
1024         /* tell the control that we will be reply asynchronously */
1025         *async_reply = true;
1026         return 0;
1027 }
1028
1029 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1030                                    ctdb_sock_addr *addr,
1031                                    unsigned mask, const char *ifaces,
1032                                    bool check_address)
1033 {
1034         struct ctdb_vnn      *vnn;
1035         uint32_t num = 0;
1036         char *tmp;
1037         const char *iface;
1038         int i;
1039         int ret;
1040
1041         tmp = strdup(ifaces);
1042         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1043                 if (!ctdb_sys_check_iface_exists(iface)) {
1044                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1045                         free(tmp);
1046                         return -1;
1047                 }
1048         }
1049         free(tmp);
1050
1051         /* Verify that we dont have an entry for this ip yet */
1052         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1053                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1054                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1055                                 ctdb_addr_to_str(addr)));
1056                         return -1;
1057                 }               
1058         }
1059
1060         /* create a new vnn structure for this ip address */
1061         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1062         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1063         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1064         tmp = talloc_strdup(vnn, ifaces);
1065         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1066         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1067                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1068                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1069                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1070                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1071                 num++;
1072         }
1073         talloc_free(tmp);
1074         vnn->ifaces[num] = NULL;
1075         vnn->public_address      = *addr;
1076         vnn->public_netmask_bits = mask;
1077         vnn->pnn                 = -1;
1078         if (check_address) {
1079                 if (ctdb_sys_have_ip(addr)) {
1080                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1081                         vnn->pnn = ctdb->pnn;
1082                 }
1083         }
1084
1085         for (i=0; vnn->ifaces[i]; i++) {
1086                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1087                 if (ret != 0) {
1088                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1089                                            "for public_address[%s]\n",
1090                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1091                         talloc_free(vnn);
1092                         return -1;
1093                 }
1094         }
1095
1096         DLIST_ADD(ctdb->vnn, vnn);
1097
1098         return 0;
1099 }
1100
1101 /*
1102   setup the public address lists from a file
1103 */
1104 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1105 {
1106         char **lines;
1107         int nlines;
1108         int i;
1109
1110         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1111         if (lines == NULL) {
1112                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1113                 return -1;
1114         }
1115         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1116                 nlines--;
1117         }
1118
1119         for (i=0;i<nlines;i++) {
1120                 unsigned mask;
1121                 ctdb_sock_addr addr;
1122                 const char *addrstr;
1123                 const char *ifaces;
1124                 char *tok, *line;
1125
1126                 line = lines[i];
1127                 while ((*line == ' ') || (*line == '\t')) {
1128                         line++;
1129                 }
1130                 if (*line == '#') {
1131                         continue;
1132                 }
1133                 if (strcmp(line, "") == 0) {
1134                         continue;
1135                 }
1136                 tok = strtok(line, " \t");
1137                 addrstr = tok;
1138                 tok = strtok(NULL, " \t");
1139                 if (tok == NULL) {
1140                         if (NULL == ctdb->default_public_interface) {
1141                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1142                                          i+1));
1143                                 talloc_free(lines);
1144                                 return -1;
1145                         }
1146                         ifaces = ctdb->default_public_interface;
1147                 } else {
1148                         ifaces = tok;
1149                 }
1150
1151                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1152                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1153                         talloc_free(lines);
1154                         return -1;
1155                 }
1156                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1157                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1158                         talloc_free(lines);
1159                         return -1;
1160                 }
1161         }
1162
1163
1164         talloc_free(lines);
1165         return 0;
1166 }
1167
1168 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1169                               const char *iface,
1170                               const char *ip)
1171 {
1172         struct ctdb_vnn *svnn;
1173         struct ctdb_iface *cur = NULL;
1174         bool ok;
1175         int ret;
1176
1177         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1178         CTDB_NO_MEMORY(ctdb, svnn);
1179
1180         svnn->ifaces = talloc_array(svnn, const char *, 2);
1181         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1182         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1183         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1184         svnn->ifaces[1] = NULL;
1185
1186         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1187         if (!ok) {
1188                 talloc_free(svnn);
1189                 return -1;
1190         }
1191
1192         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1193         if (ret != 0) {
1194                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1195                                    "for single_ip[%s]\n",
1196                                    svnn->ifaces[0],
1197                                    ctdb_addr_to_str(&svnn->public_address)));
1198                 talloc_free(svnn);
1199                 return -1;
1200         }
1201
1202         /* assume the single public ip interface is initially "good" */
1203         cur = ctdb_find_iface(ctdb, iface);
1204         if (cur == NULL) {
1205                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1206                 return -1;
1207         }
1208         cur->link_up = true;
1209
1210         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1211         if (ret != 0) {
1212                 talloc_free(svnn);
1213                 return -1;
1214         }
1215
1216         ctdb->single_ip_vnn = svnn;
1217         return 0;
1218 }
1219
1220 struct ctdb_public_ip_list {
1221         struct ctdb_public_ip_list *next;
1222         uint32_t pnn;
1223         ctdb_sock_addr addr;
1224 };
1225
1226 /* Given a physical node, return the number of
1227    public addresses that is currently assigned to this node.
1228 */
1229 static int node_ip_coverage(struct ctdb_context *ctdb, 
1230         int32_t pnn,
1231         struct ctdb_public_ip_list *ips)
1232 {
1233         int num=0;
1234
1235         for (;ips;ips=ips->next) {
1236                 if (ips->pnn == pnn) {
1237                         num++;
1238                 }
1239         }
1240         return num;
1241 }
1242
1243
1244 /* Can the given node host the given IP: is the public IP known to the
1245  * node and is NOIPHOST unset?
1246 */
1247 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1248                              struct ctdb_ipflags ipflags,
1249                              struct ctdb_public_ip_list *ip)
1250 {
1251         struct ctdb_all_public_ips *public_ips;
1252         int i;
1253
1254         if (ipflags.noiphost) {
1255                 return false;
1256         }
1257
1258         public_ips = ctdb->nodes[pnn]->available_public_ips;
1259
1260         if (public_ips == NULL) {
1261                 return false;
1262         }
1263
1264         for (i=0; i<public_ips->num; i++) {
1265                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1266                         /* yes, this node can serve this public ip */
1267                         return true;
1268                 }
1269         }
1270
1271         return false;
1272 }
1273
1274 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1275                                  struct ctdb_ipflags ipflags,
1276                                  struct ctdb_public_ip_list *ip)
1277 {
1278         if (ipflags.noiptakeover) {
1279                 return false;
1280         }
1281
1282         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1283 }
1284
1285 /* search the node lists list for a node to takeover this ip.
1286    pick the node that currently are serving the least number of ips
1287    so that the ips get spread out evenly.
1288 */
1289 static int find_takeover_node(struct ctdb_context *ctdb, 
1290                 struct ctdb_ipflags *ipflags,
1291                 struct ctdb_public_ip_list *ip,
1292                 struct ctdb_public_ip_list *all_ips)
1293 {
1294         int pnn, min=0, num;
1295         int i, numnodes;
1296
1297         numnodes = talloc_array_length(ipflags);
1298         pnn    = -1;
1299         for (i=0; i<numnodes; i++) {
1300                 /* verify that this node can serve this ip */
1301                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1302                         /* no it couldnt   so skip to the next node */
1303                         continue;
1304                 }
1305
1306                 num = node_ip_coverage(ctdb, i, all_ips);
1307                 /* was this the first node we checked ? */
1308                 if (pnn == -1) {
1309                         pnn = i;
1310                         min  = num;
1311                 } else {
1312                         if (num < min) {
1313                                 pnn = i;
1314                                 min  = num;
1315                         }
1316                 }
1317         }       
1318         if (pnn == -1) {
1319                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1320                         ctdb_addr_to_str(&ip->addr)));
1321
1322                 return -1;
1323         }
1324
1325         ip->pnn = pnn;
1326         return 0;
1327 }
1328
1329 #define IP_KEYLEN       4
1330 static uint32_t *ip_key(ctdb_sock_addr *ip)
1331 {
1332         static uint32_t key[IP_KEYLEN];
1333
1334         bzero(key, sizeof(key));
1335
1336         switch (ip->sa.sa_family) {
1337         case AF_INET:
1338                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1339                 break;
1340         case AF_INET6: {
1341                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1342                 key[0]  = htonl(s6_a32[0]);
1343                 key[1]  = htonl(s6_a32[1]);
1344                 key[2]  = htonl(s6_a32[2]);
1345                 key[3]  = htonl(s6_a32[3]);
1346                 break;
1347         }
1348         default:
1349                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1350                 return key;
1351         }
1352
1353         return key;
1354 }
1355
1356 static void *add_ip_callback(void *parm, void *data)
1357 {
1358         struct ctdb_public_ip_list *this_ip = parm; 
1359         struct ctdb_public_ip_list *prev_ip = data; 
1360
1361         if (prev_ip == NULL) {
1362                 return parm;
1363         }
1364         if (this_ip->pnn == -1) {
1365                 this_ip->pnn = prev_ip->pnn;
1366         }
1367
1368         return parm;
1369 }
1370
1371 static int getips_count_callback(void *param, void *data)
1372 {
1373         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1374         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1375
1376         new_ip->next = *ip_list;
1377         *ip_list     = new_ip;
1378         return 0;
1379 }
1380
1381 static struct ctdb_public_ip_list *
1382 create_merged_ip_list(struct ctdb_context *ctdb)
1383 {
1384         int i, j;
1385         struct ctdb_public_ip_list *ip_list;
1386         struct ctdb_all_public_ips *public_ips;
1387
1388         if (ctdb->ip_tree != NULL) {
1389                 talloc_free(ctdb->ip_tree);
1390                 ctdb->ip_tree = NULL;
1391         }
1392         ctdb->ip_tree = trbt_create(ctdb, 0);
1393
1394         for (i=0;i<ctdb->num_nodes;i++) {
1395                 public_ips = ctdb->nodes[i]->known_public_ips;
1396
1397                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1398                         continue;
1399                 }
1400
1401                 /* there were no public ips for this node */
1402                 if (public_ips == NULL) {
1403                         continue;
1404                 }               
1405
1406                 for (j=0;j<public_ips->num;j++) {
1407                         struct ctdb_public_ip_list *tmp_ip; 
1408
1409                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1410                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1411                         /* Do not use information about IP addresses hosted
1412                          * on other nodes, it may not be accurate */
1413                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1414                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1415                         } else {
1416                                 tmp_ip->pnn = -1;
1417                         }
1418                         tmp_ip->addr = public_ips->ips[j].addr;
1419                         tmp_ip->next = NULL;
1420
1421                         trbt_insertarray32_callback(ctdb->ip_tree,
1422                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1423                                 add_ip_callback,
1424                                 tmp_ip);
1425                 }
1426         }
1427
1428         ip_list = NULL;
1429         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1430
1431         return ip_list;
1432 }
1433
1434 /* 
1435  * This is the length of the longtest common prefix between the IPs.
1436  * It is calculated by XOR-ing the 2 IPs together and counting the
1437  * number of leading zeroes.  The implementation means that all
1438  * addresses end up being 128 bits long.
1439  *
1440  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1441  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1442  * lots of nodes and IP addresses?
1443  */
1444 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1445 {
1446         uint32_t ip1_k[IP_KEYLEN];
1447         uint32_t *t;
1448         int i;
1449         uint32_t x;
1450
1451         uint32_t distance = 0;
1452
1453         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1454         t = ip_key(ip2);
1455         for (i=0; i<IP_KEYLEN; i++) {
1456                 x = ip1_k[i] ^ t[i];
1457                 if (x == 0) {
1458                         distance += 32;
1459                 } else {
1460                         /* Count number of leading zeroes. 
1461                          * FIXME? This could be optimised...
1462                          */
1463                         while ((x & (1 << 31)) == 0) {
1464                                 x <<= 1;
1465                                 distance += 1;
1466                         }
1467                 }
1468         }
1469
1470         return distance;
1471 }
1472
1473 /* Calculate the IP distance for the given IP relative to IPs on the
1474    given node.  The ips argument is generally the all_ips variable
1475    used in the main part of the algorithm.
1476  */
1477 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1478                                   struct ctdb_public_ip_list *ips,
1479                                   int pnn)
1480 {
1481         struct ctdb_public_ip_list *t;
1482         uint32_t d;
1483
1484         uint32_t sum = 0;
1485
1486         for (t=ips; t != NULL; t=t->next) {
1487                 if (t->pnn != pnn) {
1488                         continue;
1489                 }
1490
1491                 /* Optimisation: We never calculate the distance
1492                  * between an address and itself.  This allows us to
1493                  * calculate the effect of removing an address from a
1494                  * node by simply calculating the distance between
1495                  * that address and all of the exitsing addresses.
1496                  * Moreover, we assume that we're only ever dealing
1497                  * with addresses from all_ips so we can identify an
1498                  * address via a pointer rather than doing a more
1499                  * expensive address comparison. */
1500                 if (&(t->addr) == ip) {
1501                         continue;
1502                 }
1503
1504                 d = ip_distance(ip, &(t->addr));
1505                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1506         }
1507
1508         return sum;
1509 }
1510
1511 /* Return the LCP2 imbalance metric for addresses currently assigned
1512    to the given node.
1513  */
1514 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1515 {
1516         struct ctdb_public_ip_list *t;
1517
1518         uint32_t imbalance = 0;
1519
1520         for (t=all_ips; t!=NULL; t=t->next) {
1521                 if (t->pnn != pnn) {
1522                         continue;
1523                 }
1524                 /* Pass the rest of the IPs rather than the whole
1525                    all_ips input list.
1526                 */
1527                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1528         }
1529
1530         return imbalance;
1531 }
1532
1533 /* Allocate any unassigned IPs just by looping through the IPs and
1534  * finding the best node for each.
1535  */
1536 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1537                                       struct ctdb_ipflags *ipflags,
1538                                       struct ctdb_public_ip_list *all_ips)
1539 {
1540         struct ctdb_public_ip_list *tmp_ip;
1541
1542         /* loop over all ip's and find a physical node to cover for 
1543            each unassigned ip.
1544         */
1545         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1546                 if (tmp_ip->pnn == -1) {
1547                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1548                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1549                                         ctdb_addr_to_str(&tmp_ip->addr)));
1550                         }
1551                 }
1552         }
1553 }
1554
1555 /* Basic non-deterministic rebalancing algorithm.
1556  */
1557 static void basic_failback(struct ctdb_context *ctdb,
1558                            struct ctdb_ipflags *ipflags,
1559                            struct ctdb_public_ip_list *all_ips,
1560                            int num_ips)
1561 {
1562         int i, numnodes;
1563         int maxnode, maxnum, minnode, minnum, num, retries;
1564         struct ctdb_public_ip_list *tmp_ip;
1565
1566         numnodes = talloc_array_length(ipflags);
1567         retries = 0;
1568
1569 try_again:
1570         maxnum=0;
1571         minnum=0;
1572
1573         /* for each ip address, loop over all nodes that can serve
1574            this ip and make sure that the difference between the node
1575            serving the most and the node serving the least ip's are
1576            not greater than 1.
1577         */
1578         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1579                 if (tmp_ip->pnn == -1) {
1580                         continue;
1581                 }
1582
1583                 /* Get the highest and lowest number of ips's served by any 
1584                    valid node which can serve this ip.
1585                 */
1586                 maxnode = -1;
1587                 minnode = -1;
1588                 for (i=0; i<numnodes; i++) {
1589                         /* only check nodes that can actually serve this ip */
1590                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1591                                 /* no it couldnt   so skip to the next node */
1592                                 continue;
1593                         }
1594
1595                         num = node_ip_coverage(ctdb, i, all_ips);
1596                         if (maxnode == -1) {
1597                                 maxnode = i;
1598                                 maxnum  = num;
1599                         } else {
1600                                 if (num > maxnum) {
1601                                         maxnode = i;
1602                                         maxnum  = num;
1603                                 }
1604                         }
1605                         if (minnode == -1) {
1606                                 minnode = i;
1607                                 minnum  = num;
1608                         } else {
1609                                 if (num < minnum) {
1610                                         minnode = i;
1611                                         minnum  = num;
1612                                 }
1613                         }
1614                 }
1615                 if (maxnode == -1) {
1616                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1617                                 ctdb_addr_to_str(&tmp_ip->addr)));
1618
1619                         continue;
1620                 }
1621
1622                 /* if the spread between the smallest and largest coverage by
1623                    a node is >=2 we steal one of the ips from the node with
1624                    most coverage to even things out a bit.
1625                    try to do this a limited number of times since we dont
1626                    want to spend too much time balancing the ip coverage.
1627                 */
1628                 if ( (maxnum > minnum+1)
1629                      && (retries < (num_ips + 5)) ){
1630                         struct ctdb_public_ip_list *tmp;
1631
1632                         /* Reassign one of maxnode's VNNs */
1633                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1634                                 if (tmp->pnn == maxnode) {
1635                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1636                                         retries++;
1637                                         goto try_again;;
1638                                 }
1639                         }
1640                 }
1641         }
1642 }
1643
1644 static void lcp2_init(struct ctdb_context *tmp_ctx,
1645                       struct ctdb_ipflags *ipflags,
1646                       struct ctdb_public_ip_list *all_ips,
1647                       uint32_t *force_rebalance_nodes,
1648                       uint32_t **lcp2_imbalances,
1649                       bool **rebalance_candidates)
1650 {
1651         int i, numnodes;
1652         struct ctdb_public_ip_list *tmp_ip;
1653
1654         numnodes = talloc_array_length(ipflags);
1655
1656         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1657         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1658         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1659         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1660
1661         for (i=0; i<numnodes; i++) {
1662                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1663                 /* First step: assume all nodes are candidates */
1664                 (*rebalance_candidates)[i] = true;
1665         }
1666
1667         /* 2nd step: if a node has IPs assigned then it must have been
1668          * healthy before, so we remove it from consideration.  This
1669          * is overkill but is all we have because we don't maintain
1670          * state between takeover runs.  An alternative would be to
1671          * keep state and invalidate it every time the recovery master
1672          * changes.
1673          */
1674         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1675                 if (tmp_ip->pnn != -1) {
1676                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1677                 }
1678         }
1679
1680         /* 3rd step: if a node is forced to re-balance then
1681            we allow failback onto the node */
1682         if (force_rebalance_nodes == NULL) {
1683                 return;
1684         }
1685         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1686                 uint32_t pnn = force_rebalance_nodes[i];
1687                 if (pnn >= numnodes) {
1688                         DEBUG(DEBUG_ERR,
1689                               (__location__ "unknown node %u\n", pnn));
1690                         continue;
1691                 }
1692
1693                 DEBUG(DEBUG_NOTICE,
1694                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1695                 (*rebalance_candidates)[pnn] = true;
1696         }
1697 }
1698
1699 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1700  * the IP/node combination that will cost the least.
1701  */
1702 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1703                                      struct ctdb_ipflags *ipflags,
1704                                      struct ctdb_public_ip_list *all_ips,
1705                                      uint32_t *lcp2_imbalances)
1706 {
1707         struct ctdb_public_ip_list *tmp_ip;
1708         int dstnode, numnodes;
1709
1710         int minnode;
1711         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1712         struct ctdb_public_ip_list *minip;
1713
1714         bool should_loop = true;
1715         bool have_unassigned = true;
1716
1717         numnodes = talloc_array_length(ipflags);
1718
1719         while (have_unassigned && should_loop) {
1720                 should_loop = false;
1721
1722                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1723                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1724
1725                 minnode = -1;
1726                 mindsum = 0;
1727                 minip = NULL;
1728
1729                 /* loop over each unassigned ip. */
1730                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1731                         if (tmp_ip->pnn != -1) {
1732                                 continue;
1733                         }
1734
1735                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1736                                 /* only check nodes that can actually takeover this ip */
1737                                 if (!can_node_takeover_ip(ctdb, dstnode,
1738                                                           ipflags[dstnode],
1739                                                           tmp_ip)) {
1740                                         /* no it couldnt   so skip to the next node */
1741                                         continue;
1742                                 }
1743
1744                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1745                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1746                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1747                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1748                                                    dstnode,
1749                                                    dstimbl - lcp2_imbalances[dstnode]));
1750
1751
1752                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1753                                         minnode = dstnode;
1754                                         minimbl = dstimbl;
1755                                         mindsum = dstdsum;
1756                                         minip = tmp_ip;
1757                                         should_loop = true;
1758                                 }
1759                         }
1760                 }
1761
1762                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1763
1764                 /* If we found one then assign it to the given node. */
1765                 if (minnode != -1) {
1766                         minip->pnn = minnode;
1767                         lcp2_imbalances[minnode] = minimbl;
1768                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1769                                           ctdb_addr_to_str(&(minip->addr)),
1770                                           minnode,
1771                                           mindsum));
1772                 }
1773
1774                 /* There might be a better way but at least this is clear. */
1775                 have_unassigned = false;
1776                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1777                         if (tmp_ip->pnn == -1) {
1778                                 have_unassigned = true;
1779                         }
1780                 }
1781         }
1782
1783         /* We know if we have an unassigned addresses so we might as
1784          * well optimise.
1785          */
1786         if (have_unassigned) {
1787                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1788                         if (tmp_ip->pnn == -1) {
1789                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1790                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1791                         }
1792                 }
1793         }
1794 }
1795
1796 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1797  * to move IPs from, determines the best IP/destination node
1798  * combination to move from the source node.
1799  */
1800 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1801                                     struct ctdb_ipflags *ipflags,
1802                                     struct ctdb_public_ip_list *all_ips,
1803                                     int srcnode,
1804                                     uint32_t *lcp2_imbalances,
1805                                     bool *rebalance_candidates)
1806 {
1807         int dstnode, mindstnode, numnodes;
1808         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1809         uint32_t minsrcimbl, mindstimbl;
1810         struct ctdb_public_ip_list *minip;
1811         struct ctdb_public_ip_list *tmp_ip;
1812
1813         /* Find an IP and destination node that best reduces imbalance. */
1814         srcimbl = 0;
1815         minip = NULL;
1816         minsrcimbl = 0;
1817         mindstnode = -1;
1818         mindstimbl = 0;
1819
1820         numnodes = talloc_array_length(ipflags);
1821
1822         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1823         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1824                            srcnode, lcp2_imbalances[srcnode]));
1825
1826         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1827                 /* Only consider addresses on srcnode. */
1828                 if (tmp_ip->pnn != srcnode) {
1829                         continue;
1830                 }
1831
1832                 /* What is this IP address costing the source node? */
1833                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1834                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1835
1836                 /* Consider this IP address would cost each potential
1837                  * destination node.  Destination nodes are limited to
1838                  * those that are newly healthy, since we don't want
1839                  * to do gratuitous failover of IPs just to make minor
1840                  * balance improvements.
1841                  */
1842                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1843                         if (!rebalance_candidates[dstnode]) {
1844                                 continue;
1845                         }
1846
1847                         /* only check nodes that can actually takeover this ip */
1848                         if (!can_node_takeover_ip(ctdb, dstnode,
1849                                                   ipflags[dstnode], tmp_ip)) {
1850                                 /* no it couldnt   so skip to the next node */
1851                                 continue;
1852                         }
1853
1854                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1855                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1856                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1857                                            srcnode, -srcdsum,
1858                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1859                                            dstnode, dstdsum));
1860
1861                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1862                             (dstdsum < srcdsum) &&                      \
1863                             ((mindstnode == -1) ||                              \
1864                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1865
1866                                 minip = tmp_ip;
1867                                 minsrcimbl = srcimbl;
1868                                 mindstnode = dstnode;
1869                                 mindstimbl = dstimbl;
1870                         }
1871                 }
1872         }
1873         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1874
1875         if (mindstnode != -1) {
1876                 /* We found a move that makes things better... */
1877                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1878                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1879                                   ctdb_addr_to_str(&(minip->addr)),
1880                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1881
1882
1883                 lcp2_imbalances[srcnode] = minsrcimbl;
1884                 lcp2_imbalances[mindstnode] = mindstimbl;
1885                 minip->pnn = mindstnode;
1886
1887                 return true;
1888         }
1889
1890         return false;
1891         
1892 }
1893
/* A node number paired with its LCP2 imbalance, so that nodes can be
 * sorted by imbalance while remembering which node each value
 * belongs to.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison function giving descending order of imbalance.
 * Returns -1 if a has the larger imbalance, 1 if b has, and 0 if they
 * are equal.  The pnn field takes no part in the comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        /* Difference of two 0/1 comparisons yields -1, 0 or 1 */
        return (lhs->imbalance < rhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
1912
1913 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1914  * node with the highest LCP2 imbalance, and then determines the best
1915  * IP/destination node combination to move from the source node.
1916  */
1917 static void lcp2_failback(struct ctdb_context *ctdb,
1918                           struct ctdb_ipflags *ipflags,
1919                           struct ctdb_public_ip_list *all_ips,
1920                           uint32_t *lcp2_imbalances,
1921                           bool *rebalance_candidates)
1922 {
1923         int i, numnodes;
1924         struct lcp2_imbalance_pnn * lips;
1925         bool again;
1926
1927         numnodes = talloc_array_length(ipflags);
1928
1929 try_again:
1930         /* Put the imbalances and nodes into an array, sort them and
1931          * iterate through candidates.  Usually the 1st one will be
1932          * used, so this doesn't cost much...
1933          */
1934         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1935         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1936         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1937         for (i=0; i<numnodes; i++) {
1938                 lips[i].imbalance = lcp2_imbalances[i];
1939                 lips[i].pnn = i;
1940                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1941         }
1942         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1943               lcp2_cmp_imbalance_pnn);
1944
1945         again = false;
1946         for (i=0; i<numnodes; i++) {
1947                 /* This means that all nodes had 0 or 1 addresses, so
1948                  * can't be imbalanced.
1949                  */
1950                 if (lips[i].imbalance == 0) {
1951                         break;
1952                 }
1953
1954                 if (lcp2_failback_candidate(ctdb,
1955                                             ipflags,
1956                                             all_ips,
1957                                             lips[i].pnn,
1958                                             lcp2_imbalances,
1959                                             rebalance_candidates)) {
1960                         again = true;
1961                         break;
1962                 }
1963         }
1964
1965         talloc_free(lips);
1966         if (again) {
1967                 goto try_again;
1968         }
1969 }
1970
1971 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1972                                     struct ctdb_ipflags *ipflags,
1973                                     struct ctdb_public_ip_list *all_ips)
1974 {
1975         struct ctdb_public_ip_list *tmp_ip;
1976
1977         /* verify that the assigned nodes can serve that public ip
1978            and set it to -1 if not
1979         */
1980         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1981                 if (tmp_ip->pnn == -1) {
1982                         continue;
1983                 }
1984                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1985                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1986                         /* this node can not serve this ip. */
1987                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1988                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1989                                            tmp_ip->pnn));
1990                         tmp_ip->pnn = -1;
1991                 }
1992         }
1993 }
1994
1995 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
1996                                        struct ctdb_ipflags *ipflags,
1997                                        struct ctdb_public_ip_list *all_ips)
1998 {
1999         struct ctdb_public_ip_list *tmp_ip;
2000         int i, numnodes;
2001
2002         numnodes = talloc_array_length(ipflags);
2003
2004         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2005        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2006         *  always be allocated the same way for a specific set of
2007         *  available/unavailable nodes.
2008         */
2009
2010         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2011                 tmp_ip->pnn = i % numnodes;
2012         }
2013
2014         /* IP failback doesn't make sense with deterministic
2015          * IPs, since the modulo step above implicitly fails
2016          * back IPs to their "home" node.
2017          */
2018         if (1 == ctdb->tunable.no_ip_failback) {
2019                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2020         }
2021
2022         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2023
2024         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2025
2026         /* No failback here! */
2027 }
2028
2029 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2030                                           struct ctdb_ipflags *ipflags,
2031                                           struct ctdb_public_ip_list *all_ips)
2032 {
2033         /* This should be pushed down into basic_failback. */
2034         struct ctdb_public_ip_list *tmp_ip;
2035         int num_ips = 0;
2036         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2037                 num_ips++;
2038         }
2039
2040         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2041
2042         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2043
2044         /* If we don't want IPs to fail back then don't rebalance IPs. */
2045         if (1 == ctdb->tunable.no_ip_failback) {
2046                 return;
2047         }
2048
2049         /* Now, try to make sure the ip adresses are evenly distributed
2050            across the nodes.
2051         */
2052         basic_failback(ctdb, ipflags, all_ips, num_ips);
2053 }
2054
2055 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2056                           struct ctdb_ipflags *ipflags,
2057                           struct ctdb_public_ip_list *all_ips,
2058                           uint32_t *force_rebalance_nodes)
2059 {
2060         uint32_t *lcp2_imbalances;
2061         bool *rebalance_candidates;
2062         int numnodes, num_rebalance_candidates, i;
2063
2064         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2065
2066         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2067
2068         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2069                   &lcp2_imbalances, &rebalance_candidates);
2070
2071         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2072
2073         /* If we don't want IPs to fail back then don't rebalance IPs. */
2074         if (1 == ctdb->tunable.no_ip_failback) {
2075                 goto finished;
2076         }
2077
2078         /* It is only worth continuing if we have suitable target
2079          * nodes to transfer IPs to.  This check is much cheaper than
2080          * continuing on...
2081          */
2082         numnodes = talloc_array_length(ipflags);
2083         num_rebalance_candidates = 0;
2084         for (i=0; i<numnodes; i++) {
2085                 if (rebalance_candidates[i]) {
2086                         num_rebalance_candidates++;
2087                 }
2088         }
2089         if (num_rebalance_candidates == 0) {
2090                 goto finished;
2091         }
2092
2093         /* Now, try to make sure the ip adresses are evenly distributed
2094            across the nodes.
2095         */
2096         lcp2_failback(ctdb, ipflags, all_ips,
2097                       lcp2_imbalances, rebalance_candidates);
2098
2099 finished:
2100         talloc_free(tmp_ctx);
2101 }
2102
2103 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2104 {
2105         int i;
2106
2107         for (i=0;i<nodemap->num;i++) {
2108                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2109                         /* Found one completely healthy node */
2110                         return false;
2111                 }
2112         }
2113
2114         return true;
2115 }
2116
2117 /* The calculation part of the IP allocation algorithm. */
2118 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2119                                    struct ctdb_ipflags *ipflags,
2120                                    struct ctdb_public_ip_list **all_ips_p,
2121                                    uint32_t *force_rebalance_nodes)
2122 {
2123         /* since nodes only know about those public addresses that
2124            can be served by that particular node, no single node has
2125            a full list of all public addresses that exist in the cluster.
2126            Walk over all node structures and create a merged list of
2127            all public addresses that exist in the cluster.
2128
2129            keep the tree of ips around as ctdb->ip_tree
2130         */
2131         *all_ips_p = create_merged_ip_list(ctdb);
2132
2133         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2134                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2135         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2136                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2137         } else {
2138                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2139         }
2140
2141         /* at this point ->pnn is the node which will own each IP
2142            or -1 if there is no node that can cover this ip
2143         */
2144
2145         return;
2146 }
2147
/* State shared between get_tunable_from_nodes() and its callbacks. */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages */
        const char *tunable;
        /* talloc array of results, indexed by node number */
        uint32_t *out;
        /* Set by the callbacks when the whole request must fail */
        bool fatal;
};
2153
2154 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2155                                  int32_t res, TDB_DATA outdata,
2156                                  void *callback)
2157 {
2158         struct get_tunable_callback_data *cd =
2159                 (struct get_tunable_callback_data *)callback;
2160         int size;
2161
2162         if (res != 0) {
2163                 /* Already handled in fail callback */
2164                 return;
2165         }
2166
2167         if (outdata.dsize != sizeof(uint32_t)) {
2168                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2169                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2170                                  (int)outdata.dsize));
2171                 cd->fatal = true;
2172                 return;
2173         }
2174
2175         size = talloc_array_length(cd->out);
2176         if (pnn >= size) {
2177                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2178                                  cd->tunable, pnn, size));
2179                 return;
2180         }
2181
2182                 
2183         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2184 }
2185
2186 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2187                                        int32_t res, TDB_DATA outdata,
2188                                        void *callback)
2189 {
2190         struct get_tunable_callback_data *cd =
2191                 (struct get_tunable_callback_data *)callback;
2192
2193         switch (res) {
2194         case -ETIME:
2195                 DEBUG(DEBUG_ERR,
2196                       ("Timed out getting tunable \"%s\" from node %d\n",
2197                        cd->tunable, pnn));
2198                 cd->fatal = true;
2199                 break;
2200         case -EINVAL:
2201         case -1:
2202                 DEBUG(DEBUG_WARNING,
2203                       ("Tunable \"%s\" not implemented on node %d\n",
2204                        cd->tunable, pnn));
2205                 break;
2206         default:
2207                 DEBUG(DEBUG_ERR,
2208                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2209                        cd->tunable, pnn));
2210                 cd->fatal = true;
2211         }
2212 }
2213
2214 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2215                                         TALLOC_CTX *tmp_ctx,
2216                                         struct ctdb_node_map *nodemap,
2217                                         const char *tunable,
2218                                         uint32_t default_value)
2219 {
2220         TDB_DATA data;
2221         struct ctdb_control_get_tunable *t;
2222         uint32_t *nodes;
2223         uint32_t *tvals;
2224         struct get_tunable_callback_data callback_data;
2225         int i;
2226
2227         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2228         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2229         for (i=0; i<nodemap->num; i++) {
2230                 tvals[i] = default_value;
2231         }
2232                 
2233         callback_data.out = tvals;
2234         callback_data.tunable = tunable;
2235         callback_data.fatal = false;
2236
2237         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2238         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2239         t = (struct ctdb_control_get_tunable *)data.dptr;
2240         t->length = strlen(tunable)+1;
2241         memcpy(t->name, tunable, t->length);
2242         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2243         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2244                                       nodes, 0, TAKEOVER_TIMEOUT(),
2245                                       false, data,
2246                                       get_tunable_callback,
2247                                       get_tunable_fail_callback,
2248                                       &callback_data) != 0) {
2249                 if (callback_data.fatal) {
2250                         talloc_free(tvals);
2251                         tvals = NULL;
2252                 }
2253         }
2254         talloc_free(nodes);
2255         talloc_free(data.dptr);
2256
2257         return tvals;
2258 }
2259
/* State shared between get_runstate_from_nodes() and its callbacks. */
struct get_runstate_callback_data {
        /* talloc array of results, indexed by node number */
        enum ctdb_runstate *out;
        /* Set by the callbacks when the whole request must fail */
        bool fatal;
};
2264
2265 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2266                                   int32_t res, TDB_DATA outdata,
2267                                   void *callback_data)
2268 {
2269         struct get_runstate_callback_data *cd =
2270                 (struct get_runstate_callback_data *)callback_data;
2271         int size;
2272
2273         if (res != 0) {
2274                 /* Already handled in fail callback */
2275                 return;
2276         }
2277
2278         if (outdata.dsize != sizeof(uint32_t)) {
2279                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2280                                  pnn, (int)sizeof(uint32_t),
2281                                  (int)outdata.dsize));
2282                 cd->fatal = true;
2283                 return;
2284         }
2285
2286         size = talloc_array_length(cd->out);
2287         if (pnn >= size) {
2288                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2289                                  pnn, size));
2290                 return;
2291         }
2292
2293         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2294 }
2295
2296 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2297                                        int32_t res, TDB_DATA outdata,
2298                                        void *callback)
2299 {
2300         struct get_runstate_callback_data *cd =
2301                 (struct get_runstate_callback_data *)callback;
2302
2303         switch (res) {
2304         case -ETIME:
2305                 DEBUG(DEBUG_ERR,
2306                       ("Timed out getting runstate from node %d\n", pnn));
2307                 cd->fatal = true;
2308                 break;
2309         default:
2310                 DEBUG(DEBUG_WARNING,
2311                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2312                        pnn));
2313         }
2314 }
2315
2316 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2317                                                     TALLOC_CTX *tmp_ctx,
2318                                                     struct ctdb_node_map *nodemap,
2319                                                     enum ctdb_runstate default_value)
2320 {
2321         uint32_t *nodes;
2322         enum ctdb_runstate *rs;
2323         struct get_runstate_callback_data callback_data;
2324         int i;
2325
2326         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2327         CTDB_NO_MEMORY_NULL(ctdb, rs);
2328         for (i=0; i<nodemap->num; i++) {
2329                 rs[i] = default_value;
2330         }
2331
2332         callback_data.out = rs;
2333         callback_data.fatal = false;
2334
2335         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2336         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2337                                       nodes, 0, TAKEOVER_TIMEOUT(),
2338                                       true, tdb_null,
2339                                       get_runstate_callback,
2340                                       get_runstate_fail_callback,
2341                                       &callback_data) != 0) {
2342                 if (callback_data.fatal) {
2343                         free(rs);
2344                         rs = NULL;
2345                 }
2346         }
2347         talloc_free(nodes);
2348
2349         return rs;
2350 }
2351
2352 /* Set internal flags for IP allocation:
2353  *   Clear ip flags
2354  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2355  *   Set NOIPHOST ip flag for each INACTIVE node
2356  *   if all nodes are disabled:
2357  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2358  *   else
2359  *     Set NOIPHOST ip flags for disabled nodes
2360  */
2361 static struct ctdb_ipflags *
2362 set_ipflags_internal(struct ctdb_context *ctdb,
2363                      TALLOC_CTX *tmp_ctx,
2364                      struct ctdb_node_map *nodemap,
2365                      uint32_t *tval_noiptakeover,
2366                      uint32_t *tval_noiphostonalldisabled,
2367                      enum ctdb_runstate *runstate)
2368 {
2369         int i;
2370         struct ctdb_ipflags *ipflags;
2371
2372         /* Clear IP flags - implicit due to talloc_zero */
2373         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2374         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2375
2376         for (i=0;i<nodemap->num;i++) {
2377                 /* Can not take IPs on node with NoIPTakeover set */
2378                 if (tval_noiptakeover[i] != 0) {
2379                         ipflags[i].noiptakeover = true;
2380                 }
2381
2382                 /* Can not host IPs on node not in RUNNING state */
2383                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2384                         ipflags[i].noiphost = true;
2385                         continue;
2386                 }
2387                 /* Can not host IPs on INACTIVE node */
2388                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2389                         ipflags[i].noiphost = true;
2390                 }
2391                 /* Remember the runstate */
2392                 ipflags[i].runstate = runstate[i];
2393         }
2394
2395         if (all_nodes_are_disabled(nodemap)) {
2396                 /* If all nodes are disabled, can not host IPs on node
2397                  * with NoIPHostOnAllDisabled set
2398                  */
2399                 for (i=0;i<nodemap->num;i++) {
2400                         if (tval_noiphostonalldisabled[i] != 0) {
2401                                 ipflags[i].noiphost = true;
2402                         }
2403                 }
2404         } else {
2405                 /* If some nodes are not disabled, then can not host
2406                  * IPs on DISABLED node
2407                  */
2408                 for (i=0;i<nodemap->num;i++) {
2409                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2410                                 ipflags[i].noiphost = true;
2411                         }
2412                 }
2413         }
2414
2415         return ipflags;
2416 }
2417
2418 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2419                                         TALLOC_CTX *tmp_ctx,
2420                                         struct ctdb_node_map *nodemap)
2421 {
2422         uint32_t *tval_noiptakeover;
2423         uint32_t *tval_noiphostonalldisabled;
2424         struct ctdb_ipflags *ipflags;
2425         enum ctdb_runstate *runstate;
2426
2427
2428         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2429                                                    "NoIPTakeover", 0);
2430         if (tval_noiptakeover == NULL) {
2431                 return NULL;
2432         }
2433
2434         tval_noiphostonalldisabled =
2435                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2436                                        "NoIPHostOnAllDisabled", 0);
2437         if (tval_noiphostonalldisabled == NULL) {
2438                 /* Caller frees tmp_ctx */
2439                 return NULL;
2440         }
2441
2442         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2443          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2444          * reasonable behaviour on a mixed cluster during upgrade.
2445          */
2446         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2447                                            CTDB_RUNSTATE_RUNNING);
2448         if (runstate == NULL) {
2449                 /* Caller frees tmp_ctx */
2450                 return NULL;
2451         }
2452
2453         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2454                                        tval_noiptakeover,
2455                                        tval_noiphostonalldisabled,
2456                                        runstate);
2457
2458         talloc_free(tval_noiptakeover);
2459         talloc_free(tval_noiphostonalldisabled);
2460         talloc_free(runstate);
2461
2462         return ipflags;
2463 }
2464
/* State shared with iprealloc_fail_callback() while the IPREALLOCATED
 * control is outstanding.
 */
struct iprealloc_callback_data {
	/* Indexed by PNN; set to true for nodes that should be retried */
	bool *retry_nodes;
	/* Incremented each time a node is flagged for retry */
	int retry_count;
	/* Caller-supplied callback invoked on timeout (-ETIME); may be NULL */
	client_async_callback fail_callback;
	/* Opaque argument handed to fail_callback */
	void *fail_callback_data;
	/* Node map consulted to ignore failures from INACTIVE nodes */
	struct ctdb_node_map *nodemap;
};
2472
2473 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2474                                         int32_t res, TDB_DATA outdata,
2475                                         void *callback)
2476 {
2477         int numnodes;
2478         struct iprealloc_callback_data *cd =
2479                 (struct iprealloc_callback_data *)callback;
2480
2481         numnodes = talloc_array_length(cd->retry_nodes);
2482         if (pnn > numnodes) {
2483                 DEBUG(DEBUG_ERR,
2484                       ("ipreallocated failure from node %d, "
2485                        "but only %d nodes in nodemap\n",
2486                        pnn, numnodes));
2487                 return;
2488         }
2489
2490         /* Can't run the "ipreallocated" event on a INACTIVE node */
2491         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2492                 DEBUG(DEBUG_WARNING,
2493                       ("ipreallocated failed on inactive node %d, ignoring\n",
2494                        pnn));
2495                 return;
2496         }
2497
2498         switch (res) {
2499         case -ETIME:
2500                 /* If the control timed out then that's a real error,
2501                  * so call the real fail callback
2502                  */
2503                 if (cd->fail_callback) {
2504                         cd->fail_callback(ctdb, pnn, res, outdata,
2505                                           cd->fail_callback_data);
2506                 } else {
2507                         DEBUG(DEBUG_WARNING,
2508                               ("iprealloc timed out but no callback registered\n"));
2509                 }
2510                 break;
2511         default:
2512                 /* If not a timeout then either the ipreallocated
2513                  * eventscript (or some setup) failed.  This might
2514                  * have failed because the IPREALLOCATED control isn't
2515                  * implemented - right now there is no way of knowing
2516                  * because the error codes are all folded down to -1.
2517                  * Consider retrying using EVENTSCRIPT control...
2518                  */
2519                 DEBUG(DEBUG_WARNING,
2520                       ("ipreallocated failure from node %d, flagging retry\n",
2521                        pnn));
2522                 cd->retry_nodes[pnn] = true;
2523                 cd->retry_count++;
2524         }
2525 }
2526
/* State shared with takeover_run_fail_callback() while RELEASE_IP
 * controls are outstanding.
 */
struct takeover_callback_data {
	/* Indexed by position in nodemap; true once a failure has been
	 * reported for that node, so it is only reported once */
	bool *node_failed;
	/* Caller-supplied callback invoked on the first failure per node */
	client_async_callback fail_callback;
	/* Opaque argument handed to fail_callback */
	void *fail_callback_data;
	/* Node map used to translate a PNN into an index */
	struct ctdb_node_map *nodemap;
};
2533
2534 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2535                                        uint32_t node_pnn, int32_t res,
2536                                        TDB_DATA outdata, void *callback_data)
2537 {
2538         struct takeover_callback_data *cd =
2539                 talloc_get_type_abort(callback_data,
2540                                       struct takeover_callback_data);
2541         int i;
2542
2543         for (i = 0; i < cd->nodemap->num; i++) {
2544                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2545                         break;
2546                 }
2547         }
2548
2549         if (i == cd->nodemap->num) {
2550                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2551                 return;
2552         }
2553
2554         if (!cd->node_failed[i]) {
2555                 cd->node_failed[i] = true;
2556                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2557                                   cd->fail_callback_data);
2558         }
2559 }
2560
2561 /*
2562   make any IP alias changes for public addresses that are necessary 
2563  */
2564 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2565                       uint32_t *force_rebalance_nodes,
2566                       client_async_callback fail_callback, void *callback_data)
2567 {
2568         int i, j, ret;
2569         struct ctdb_public_ip ip;
2570         uint32_t *nodes;
2571         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2572         TDB_DATA data;
2573         struct timeval timeout;
2574         struct client_async_data *async_data;
2575         struct ctdb_client_control_state *state;
2576         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2577         struct ctdb_ipflags *ipflags;
2578         struct takeover_callback_data *takeover_data;
2579         struct iprealloc_callback_data iprealloc_data;
2580         bool *retry_data;
2581         bool can_host_ips;
2582
2583         /*
2584          * ip failover is completely disabled, just send out the 
2585          * ipreallocated event.
2586          */
2587         if (ctdb->tunable.disable_ip_failover != 0) {
2588                 goto ipreallocated;
2589         }
2590
2591         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2592         if (ipflags == NULL) {
2593                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2594                 talloc_free(tmp_ctx);
2595                 return -1;
2596         }
2597
2598         /* Short-circuit IP allocation if no nodes are in the RUNNING
2599          * runstate yet, since no nodes will be able to host IPs */
2600         can_host_ips = false;
2601         for (i=0; i<nodemap->num; i++) {
2602                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2603                         can_host_ips = true;
2604                 }
2605         }
2606         if (!can_host_ips) {
2607                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2608                 return 0;
2609         }
2610
2611         /* Do the IP reassignment calculations */
2612         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2613
2614         /* Now tell all nodes to release any public IPs should not
2615          * host.  This will be a NOOP on nodes that don't currently
2616          * hold the given IP.
2617          */
2618         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2619         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2620
2621         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2622                                                        bool, nodemap->num);
2623         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2624         takeover_data->fail_callback = fail_callback;
2625         takeover_data->fail_callback_data = callback_data;
2626         takeover_data->nodemap = nodemap;
2627
2628         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2629         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2630
2631         async_data->fail_callback = takeover_run_fail_callback;
2632         async_data->callback_data = takeover_data;
2633
2634         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2635
2636         /* Send a RELEASE_IP to all nodes that should not be hosting
2637          * each IP.  For each IP, all but one of these will be
2638          * redundant.  However, the redundant ones are used to tell
2639          * nodes which node should be hosting the IP so that commands
2640          * like "ctdb ip" can display a particular nodes idea of who
2641          * is hosting what. */
2642         for (i=0;i<nodemap->num;i++) {
2643                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2644                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2645                         continue;
2646                 }
2647
2648                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2649                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2650                                 /* This node should be serving this
2651                                    vnn so dont tell it to release the ip
2652                                 */
2653                                 continue;
2654                         }
2655                         ip.pnn  = tmp_ip->pnn;
2656                         ip.addr = tmp_ip->addr;
2657
2658                         timeout = TAKEOVER_TIMEOUT();
2659                         data.dsize = sizeof(ip);
2660                         data.dptr  = (uint8_t *)&ip;
2661                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2662                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2663                                                   data, async_data,
2664                                                   &timeout, NULL);
2665                         if (state == NULL) {
2666                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2667                                 talloc_free(tmp_ctx);
2668                                 return -1;
2669                         }
2670
2671                         ctdb_client_async_add(async_data, state);
2672                 }
2673         }
2674         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2675                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2676                 talloc_free(tmp_ctx);
2677                 return -1;
2678         }
2679         talloc_free(async_data);
2680
2681
2682         /* For each IP, send a TAKOVER_IP to the node that should be
2683          * hosting it.  Many of these will often be redundant (since
2684          * the allocation won't have changed) but they can be useful
2685          * to recover from inconsistencies. */
2686         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2687         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2688
2689         async_data->fail_callback = fail_callback;
2690         async_data->callback_data = callback_data;
2691
2692         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2693                 if (tmp_ip->pnn == -1) {
2694                         /* this IP won't be taken over */
2695                         continue;
2696                 }
2697
2698                 ip.pnn  = tmp_ip->pnn;
2699                 ip.addr = tmp_ip->addr;
2700
2701                 timeout = TAKEOVER_TIMEOUT();
2702                 data.dsize = sizeof(ip);
2703                 data.dptr  = (uint8_t *)&ip;
2704                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2705                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2706                                           data, async_data, &timeout, NULL);
2707                 if (state == NULL) {
2708                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2709                         talloc_free(tmp_ctx);
2710                         return -1;
2711                 }
2712
2713                 ctdb_client_async_add(async_data, state);
2714         }
2715         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2716                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2717                 talloc_free(tmp_ctx);
2718                 return -1;
2719         }
2720
2721 ipreallocated:
2722         /*
2723          * Tell all nodes to run eventscripts to process the
2724          * "ipreallocated" event.  This can do a lot of things,
2725          * including restarting services to reconfigure them if public
2726          * IPs have moved.  Once upon a time this event only used to
2727          * update natgw.
2728          */
2729         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2730         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2731         iprealloc_data.retry_nodes = retry_data;
2732         iprealloc_data.retry_count = 0;
2733         iprealloc_data.fail_callback = fail_callback;
2734         iprealloc_data.fail_callback_data = callback_data;
2735         iprealloc_data.nodemap = nodemap;
2736
2737         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2738         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2739                                         nodes, 0, TAKEOVER_TIMEOUT(),
2740                                         false, tdb_null,
2741                                         NULL, iprealloc_fail_callback,
2742                                         &iprealloc_data);
2743         if (ret != 0) {
2744                 /* If the control failed then we should retry to any
2745                  * nodes flagged by iprealloc_fail_callback using the
2746                  * EVENTSCRIPT control.  This is a best-effort at
2747                  * backward compatiblity when running a mixed cluster
2748                  * where some nodes have not yet been upgraded to
2749                  * support the IPREALLOCATED control.
2750                  */
2751                 DEBUG(DEBUG_WARNING,
2752                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2753
2754                 nodes = talloc_array(tmp_ctx, uint32_t,
2755                                      iprealloc_data.retry_count);
2756                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2757
2758                 j = 0;
2759                 for (i=0; i<nodemap->num; i++) {
2760                         if (iprealloc_data.retry_nodes[i]) {
2761                                 nodes[j] = i;
2762                                 j++;
2763                         }
2764                 }
2765
2766                 data.dptr  = discard_const("ipreallocated");
2767                 data.dsize = strlen((char *)data.dptr) + 1; 
2768                 ret = ctdb_client_async_control(ctdb,
2769                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2770                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2771                                                 false, data,
2772                                                 NULL, fail_callback,
2773                                                 callback_data);
2774                 if (ret != 0) {
2775                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2776                 }
2777         }
2778
2779         talloc_free(tmp_ctx);
2780         return ret;
2781 }
2782
2783
2784 /*
2785   destroy a ctdb_client_ip structure
2786  */
2787 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2788 {
2789         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2790                 ctdb_addr_to_str(&ip->addr),
2791                 ntohs(ip->addr.ip.sin_port),
2792                 ip->client_id));
2793
2794         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2795         return 0;
2796 }
2797
2798 /*
2799   called by a client to inform us of a TCP connection that it is managing
2800   that should tickled with an ACK when IP takeover is done
2801  */
2802 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2803                                 TDB_DATA indata)
2804 {
2805         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2806         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2807         struct ctdb_tcp_list *tcp;
2808         struct ctdb_tcp_connection t;
2809         int ret;
2810         TDB_DATA data;
2811         struct ctdb_client_ip *ip;
2812         struct ctdb_vnn *vnn;
2813         ctdb_sock_addr addr;
2814
2815         /* If we don't have public IPs, tickles are useless */
2816         if (ctdb->vnn == NULL) {
2817                 return 0;
2818         }
2819
2820         tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2821
2822         addr = tcp_sock->src;
2823         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2824         addr = tcp_sock->dest;
2825         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2826
2827         ZERO_STRUCT(addr);
2828         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2829         vnn = find_public_ip_vnn(ctdb, &addr);
2830         if (vnn == NULL) {
2831                 switch (addr.sa.sa_family) {
2832                 case AF_INET:
2833                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2834                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2835                                         ctdb_addr_to_str(&addr)));
2836                         }
2837                         break;
2838                 case AF_INET6:
2839                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2840                                 ctdb_addr_to_str(&addr)));
2841                         break;
2842                 default:
2843                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2844                 }
2845
2846                 return 0;
2847         }
2848
2849         if (vnn->pnn != ctdb->pnn) {
2850                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2851                         ctdb_addr_to_str(&addr),
2852                         client_id, client->pid));
2853                 /* failing this call will tell smbd to die */
2854                 return -1;
2855         }
2856
2857         ip = talloc(client, struct ctdb_client_ip);
2858         CTDB_NO_MEMORY(ctdb, ip);
2859
2860         ip->ctdb      = ctdb;
2861         ip->addr      = addr;
2862         ip->client_id = client_id;
2863         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2864         DLIST_ADD(ctdb->client_ip_list, ip);
2865
2866         tcp = talloc(client, struct ctdb_tcp_list);
2867         CTDB_NO_MEMORY(ctdb, tcp);
2868
2869         tcp->connection.src_addr = tcp_sock->src;
2870         tcp->connection.dst_addr = tcp_sock->dest;
2871
2872         DLIST_ADD(client->tcp_list, tcp);
2873
2874         t.src_addr = tcp_sock->src;
2875         t.dst_addr = tcp_sock->dest;
2876
2877         data.dptr = (uint8_t *)&t;
2878         data.dsize = sizeof(t);
2879
2880         switch (addr.sa.sa_family) {
2881         case AF_INET:
2882                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2883                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2884                         ctdb_addr_to_str(&tcp_sock->src),
2885                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2886                 break;
2887         case AF_INET6:
2888                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2889                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2890                         ctdb_addr_to_str(&tcp_sock->src),
2891                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2892                 break;
2893         default:
2894                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2895         }
2896
2897
2898         /* tell all nodes about this tcp connection */
2899         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2900                                        CTDB_CONTROL_TCP_ADD,
2901                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2902         if (ret != 0) {
2903                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2904                 return -1;
2905         }
2906
2907         return 0;
2908 }
2909
2910 /*
2911   find a tcp address on a list
2912  */
2913 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2914                                            struct ctdb_tcp_connection *tcp)
2915 {
2916         int i;
2917
2918         if (array == NULL) {
2919                 return NULL;
2920         }
2921
2922         for (i=0;i<array->num;i++) {
2923                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2924                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2925                         return &array->connections[i];
2926                 }
2927         }
2928         return NULL;
2929 }
2930
2931
2932
2933 /*
2934   called by a daemon to inform us of a TCP connection that one of its
2935   clients managing that should tickled with an ACK when IP takeover is
2936   done
2937  */
2938 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2939 {
2940         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2941         struct ctdb_tcp_array *tcparray;
2942         struct ctdb_tcp_connection tcp;
2943         struct ctdb_vnn *vnn;
2944
2945         /* If we don't have public IPs, tickles are useless */
2946         if (ctdb->vnn == NULL) {
2947                 return 0;
2948         }
2949
2950         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2951         if (vnn == NULL) {
2952                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2953                         ctdb_addr_to_str(&p->dst_addr)));
2954
2955                 return -1;
2956         }
2957
2958
2959         tcparray = vnn->tcp_array;
2960
2961         /* If this is the first tickle */
2962         if (tcparray == NULL) {
2963                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2964                 CTDB_NO_MEMORY(ctdb, tcparray);
2965                 vnn->tcp_array = tcparray;
2966
2967                 tcparray->num = 0;
2968                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2969                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2970
2971                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2972                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2973                 tcparray->num++;
2974
2975                 if (tcp_update_needed) {
2976                         vnn->tcp_update_needed = true;
2977                 }
2978                 return 0;
2979         }
2980
2981
2982         /* Do we already have this tickle ?*/
2983         tcp.src_addr = p->src_addr;
2984         tcp.dst_addr = p->dst_addr;
2985         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2986                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2987                         ctdb_addr_to_str(&tcp.dst_addr),
2988                         ntohs(tcp.dst_addr.ip.sin_port),
2989                         vnn->pnn));
2990                 return 0;
2991         }
2992
2993         /* A new tickle, we must add it to the array */
2994         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2995                                         struct ctdb_tcp_connection,
2996                                         tcparray->num+1);
2997         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2998
2999         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3000         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3001         tcparray->num++;
3002
3003         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3004                 ctdb_addr_to_str(&tcp.dst_addr),
3005                 ntohs(tcp.dst_addr.ip.sin_port),
3006                 vnn->pnn));
3007
3008         if (tcp_update_needed) {
3009                 vnn->tcp_update_needed = true;
3010         }
3011
3012         return 0;
3013 }
3014
3015
3016 /*
3017   called by a daemon to inform us of a TCP connection that one of its
3018   clients managing that should tickled with an ACK when IP takeover is
3019   done
3020  */
3021 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3022 {
3023         struct ctdb_tcp_connection *tcpp;
3024         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3025
3026         if (vnn == NULL) {
3027                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3028                         ctdb_addr_to_str(&conn->dst_addr)));
3029                 return;
3030         }
3031
3032         /* if the array is empty we cant remove it
3033            and we dont need to do anything
3034          */
3035         if (vnn->tcp_array == NULL) {
3036                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3037                         ctdb_addr_to_str(&conn->dst_addr),
3038                         ntohs(conn->dst_addr.ip.sin_port)));
3039                 return;
3040         }
3041
3042
3043         /* See if we know this connection
3044            if we dont know this connection  then we dont need to do anything
3045          */
3046         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3047         if (tcpp == NULL) {
3048                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3049                         ctdb_addr_to_str(&conn->dst_addr),
3050                         ntohs(conn->dst_addr.ip.sin_port)));
3051                 return;
3052         }
3053
3054
3055         /* We need to remove this entry from the array.
3056            Instead of allocating a new array and copying data to it
3057            we cheat and just copy the last entry in the existing array
3058            to the entry that is to be removed and just shring the 
3059            ->num field
3060          */
3061         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3062         vnn->tcp_array->num--;
3063
3064         /* If we deleted the last entry we also need to remove the entire array
3065          */
3066         if (vnn->tcp_array->num == 0) {
3067                 talloc_free(vnn->tcp_array);
3068                 vnn->tcp_array = NULL;
3069         }               
3070
3071         vnn->tcp_update_needed = true;
3072
3073         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3074                 ctdb_addr_to_str(&conn->src_addr),
3075                 ntohs(conn->src_addr.ip.sin_port)));
3076 }
3077
3078
3079 /*
3080   called by a daemon to inform us of a TCP connection that one of its
3081   clients used are no longer needed in the tickle database
3082  */
3083 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3084 {
3085         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3086
3087         /* If we don't have public IPs, tickles are useless */
3088         if (ctdb->vnn == NULL) {
3089                 return 0;
3090         }
3091
3092         ctdb_remove_tcp_connection(ctdb, conn);
3093
3094         return 0;
3095 }
3096
3097
3098 /*
3099   Called when another daemon starts - causes all tickles for all
3100   public addresses we are serving to be sent to the new node on the
3101   next check.  This actually causes the next scheduled call to
3102   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3103   doesn't require careful error handling.
3104  */
3105 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3106 {
3107         struct ctdb_vnn *vnn;
3108
3109         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3110                            (unsigned long) pnn));
3111
3112         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3113                 vnn->tcp_update_needed = true;
3114         }
3115
3116         return 0;
3117 }
3118
3119
3120 /*
3121   called when a client structure goes away - hook to remove
3122   elements from the tcp_list in all daemons
3123  */
3124 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3125 {
3126         while (client->tcp_list) {
3127                 struct ctdb_tcp_list *tcp = client->tcp_list;
3128                 DLIST_REMOVE(client->tcp_list, tcp);
3129                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3130         }
3131 }
3132
3133
3134 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3135 {
3136         struct ctdb_vnn *vnn;
3137         int count = 0;
3138
3139         if (ctdb->tunable.disable_ip_failover == 1) {
3140                 return;
3141         }
3142
3143         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3144                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3145                         ctdb_vnn_unassign_iface(ctdb, vnn);
3146                         continue;
3147                 }
3148                 if (!vnn->iface) {
3149                         continue;
3150                 }
3151
3152                 /* Don't allow multiple releases at once.  Some code,
3153                  * particularly ctdb_tickle_sentenced_connections() is
3154                  * not re-entrant */
3155                 if (vnn->update_in_flight) {
3156                         DEBUG(DEBUG_WARNING,
3157                               (__location__
3158                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3159                                     ctdb_addr_to_str(&vnn->public_address),
3160                                     vnn->public_netmask_bits,
3161                                     ctdb_vnn_iface_string(vnn)));
3162                         continue;
3163                 }
3164                 vnn->update_in_flight = true;
3165
3166                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3167                                     ctdb_addr_to_str(&vnn->public_address),
3168                                     vnn->public_netmask_bits,
3169                                     ctdb_vnn_iface_string(vnn)));
3170
3171                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3172                                   ctdb_vnn_iface_string(vnn),
3173                                   ctdb_addr_to_str(&vnn->public_address),
3174                                   vnn->public_netmask_bits);
3175                 release_kill_clients(ctdb, &vnn->public_address);
3176                 ctdb_vnn_unassign_iface(ctdb, vnn);
3177                 vnn->update_in_flight = false;
3178                 count++;
3179         }
3180
3181         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3182 }
3183
3184
3185 /*
3186   get list of public IPs
3187  */
3188 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3189                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3190 {
3191         int i, num, len;
3192         struct ctdb_all_public_ips *ips;
3193         struct ctdb_vnn *vnn;
3194         bool only_available = false;
3195
3196         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3197                 only_available = true;
3198         }
3199
3200         /* count how many public ip structures we have */
3201         num = 0;
3202         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3203                 num++;
3204         }
3205
3206         len = offsetof(struct ctdb_all_public_ips, ips) + 
3207                 num*sizeof(struct ctdb_public_ip);
3208         ips = talloc_zero_size(outdata, len);
3209         CTDB_NO_MEMORY(ctdb, ips);
3210
3211         i = 0;
3212         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3213                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3214                         continue;
3215                 }
3216                 ips->ips[i].pnn  = vnn->pnn;
3217                 ips->ips[i].addr = vnn->public_address;
3218                 i++;
3219         }
3220         ips->num = i;
3221         len = offsetof(struct ctdb_all_public_ips, ips) +
3222                 i*sizeof(struct ctdb_public_ip);
3223
3224         outdata->dsize = len;
3225         outdata->dptr  = (uint8_t *)ips;
3226
3227         return 0;
3228 }
3229
3230
3231 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3232                                         struct ctdb_req_control *c,
3233                                         TDB_DATA indata,
3234                                         TDB_DATA *outdata)
3235 {
3236         int i, num, len;
3237         ctdb_sock_addr *addr;
3238         struct ctdb_control_public_ip_info *info;
3239         struct ctdb_vnn *vnn;
3240
3241         addr = (ctdb_sock_addr *)indata.dptr;
3242
3243         vnn = find_public_ip_vnn(ctdb, addr);
3244         if (vnn == NULL) {
3245                 /* if it is not a public ip   it could be our 'single ip' */
3246                 if (ctdb->single_ip_vnn) {
3247                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3248                                 vnn = ctdb->single_ip_vnn;
3249                         }
3250                 }
3251         }
3252         if (vnn == NULL) {
3253                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3254                                  "'%s'not a public address\n",
3255                                  ctdb_addr_to_str(addr)));
3256                 return -1;
3257         }
3258
3259         /* count how many public ip structures we have */
3260         num = 0;
3261         for (;vnn->ifaces[num];) {
3262                 num++;
3263         }
3264
3265         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3266                 num*sizeof(struct ctdb_control_iface_info);
3267         info = talloc_zero_size(outdata, len);
3268         CTDB_NO_MEMORY(ctdb, info);
3269
3270         info->ip.addr = vnn->public_address;
3271         info->ip.pnn = vnn->pnn;
3272         info->active_idx = 0xFFFFFFFF;
3273
3274         for (i=0; vnn->ifaces[i]; i++) {
3275                 struct ctdb_iface *cur;
3276
3277                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3278                 if (cur == NULL) {
3279                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3280                                            vnn->ifaces[i]));
3281                         return -1;
3282                 }
3283                 if (vnn->iface == cur) {
3284                         info->active_idx = i;
3285                 }
3286                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3287                 info->ifaces[i].link_state = cur->link_up;
3288                 info->ifaces[i].references = cur->references;
3289         }
3290         info->num = i;
3291         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3292                 i*sizeof(struct ctdb_control_iface_info);
3293
3294         outdata->dsize = len;
3295         outdata->dptr  = (uint8_t *)info;
3296
3297         return 0;
3298 }
3299
3300 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3301                                 struct ctdb_req_control *c,
3302                                 TDB_DATA *outdata)
3303 {
3304         int i, num, len;
3305         struct ctdb_control_get_ifaces *ifaces;
3306         struct ctdb_iface *cur;
3307
3308         /* count how many public ip structures we have */
3309         num = 0;
3310         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3311                 num++;
3312         }
3313
3314         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3315                 num*sizeof(struct ctdb_control_iface_info);
3316         ifaces = talloc_zero_size(outdata, len);
3317         CTDB_NO_MEMORY(ctdb, ifaces);
3318
3319         i = 0;
3320         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3321                 strcpy(ifaces->ifaces[i].name, cur->name);
3322                 ifaces->ifaces[i].link_state = cur->link_up;
3323                 ifaces->ifaces[i].references = cur->references;
3324                 i++;
3325         }
3326         ifaces->num = i;
3327         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3328                 i*sizeof(struct ctdb_control_iface_info);
3329
3330         outdata->dsize = len;
3331         outdata->dptr  = (uint8_t *)ifaces;
3332
3333         return 0;
3334 }
3335
3336 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3337                                     struct ctdb_req_control *c,
3338                                     TDB_DATA indata)
3339 {
3340         struct ctdb_control_iface_info *info;
3341         struct ctdb_iface *iface;
3342         bool link_up = false;
3343
3344         info = (struct ctdb_control_iface_info *)indata.dptr;
3345
3346         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3347                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3348                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3349                                   len, len, info->name));
3350                 return -1;
3351         }
3352
3353         switch (info->link_state) {
3354         case 0:
3355                 link_up = false;
3356                 break;
3357         case 1:
3358                 link_up = true;
3359                 break;
3360         default:
3361                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3362                                   (unsigned int)info->link_state));
3363                 return -1;
3364         }
3365
3366         if (info->references != 0) {
3367                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3368                                   (unsigned int)info->references));
3369                 return -1;
3370         }
3371
3372         iface = ctdb_find_iface(ctdb, info->name);
3373         if (iface == NULL) {
3374                 return -1;
3375         }
3376
3377         if (link_up == iface->link_up) {
3378                 return 0;
3379         }
3380
3381         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3382               ("iface[%s] has changed it's link status %s => %s\n",
3383                iface->name,
3384                iface->link_up?"up":"down",
3385                link_up?"up":"down"));
3386
3387         iface->link_up = link_up;
3388         return 0;
3389 }
3390
3391
3392 /* 
3393    structure containing the listening socket and the list of tcp connections
3394    that the ctdb daemon is to kill
3395 */
3396 struct ctdb_kill_tcp {
3397         struct ctdb_vnn *vnn;
3398         struct ctdb_context *ctdb;
3399         int capture_fd;
3400         struct tevent_fd *fde;
3401         trbt_tree_t *connections;
3402         void *private_data;
3403 };
3404
3405 /*
3406   a tcp connection that is to be killed
3407  */
3408 struct ctdb_killtcp_con {
3409         ctdb_sock_addr src_addr;
3410         ctdb_sock_addr dst_addr;
3411         int count;
3412         struct ctdb_kill_tcp *killtcp;
3413 };
3414
3415 /* this function is used to create a key to represent this socketpair
3416    in the killtcp tree.
3417    this key is used to insert and lookup matching socketpairs that are
3418    to be tickled and RST
3419 */
3420 #define KILLTCP_KEYLEN  10
3421 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3422 {
3423         static uint32_t key[KILLTCP_KEYLEN];
3424
3425         bzero(key, sizeof(key));
3426
3427         if (src->sa.sa_family != dst->sa.sa_family) {
3428                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3429                 return key;
3430         }
3431         
3432         switch (src->sa.sa_family) {
3433         case AF_INET:
3434                 key[0]  = dst->ip.sin_addr.s_addr;
3435                 key[1]  = src->ip.sin_addr.s_addr;
3436                 key[2]  = dst->ip.sin_port;
3437                 key[3]  = src->ip.sin_port;
3438                 break;
3439         case AF_INET6: {
3440                 uint32_t *dst6_addr32 =
3441                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3442                 uint32_t *src6_addr32 =
3443                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3444                 key[0]  = dst6_addr32[3];
3445                 key[1]  = src6_addr32[3];
3446                 key[2]  = dst6_addr32[2];
3447                 key[3]  = src6_addr32[2];
3448                 key[4]  = dst6_addr32[1];
3449                 key[5]  = src6_addr32[1];
3450                 key[6]  = dst6_addr32[0];
3451                 key[7]  = src6_addr32[0];
3452                 key[8]  = dst->ip6.sin6_port;
3453                 key[9]  = src->ip6.sin6_port;
3454                 break;
3455         }
3456         default:
3457                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3458                 return key;
3459         }
3460
3461         return key;
3462 }
3463
3464 /*
3465   called when we get a read event on the raw socket
3466  */
3467 static void capture_tcp_handler(struct tevent_context *ev,
3468                                 struct tevent_fd *fde,
3469                                 uint16_t flags, void *private_data)
3470 {
3471         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3472         struct ctdb_killtcp_con *con;
3473         ctdb_sock_addr src, dst;
3474         uint32_t ack_seq, seq;
3475
3476         if (!(flags & TEVENT_FD_READ)) {
3477                 return;
3478         }
3479
3480         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3481                                 killtcp->private_data,
3482                                 &src, &dst,
3483                                 &ack_seq, &seq) != 0) {
3484                 /* probably a non-tcp ACK packet */
3485                 return;
3486         }
3487
3488         /* check if we have this guy in our list of connections
3489            to kill
3490         */
3491         con = trbt_lookuparray32(killtcp->connections, 
3492                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3493         if (con == NULL) {
3494                 /* no this was some other packet we can just ignore */
3495                 return;
3496         }
3497
3498         /* This one has been tickled !
3499            now reset him and remove him from the list.
3500          */
3501         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3502                 ntohs(con->dst_addr.ip.sin_port),
3503                 ctdb_addr_to_str(&con->src_addr),
3504                 ntohs(con->src_addr.ip.sin_port)));
3505
3506         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3507         talloc_free(con);
3508 }
3509
3510
3511 /* when traversing the list of all tcp connections to send tickle acks to
3512    (so that we can capture the ack coming back and kill the connection
3513     by a RST)
3514    this callback is called for each connection we are currently trying to kill
3515 */
3516 static int tickle_connection_traverse(void *param, void *data)
3517 {
3518         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3519
3520         /* have tried too many times, just give up */
3521         if (con->count >= 5) {
3522                 /* can't delete in traverse: reparent to delete_cons */
3523                 talloc_steal(param, con);
3524                 return 0;
3525         }
3526
3527         /* othervise, try tickling it again */
3528         con->count++;
3529         ctdb_sys_send_tcp(
3530                 (ctdb_sock_addr *)&con->dst_addr,
3531                 (ctdb_sock_addr *)&con->src_addr,
3532                 0, 0, 0);
3533         return 0;
3534 }
3535
3536
3537 /* 
3538    called every second until all sentenced connections have been reset
3539  */
3540 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3541                                               struct tevent_timer *te,
3542                                               struct timeval t, void *private_data)
3543 {
3544         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3545         void *delete_cons = talloc_new(NULL);
3546
3547         /* loop over all connections sending tickle ACKs */
3548         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3549
3550         /* now we've finished traverse, it's safe to do deletion. */
3551         talloc_free(delete_cons);
3552
3553         /* If there are no more connections to kill we can remove the
3554            entire killtcp structure
3555          */
3556         if ( (killtcp->connections == NULL) || 
3557              (killtcp->connections->root == NULL) ) {
3558                 talloc_free(killtcp);
3559                 return;
3560         }
3561
3562         /* try tickling them again in a seconds time
3563          */
3564         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3565                          timeval_current_ofs(1, 0),
3566                          ctdb_tickle_sentenced_connections, killtcp);
3567 }
3568
3569 /*
3570   destroy the killtcp structure
3571  */
3572 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3573 {
3574         struct ctdb_vnn *tmpvnn;
3575
3576         /* verify that this vnn is still active */
3577         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3578                 if (tmpvnn == killtcp->vnn) {
3579                         break;
3580                 }
3581         }
3582
3583         if (tmpvnn == NULL) {
3584                 return 0;
3585         }
3586
3587         if (killtcp->vnn->killtcp != killtcp) {
3588                 return 0;
3589         }
3590
3591         killtcp->vnn->killtcp = NULL;
3592
3593         return 0;
3594 }
3595
3596
/* Insert callback for the killtcp tree: whatever was stored before is
   ignored and the new connection structure is returned as the value
   to keep.

   The old entry, if there was one, is deliberately not freed here;
   per the original author's note it is talloc_stolen by the same tree
   node and goes away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;	/* previous value is intentionally unused */

	return parm;
}
3608
3609 /*
3610   add a tcp socket to the list of connections we want to RST
3611  */
3612 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3613                                        ctdb_sock_addr *s,
3614                                        ctdb_sock_addr *d)
3615 {
3616         ctdb_sock_addr src, dst;
3617         struct ctdb_kill_tcp *killtcp;
3618         struct ctdb_killtcp_con *con;
3619         struct ctdb_vnn *vnn;
3620
3621         ctdb_canonicalize_ip(s, &src);
3622         ctdb_canonicalize_ip(d, &dst);
3623
3624         vnn = find_public_ip_vnn(ctdb, &dst);
3625         if (vnn == NULL) {
3626                 vnn = find_public_ip_vnn(ctdb, &src);
3627         }
3628         if (vnn == NULL) {
3629                 /* if it is not a public ip   it could be our 'single ip' */
3630                 if (ctdb->single_ip_vnn) {
3631                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3632                                 vnn = ctdb->single_ip_vnn;
3633                         }
3634                 }
3635         }
3636         if (vnn == NULL) {
3637                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3638                 return -1;
3639         }
3640
3641         killtcp = vnn->killtcp;
3642         
3643         /* If this is the first connection to kill we must allocate
3644            a new structure
3645          */
3646         if (killtcp == NULL) {
3647                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3648                 CTDB_NO_MEMORY(ctdb, killtcp);
3649
3650                 killtcp->vnn         = vnn;
3651                 killtcp->ctdb        = ctdb;
3652                 killtcp->capture_fd  = -1;
3653                 killtcp->connections = trbt_create(killtcp, 0);
3654
3655                 vnn->killtcp         = killtcp;
3656                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3657         }
3658
3659
3660
3661         /* create a structure that describes this connection we want to
3662            RST and store it in killtcp->connections
3663         */
3664         con = talloc(killtcp, struct ctdb_killtcp_con);
3665         CTDB_NO_MEMORY(ctdb, con);
3666         con->src_addr = src;
3667         con->dst_addr = dst;
3668         con->count    = 0;
3669         con->killtcp  = killtcp;
3670
3671
3672         trbt_insertarray32_callback(killtcp->connections,
3673                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3674                         add_killtcp_callback, con);
3675
3676         /* 
3677            If we dont have a socket to listen on yet we must create it
3678          */
3679         if (killtcp->capture_fd == -1) {
3680                 const char *iface = ctdb_vnn_iface_string(vnn);
3681                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3682                 if (killtcp->capture_fd == -1) {
3683                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3684                                           "socket on iface '%s' for killtcp (%s)\n",
3685                                           iface, strerror(errno)));
3686                         goto failed;
3687                 }
3688         }
3689
3690
3691         if (killtcp->fde == NULL) {
3692                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3693                                              killtcp->capture_fd,
3694                                              TEVENT_FD_READ,
3695                                              capture_tcp_handler, killtcp);
3696                 tevent_fd_set_auto_close(killtcp->fde);
3697
3698                 /* We also need to set up some events to tickle all these connections
3699                    until they are all reset
3700                 */
3701                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3702                                  ctdb_tickle_sentenced_connections, killtcp);
3703         }
3704
3705         /* tickle him once now */
3706         ctdb_sys_send_tcp(
3707                 &con->dst_addr,
3708                 &con->src_addr,
3709                 0, 0, 0);
3710
3711         return 0;
3712
3713 failed:
3714         talloc_free(vnn->killtcp);
3715         vnn->killtcp = NULL;
3716         return -1;
3717 }
3718
3719 /*
3720   kill a TCP connection.
3721  */
3722 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3723 {
3724         struct ctdb_tcp_connection *killtcp = (struct ctdb_tcp_connection *)indata.dptr;
3725
3726         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3727 }
3728
3729 /*
3730   called by a daemon to inform us of the entire list of TCP tickles for
3731   a particular public address.
3732   this control should only be sent by the node that is currently serving
3733   that public address.
3734  */
3735 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3736 {
3737         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3738         struct ctdb_tcp_array *tcparray;
3739         struct ctdb_vnn *vnn;
3740
3741         /* We must at least have tickles.num or else we cant verify the size
3742            of the received data blob
3743          */
3744         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3745                                         tickles.connections)) {
3746                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3747                 return -1;
3748         }
3749
3750         /* verify that the size of data matches what we expect */
3751         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3752                                 tickles.connections)
3753                          + sizeof(struct ctdb_tcp_connection)
3754                                  * list->tickles.num) {
3755                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3756                 return -1;
3757         }
3758
3759         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3760                            ctdb_addr_to_str(&list->addr)));
3761
3762         vnn = find_public_ip_vnn(ctdb, &list->addr);
3763         if (vnn == NULL) {
3764                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3765                         ctdb_addr_to_str(&list->addr)));
3766
3767                 return 1;
3768         }
3769
3770         /* remove any old ticklelist we might have */
3771         talloc_free(vnn->tcp_array);
3772         vnn->tcp_array = NULL;
3773
3774         tcparray = talloc(vnn, struct ctdb_tcp_array);
3775         CTDB_NO_MEMORY(ctdb, tcparray);
3776
3777         tcparray->num = list->tickles.num;
3778
3779         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3780         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3781
3782         memcpy(tcparray->connections, &list->tickles.connections[0],
3783                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3784
3785         /* We now have a new fresh tickle list array for this vnn */
3786         vnn->tcp_array = tcparray;
3787
3788         return 0;
3789 }
3790
3791 /*
3792   called to return the full list of tickles for the puclic address associated 
3793   with the provided vnn
3794  */
3795 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3796 {
3797         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3798         struct ctdb_control_tcp_tickle_list *list;
3799         struct ctdb_tcp_array *tcparray;
3800         int num;
3801         struct ctdb_vnn *vnn;
3802
3803         vnn = find_public_ip_vnn(ctdb, addr);
3804         if (vnn == NULL) {
3805                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3806                         ctdb_addr_to_str(addr)));
3807
3808                 return 1;
3809         }
3810
3811         tcparray = vnn->tcp_array;
3812         if (tcparray) {
3813                 num = tcparray->num;
3814         } else {
3815                 num = 0;
3816         }
3817
3818         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3819                                 tickles.connections)
3820                         + sizeof(struct ctdb_tcp_connection) * num;
3821
3822         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3823         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3824         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3825
3826         list->addr = *addr;
3827         list->tickles.num = num;
3828         if (num) {
3829                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3830                         sizeof(struct ctdb_tcp_connection) * num);
3831         }
3832
3833         return 0;
3834 }
3835
3836
3837 /*
3838   set the list of all tcp tickles for a public address
3839  */
3840 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3841                                             ctdb_sock_addr *addr,
3842                                             struct ctdb_tcp_array *tcparray)
3843 {
3844         int ret, num;
3845         TDB_DATA data;
3846         struct ctdb_control_tcp_tickle_list *list;
3847
3848         if (tcparray) {
3849                 num = tcparray->num;
3850         } else {
3851                 num = 0;
3852         }
3853
3854         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3855                                 tickles.connections) +
3856                         sizeof(struct ctdb_tcp_connection) * num;
3857         data.dptr = talloc_size(ctdb, data.dsize);
3858         CTDB_NO_MEMORY(ctdb, data.dptr);
3859
3860         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3861         list->addr = *addr;
3862         list->tickles.num = num;
3863         if (tcparray) {
3864                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3865         }
3866
3867         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3868                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3869                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3870         if (ret != 0) {
3871                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3872                 return -1;
3873         }
3874
3875         talloc_free(data.dptr);
3876
3877         return ret;
3878 }
3879
3880
3881 /*
3882   perform tickle updates if required
3883  */
3884 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3885                                     struct tevent_timer *te,
3886                                     struct timeval t, void *private_data)
3887 {
3888         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3889         int ret;
3890         struct ctdb_vnn *vnn;
3891
3892         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3893                 /* we only send out updates for public addresses that 
3894                    we have taken over
3895                  */
3896                 if (ctdb->pnn != vnn->pnn) {
3897                         continue;
3898                 }
3899                 /* We only send out the updates if we need to */
3900                 if (!vnn->tcp_update_needed) {
3901                         continue;
3902                 }
3903                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3904                                                        &vnn->public_address,
3905                                                        vnn->tcp_array);
3906                 if (ret != 0) {
3907                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3908                                 ctdb_addr_to_str(&vnn->public_address)));
3909                 } else {
3910                         DEBUG(DEBUG_INFO,
3911                               ("Sent tickle update for public address %s\n",
3912                                ctdb_addr_to_str(&vnn->public_address)));
3913                         vnn->tcp_update_needed = false;
3914                 }
3915         }
3916
3917         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3918                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3919                          ctdb_update_tcp_tickles, ctdb);
3920 }
3921
3922 /*
3923   start periodic update of tcp tickles
3924  */
3925 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3926 {
3927         ctdb->tickle_update_context = talloc_new(ctdb);
3928
3929         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3930                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3931                          ctdb_update_tcp_tickles, ctdb);
3932 }
3933
3934
3935
3936
3937 struct control_gratious_arp {
3938         struct ctdb_context *ctdb;
3939         ctdb_sock_addr addr;
3940         const char *iface;
3941         int count;
3942 };
3943
3944 /*
3945   send a control_gratuitous arp
3946  */
3947 static void send_gratious_arp(struct tevent_context *ev,
3948                               struct tevent_timer *te,
3949                               struct timeval t, void *private_data)
3950 {
3951         int ret;
3952         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3953                                                         struct control_gratious_arp);
3954
3955         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3956         if (ret != 0) {
3957                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3958                                  arp->iface, strerror(errno)));
3959         }
3960
3961
3962         arp->count++;
3963         if (arp->count == CTDB_ARP_REPEAT) {
3964                 talloc_free(arp);
3965                 return;
3966         }
3967
3968         tevent_add_timer(arp->ctdb->ev, arp,
3969                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3970                          send_gratious_arp, arp);
3971 }
3972
3973
3974 /*
3975   send a gratious arp 
3976  */
3977 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3978 {
3979         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3980         struct control_gratious_arp *arp;
3981
3982         /* verify the size of indata */
3983         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3984                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3985                                  (unsigned)indata.dsize, 
3986                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3987                 return -1;
3988         }
3989         if (indata.dsize != 
3990                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3991                 + gratious_arp->len ) ){
3992
3993                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3994                         "but should be %u bytes\n", 
3995                          (unsigned)indata.dsize, 
3996                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3997                 return -1;
3998         }
3999
4000
4001         arp = talloc(ctdb, struct control_gratious_arp);
4002         CTDB_NO_MEMORY(ctdb, arp);
4003
4004         arp->ctdb  = ctdb;
4005         arp->addr   = gratious_arp->addr;
4006         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4007         CTDB_NO_MEMORY(ctdb, arp->iface);
4008         arp->count = 0;
4009
4010         tevent_add_timer(arp->ctdb->ev, arp,
4011                          timeval_zero(), send_gratious_arp, arp);
4012
4013         return 0;
4014 }
4015
4016 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4017 {
4018         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4019         int ret;
4020
4021         /* verify the size of indata */
4022         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4023                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4024                 return -1;
4025         }
4026         if (indata.dsize != 
4027                 ( offsetof(struct ctdb_control_ip_iface, iface)
4028                 + pub->len ) ){
4029
4030                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4031                         "but should be %u bytes\n", 
4032                          (unsigned)indata.dsize, 
4033                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4034                 return -1;
4035         }
4036
4037         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4038
4039         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4040
4041         if (ret != 0) {
4042                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4043                 return -1;
4044         }
4045
4046         return 0;
4047 }
4048
/* State handed from ctdb_control_del_public_address() to
   delete_ip_callback(). */
struct delete_ip_callback_state {
        struct ctdb_req_control *c;     /* control request answered when the release finishes */
};
4052
4053 /*
4054   called when releaseip event finishes for del_public_address
4055  */
4056 static void delete_ip_callback(struct ctdb_context *ctdb,
4057                                int32_t status, TDB_DATA data,
4058                                const char *errormsg,
4059                                void *private_data)
4060 {
4061         struct delete_ip_callback_state *state =
4062                 talloc_get_type(private_data, struct delete_ip_callback_state);
4063
4064         /* If release failed then fail. */
4065         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4066         talloc_free(private_data);
4067 }
4068
4069 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4070                                         struct ctdb_req_control *c,
4071                                         TDB_DATA indata, bool *async_reply)
4072 {
4073         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4074         struct ctdb_vnn *vnn;
4075
4076         /* verify the size of indata */
4077         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4078                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4079                 return -1;
4080         }
4081         if (indata.dsize != 
4082                 ( offsetof(struct ctdb_control_ip_iface, iface)
4083                 + pub->len ) ){
4084
4085                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4086                         "but should be %u bytes\n", 
4087                          (unsigned)indata.dsize, 
4088                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4089                 return -1;
4090         }
4091
4092         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4093
4094         /* walk over all public addresses until we find a match */
4095         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4096                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4097                         if (vnn->pnn == ctdb->pnn) {
4098                                 struct delete_ip_callback_state *state;
4099                                 struct ctdb_public_ip *ip;
4100                                 TDB_DATA data;
4101                                 int ret;
4102
4103                                 vnn->delete_pending = true;
4104
4105                                 state = talloc(ctdb,
4106                                                struct delete_ip_callback_state);
4107                                 CTDB_NO_MEMORY(ctdb, state);
4108                                 state->c = c;
4109
4110                                 ip = talloc(state, struct ctdb_public_ip);
4111                                 if (ip == NULL) {
4112                                         DEBUG(DEBUG_ERR,
4113                                               (__location__ " Out of memory\n"));
4114                                         talloc_free(state);
4115                                         return -1;
4116                                 }
4117                                 ip->pnn = -1;
4118                                 ip->addr = pub->addr;
4119
4120                                 data.dsize = sizeof(struct ctdb_public_ip);
4121                                 data.dptr = (unsigned char *)ip;
4122
4123                                 ret = ctdb_daemon_send_control(ctdb,
4124                                                                ctdb_get_pnn(ctdb),
4125                                                                0,
4126                                                                CTDB_CONTROL_RELEASE_IP,
4127                                                                0, 0,
4128                                                                data,
4129                                                                delete_ip_callback,
4130                                                                state);
4131                                 if (ret == -1) {
4132                                         DEBUG(DEBUG_ERR,
4133                                               (__location__ "Unable to send "
4134                                                "CTDB_CONTROL_RELEASE_IP\n"));
4135                                         talloc_free(state);
4136                                         return -1;
4137                                 }
4138
4139                                 state->c = talloc_steal(state, c);
4140                                 *async_reply = true;
4141                         } else {
4142                                 /* This IP is not hosted on the
4143                                  * current node so just delete it
4144                                  * now. */
4145                                 do_delete_ip(ctdb, vnn);
4146                         }
4147
4148                         return 0;
4149                 }
4150         }
4151
4152         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4153                          ctdb_addr_to_str(&pub->addr)));
4154         return -1;
4155 }
4156
4157
/* State handed from ctdb_control_ipreallocated() to
   ctdb_ipreallocated_callback(). */
struct ipreallocated_callback_state {
        struct ctdb_req_control *c;     /* control request answered when the event script finishes */
};
4161
4162 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4163                                         int status, void *p)
4164 {
4165         struct ipreallocated_callback_state *state =
4166                 talloc_get_type(p, struct ipreallocated_callback_state);
4167
4168         if (status != 0) {
4169                 DEBUG(DEBUG_ERR,
4170                       (" \"ipreallocated\" event script failed (status %d)\n",
4171                        status));
4172                 if (status == -ETIME) {
4173                         ctdb_ban_self(ctdb);
4174                 }
4175         }
4176
4177         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4178         talloc_free(state);
4179 }
4180
4181 /* A control to run the ipreallocated event */
4182 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4183                                    struct ctdb_req_control *c,
4184                                    bool *async_reply)
4185 {
4186         int ret;
4187         struct ipreallocated_callback_state *state;
4188
4189         state = talloc(ctdb, struct ipreallocated_callback_state);
4190         CTDB_NO_MEMORY(ctdb, state);
4191
4192         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4193
4194         ret = ctdb_event_script_callback(ctdb, state,
4195                                          ctdb_ipreallocated_callback, state,
4196                                          CTDB_EVENT_IPREALLOCATED,
4197                                          "%s", "");
4198
4199         if (ret != 0) {
4200                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4201                 talloc_free(state);
4202                 return -1;
4203         }
4204
4205         /* tell the control that we will be reply asynchronously */
4206         state->c    = talloc_steal(state, c);
4207         *async_reply = true;
4208
4209         return 0;
4210 }
4211
4212
4213 /* This function is called from the recovery daemon to verify that a remote
4214    node has the expected ip allocation.
4215    This is verified against ctdb->ip_tree
4216 */
4217 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4218                                 struct ctdb_all_public_ips *ips,
4219                                 uint32_t pnn)
4220 {
4221         struct ctdb_public_ip_list *tmp_ip; 
4222         int i;
4223
4224         if (ctdb->ip_tree == NULL) {
4225                 /* dont know the expected allocation yet, assume remote node
4226                    is correct. */
4227                 return 0;
4228         }
4229
4230         if (ips == NULL) {
4231                 return 0;
4232         }
4233
4234         for (i=0; i<ips->num; i++) {
4235                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4236                 if (tmp_ip == NULL) {
4237                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4238                         return -1;
4239                 }
4240
4241                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4242                         continue;
4243                 }
4244
4245                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4246                         DEBUG(DEBUG_ERR,
4247                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4248                                pnn,
4249                                ctdb_addr_to_str(&ips->ips[i].addr),
4250                                ips->ips[i].pnn, tmp_ip->pnn));
4251                         return -1;
4252                 }
4253         }
4254
4255         return 0;
4256 }
4257
4258 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4259 {
4260         struct ctdb_public_ip_list *tmp_ip;
4261
4262         /* IP tree is never built if DisableIPFailover is set */
4263         if (ctdb->tunable.disable_ip_failover != 0) {
4264                 return 0;
4265         }
4266
4267         if (ctdb->ip_tree == NULL) {
4268                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4269                 return -1;
4270         }
4271
4272         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4273         if (tmp_ip == NULL) {
4274                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4275                 return -1;
4276         }
4277
4278         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4279         tmp_ip->pnn = ip->pnn;
4280
4281         return 0;
4282 }
4283
4284 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4285 {
4286         TALLOC_FREE(ctdb->ip_tree);
4287 }
4288
/* Tracks one in-progress reloadips request: the forked child doing the
   work and the control request waiting for its result. */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;      /* owning daemon context */
        struct ctdb_req_control *c;     /* request replied to by the destructor; NULL if none */
        int status;                     /* status sent in that reply; starts at -1 */
        int fd[2];                      /* pipe: child writes its result to fd[1], parent reads fd[0] */
        pid_t child;                    /* pid of the forked child */
        struct tevent_fd *fde;          /* fd event watching fd[0] */
};
4297
4298 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4299 {
4300         if (h == h->ctdb->reload_ips) {
4301                 h->ctdb->reload_ips = NULL;
4302         }
4303         if (h->c != NULL) {
4304                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4305                 h->c = NULL;
4306         }
4307         ctdb_kill(h->ctdb, h->child, SIGKILL);
4308         return 0;
4309 }
4310
4311 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4312                                          struct tevent_timer *te,
4313                                          struct timeval t, void *private_data)
4314 {
4315         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4316
4317         talloc_free(h);
4318 }
4319
4320 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4321                                          struct tevent_fd *fde,
4322                                          uint16_t flags, void *private_data)
4323 {
4324         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4325
4326         char res;
4327         int ret;
4328
4329         ret = sys_read(h->fd[0], &res, 1);
4330         if (ret < 1 || res != 0) {
4331                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4332                 res = 1;
4333         }
4334         h->status = res;
4335
4336         talloc_free(h);
4337 }
4338
4339 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4340 {
4341         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4342         struct ctdb_all_public_ips *ips;
4343         struct ctdb_vnn *vnn;
4344         struct client_async_data *async_data;
4345         struct timeval timeout;
4346         TDB_DATA data;
4347         struct ctdb_client_control_state *state;
4348         bool first_add;
4349         int i, ret;
4350
4351         CTDB_NO_MEMORY(ctdb, mem_ctx);
4352
4353         /* Read IPs from local node */
4354         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4355                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4356         if (ret != 0) {
4357                 DEBUG(DEBUG_ERR,
4358                       ("Unable to fetch public IPs from local node\n"));
4359                 talloc_free(mem_ctx);
4360                 return -1;
4361         }
4362
4363         /* Read IPs file - this is safe since this is a child process */
4364         ctdb->vnn = NULL;
4365         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4366                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4367                 talloc_free(mem_ctx);
4368                 return -1;
4369         }
4370
4371         async_data = talloc_zero(mem_ctx, struct client_async_data);
4372         CTDB_NO_MEMORY(ctdb, async_data);
4373
4374         /* Compare IPs between node and file for IPs to be deleted */
4375         for (i = 0; i < ips->num; i++) {
4376                 /* */
4377                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4378                         if (ctdb_same_ip(&vnn->public_address,
4379                                          &ips->ips[i].addr)) {
4380                                 /* IP is still in file */
4381                                 break;
4382                         }
4383                 }
4384
4385                 if (vnn == NULL) {
4386                         /* Delete IP ips->ips[i] */
4387                         struct ctdb_control_ip_iface *pub;
4388
4389                         DEBUG(DEBUG_NOTICE,
4390                               ("IP %s no longer configured, deleting it\n",
4391                                ctdb_addr_to_str(&ips->ips[i].addr)));
4392
4393                         pub = talloc_zero(mem_ctx,
4394                                           struct ctdb_control_ip_iface);
4395                         CTDB_NO_MEMORY(ctdb, pub);
4396
4397                         pub->addr  = ips->ips[i].addr;
4398                         pub->mask  = 0;
4399                         pub->len   = 0;
4400
4401                         timeout = TAKEOVER_TIMEOUT();
4402
4403                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4404                                               iface) + pub->len;
4405                         data.dptr = (uint8_t *)pub;
4406
4407                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4408                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4409                                                   0, data, async_data,
4410                                                   &timeout, NULL);
4411                         if (state == NULL) {
4412                                 DEBUG(DEBUG_ERR,
4413                                       (__location__
4414                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4415                                 goto failed;
4416                         }
4417
4418                         ctdb_client_async_add(async_data, state);
4419                 }
4420         }
4421
4422         /* Compare IPs between node and file for IPs to be added */
4423         first_add = true;
4424         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4425                 for (i = 0; i < ips->num; i++) {
4426                         if (ctdb_same_ip(&vnn->public_address,
4427                                          &ips->ips[i].addr)) {
4428                                 /* IP already on node */
4429                                 break;
4430                         }
4431                 }
4432                 if (i == ips->num) {
4433                         /* Add IP ips->ips[i] */
4434                         struct ctdb_control_ip_iface *pub;
4435                         const char *ifaces = NULL;
4436                         uint32_t len;
4437                         int iface = 0;
4438
4439                         DEBUG(DEBUG_NOTICE,
4440                               ("New IP %s configured, adding it\n",
4441                                ctdb_addr_to_str(&vnn->public_address)));
4442                         if (first_add) {
4443                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4444
4445                                 data.dsize = sizeof(pnn);
4446                                 data.dptr  = (uint8_t *)&pnn;
4447
4448                                 ret = ctdb_client_send_message(
4449                                         ctdb,
4450                                         CTDB_BROADCAST_CONNECTED,
4451                                         CTDB_SRVID_REBALANCE_NODE,
4452                                         data);
4453                                 if (ret != 0) {
4454                                         DEBUG(DEBUG_WARNING,
4455                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4456                                 }
4457
4458                                 first_add = false;
4459                         }
4460
4461                         ifaces = vnn->ifaces[0];
4462                         iface = 1;
4463                         while (vnn->ifaces[iface] != NULL) {
4464                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4465                                                          vnn->ifaces[iface]);
4466                                 iface++;
4467                         }
4468
4469                         len   = strlen(ifaces) + 1;
4470                         pub = talloc_zero_size(mem_ctx,
4471                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4472                         CTDB_NO_MEMORY(ctdb, pub);
4473
4474                         pub->addr  = vnn->public_address;
4475                         pub->mask  = vnn->public_netmask_bits;
4476                         pub->len   = len;
4477                         memcpy(&pub->iface[0], ifaces, pub->len);
4478
4479                         timeout = TAKEOVER_TIMEOUT();
4480
4481                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4482                                               iface) + pub->len;
4483                         data.dptr = (uint8_t *)pub;
4484
4485                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4486                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4487                                                   0, data, async_data,
4488                                                   &timeout, NULL);
4489                         if (state == NULL) {
4490                                 DEBUG(DEBUG_ERR,
4491                                       (__location__
4492                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4493                                 goto failed;
4494                         }
4495
4496                         ctdb_client_async_add(async_data, state);
4497                 }
4498         }
4499
4500         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4501                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4502                 goto failed;
4503         }
4504
4505         talloc_free(mem_ctx);
4506         return 0;
4507
4508 failed:
4509         talloc_free(mem_ctx);
4510         return -1;
4511 }
4512
4513 /* This control is sent to force the node to re-read the public addresses file
4514    and drop any addresses we should nnot longer host, and add new addresses
4515    that we are now able to host
4516 */
4517 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4518 {
4519         struct ctdb_reloadips_handle *h;
4520         pid_t parent = getpid();
4521
4522         if (ctdb->reload_ips != NULL) {
4523                 talloc_free(ctdb->reload_ips);
4524                 ctdb->reload_ips = NULL;
4525         }
4526
4527         h = talloc(ctdb, struct ctdb_reloadips_handle);
4528         CTDB_NO_MEMORY(ctdb, h);
4529         h->ctdb     = ctdb;
4530         h->c        = NULL;
4531         h->status   = -1;
4532         
4533         if (pipe(h->fd) == -1) {
4534                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4535                 talloc_free(h);
4536                 return -1;
4537         }
4538
4539         h->child = ctdb_fork(ctdb);
4540         if (h->child == (pid_t)-1) {
4541                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4542                 close(h->fd[0]);
4543                 close(h->fd[1]);
4544                 talloc_free(h);
4545                 return -1;
4546         }
4547
4548         /* child process */
4549         if (h->child == 0) {
4550                 signed char res = 0;
4551
4552                 close(h->fd[0]);
4553                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4554
4555                 ctdb_set_process_name("ctdb_reloadips");
4556                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4557                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4558                         res = -1;
4559                 } else {
4560                         res = ctdb_reloadips_child(ctdb);
4561                         if (res != 0) {
4562                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4563                         }
4564                 }
4565
4566                 sys_write(h->fd[1], &res, 1);
4567                 /* make sure we die when our parent dies */
4568                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4569                         sleep(5);
4570                 }
4571                 _exit(0);
4572         }
4573
4574         h->c             = talloc_steal(h, c);
4575
4576         close(h->fd[1]);
4577         set_close_on_exec(h->fd[0]);
4578
4579         talloc_set_destructor(h, ctdb_reloadips_destructor);
4580
4581
4582         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4583                                ctdb_reloadips_child_handler, (void *)h);
4584         tevent_fd_set_auto_close(h->fde);
4585
4586         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4587                          ctdb_reloadips_timeout_event, h);
4588
4589         /* we reply later */
4590         *async_reply = true;
4591         return 0;
4592 }