ctdb-takeover: NoIPHostOnAllDisabled is global across cluster
[kamenim/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/sys_rw.h"
34 #include "lib/util/util_process.h"
35
36 #include "ctdb_private.h"
37 #include "ctdb_client.h"
38
39 #include "common/rb_tree.h"
40 #include "common/reqid.h"
41 #include "common/system.h"
42 #include "common/common.h"
43 #include "common/logging.h"
44
45 #include "server/ipalloc.h"
46
/*
 * Timeout value built from the takeover_timeout tunable via
 * timeval_current_ofs().  The expansion refers to a variable named
 * "ctdb", which must be in scope wherever the macro is used.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument to timeval_current_ofs() when re-arming the gratuitous
 * ARP timer in ctdb_control_send_arp(). */
#define CTDB_ARP_INTERVAL 1
/* Number of ARP rounds ctdb_control_send_arp() performs before it frees
 * its state and stops re-arming the timer. */
#define CTDB_ARP_REPEAT   3
51
/* A network interface that public addresses can be placed on. */
struct ctdb_interface {
        /* Doubly-linked list linkage (list head is ctdb->ifaces) */
        struct ctdb_interface *prev;
        struct ctdb_interface *next;

        /* Interface name, a talloc child of this structure */
        const char *name;

        /* Link state; initialised to true when the entry is created */
        bool link_up;

        /* Number of VNNs whose address is currently assigned here */
        uint32_t references;
};
58
/* List node naming one interface that a given VNN may be hosted on. */
struct vnn_interface {
        /* Doubly-linked list linkage (list head is vnn->ifaces) */
        struct vnn_interface *prev;
        struct vnn_interface *next;

        /* The candidate interface itself */
        struct ctdb_interface *iface;
};
63
64 /* state associated with a public ip address */
65 struct ctdb_vnn {
66         struct ctdb_vnn *prev, *next;
67
68         struct ctdb_interface *iface;
69         struct vnn_interface *ifaces;
70         ctdb_sock_addr public_address;
71         uint8_t public_netmask_bits;
72
73         /* the node number that is serving this public address, if any.
74            If no node serves this ip it is set to -1 */
75         int32_t pnn;
76
77         /* List of clients to tickle for this public address */
78         struct ctdb_tcp_array *tcp_array;
79
80         /* whether we need to update the other nodes with changes to our list
81            of connected clients */
82         bool tcp_update_needed;
83
84         /* a context to hang sending gratious arp events off */
85         TALLOC_CTX *takeover_ctx;
86
87         /* Set to true any time an update to this VNN is in flight.
88            This helps to avoid races. */
89         bool update_in_flight;
90
91         /* If CTDB_CONTROL_DEL_PUBLIC_IP is received for this IP
92          * address then this flag is set.  It will be deleted in the
93          * release IP callback. */
94         bool delete_pending;
95 };
96
97 static const char *iface_string(const struct ctdb_interface *iface)
98 {
99         return (iface != NULL ? iface->name : "__none__");
100 }
101
102 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
103 {
104         return iface_string(vnn->iface);
105 }
106
107 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
108                                               const char *iface);
109
110 static struct ctdb_interface *
111 ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
112 {
113         struct ctdb_interface *i;
114
115         if (strlen(iface) > CTDB_IFACE_SIZE) {
116                 DEBUG(DEBUG_ERR, ("Interface name too long \"%s\"\n", iface));
117                 return NULL;
118         }
119
120         /* Verify that we don't have an entry for this ip yet */
121         i = ctdb_find_iface(ctdb, iface);
122         if (i != NULL) {
123                 return i;
124         }
125
126         /* create a new structure for this interface */
127         i = talloc_zero(ctdb, struct ctdb_interface);
128         if (i == NULL) {
129                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
130                 return NULL;
131         }
132         i->name = talloc_strdup(i, iface);
133         if (i->name == NULL) {
134                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
135                 talloc_free(i);
136                 return NULL;
137         }
138
139         i->link_up = true;
140
141         DLIST_ADD(ctdb->ifaces, i);
142
143         return i;
144 }
145
146 static bool vnn_has_interface(struct ctdb_vnn *vnn,
147                               const struct ctdb_interface *iface)
148 {
149         struct vnn_interface *i;
150
151         for (i = vnn->ifaces; i != NULL; i = i->next) {
152                 if (iface == i->iface) {
153                         return true;
154                 }
155         }
156
157         return false;
158 }
159
160 /* If any interfaces now have no possible IPs then delete them.  This
161  * implementation is naive (i.e. simple) rather than clever
162  * (i.e. complex).  Given that this is run on delip and that operation
163  * is rare, this doesn't need to be efficient - it needs to be
164  * foolproof.  One alternative is reference counting, where the logic
165  * is distributed and can, therefore, be broken in multiple places.
166  * Another alternative is to build a red-black tree of interfaces that
167  * can have addresses (by walking ctdb->vnn once) and then walking
168  * ctdb->ifaces once and deleting those not in the tree.  Let's go to
169  * one of those if the naive implementation causes problems...  :-)
170  */
171 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
172                                         struct ctdb_vnn *vnn)
173 {
174         struct ctdb_interface *i, *next;
175
176         /* For each interface, check if there's an IP using it. */
177         for (i = ctdb->ifaces; i != NULL; i = next) {
178                 struct ctdb_vnn *tv;
179                 bool found;
180                 next = i->next;
181
182                 /* Only consider interfaces named in the given VNN. */
183                 if (!vnn_has_interface(vnn, i)) {
184                         continue;
185                 }
186
187                 /* Search for a vnn with this interface. */
188                 found = false;
189                 for (tv=ctdb->vnn; tv; tv=tv->next) {
190                         if (vnn_has_interface(tv, i)) {
191                                 found = true;
192                                 break;
193                         }
194                 }
195
196                 if (!found) {
197                         /* None of the VNNs are using this interface. */
198                         DLIST_REMOVE(ctdb->ifaces, i);
199                         talloc_free(i);
200                 }
201         }
202 }
203
204
205 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
206                                               const char *iface)
207 {
208         struct ctdb_interface *i;
209
210         for (i=ctdb->ifaces;i;i=i->next) {
211                 if (strcmp(i->name, iface) == 0) {
212                         return i;
213                 }
214         }
215
216         return NULL;
217 }
218
219 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
220                                                   struct ctdb_vnn *vnn)
221 {
222         struct vnn_interface *i;
223         struct ctdb_interface *cur = NULL;
224         struct ctdb_interface *best = NULL;
225
226         for (i = vnn->ifaces; i != NULL; i = i->next) {
227
228                 cur = i->iface;
229
230                 if (!cur->link_up) {
231                         continue;
232                 }
233
234                 if (best == NULL) {
235                         best = cur;
236                         continue;
237                 }
238
239                 if (cur->references < best->references) {
240                         best = cur;
241                         continue;
242                 }
243         }
244
245         return best;
246 }
247
248 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
249                                      struct ctdb_vnn *vnn)
250 {
251         struct ctdb_interface *best = NULL;
252
253         if (vnn->iface) {
254                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
255                                    "still assigned to iface '%s'\n",
256                                    ctdb_addr_to_str(&vnn->public_address),
257                                    ctdb_vnn_iface_string(vnn)));
258                 return 0;
259         }
260
261         best = ctdb_vnn_best_iface(ctdb, vnn);
262         if (best == NULL) {
263                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
264                                   "cannot assign to iface any iface\n",
265                                   ctdb_addr_to_str(&vnn->public_address)));
266                 return -1;
267         }
268
269         vnn->iface = best;
270         best->references++;
271         vnn->pnn = ctdb->pnn;
272
273         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
274                            "now assigned to iface '%s' refs[%d]\n",
275                            ctdb_addr_to_str(&vnn->public_address),
276                            ctdb_vnn_iface_string(vnn),
277                            best->references));
278         return 0;
279 }
280
281 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
282                                     struct ctdb_vnn *vnn)
283 {
284         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
285                            "now unassigned (old iface '%s' refs[%d])\n",
286                            ctdb_addr_to_str(&vnn->public_address),
287                            ctdb_vnn_iface_string(vnn),
288                            vnn->iface?vnn->iface->references:0));
289         if (vnn->iface) {
290                 vnn->iface->references--;
291         }
292         vnn->iface = NULL;
293         if (vnn->pnn == ctdb->pnn) {
294                 vnn->pnn = -1;
295         }
296 }
297
298 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
299                                struct ctdb_vnn *vnn)
300 {
301         struct vnn_interface *i;
302
303         /* Nodes that are not RUNNING can not host IPs */
304         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
305                 return false;
306         }
307
308         if (vnn->delete_pending) {
309                 return false;
310         }
311
312         if (vnn->iface && vnn->iface->link_up) {
313                 return true;
314         }
315
316         for (i = vnn->ifaces; i != NULL; i = i->next) {
317                 if (i->iface->link_up) {
318                         return true;
319                 }
320         }
321
322         return false;
323 }
324
/*
 * State for one series of gratuitous ARPs (plus TCP tickle ACKs) sent
 * after an address has been taken over.  Created by
 * ctdb_announce_vnn_iface() and consumed by ctdb_control_send_arp().
 */
struct ctdb_takeover_arp {
        /* Daemon context; supplies the event context for the timer */
        struct ctdb_context *ctdb;
        /* Number of rounds sent so far; the series ends when this
         * reaches CTDB_ARP_REPEAT */
        uint32_t count;
        /* Address to send ARPs for */
        ctdb_sock_addr addr;
        /* Connections to send tickle ACKs for (may be NULL); taken
         * over from vnn->tcp_array when the series is started */
        struct ctdb_tcp_array *tcparray;
        /* VNN the series belongs to; provides the interface name and
         * the talloc context the timer is attached to */
        struct ctdb_vnn *vnn;
};
332
333
334 /*
335   lists of tcp endpoints
336  */
337 struct ctdb_tcp_list {
338         struct ctdb_tcp_list *prev, *next;
339         struct ctdb_connection connection;
340 };
341
342 /*
343   list of clients to kill on IP release
344  */
345 struct ctdb_client_ip {
346         struct ctdb_client_ip *prev, *next;
347         struct ctdb_context *ctdb;
348         ctdb_sock_addr addr;
349         uint32_t client_id;
350 };
351
352
353 /*
354   send a gratuitous arp
355  */
356 static void ctdb_control_send_arp(struct tevent_context *ev,
357                                   struct tevent_timer *te,
358                                   struct timeval t, void *private_data)
359 {
360         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
361                                                         struct ctdb_takeover_arp);
362         int i, ret;
363         struct ctdb_tcp_array *tcparray;
364         const char *iface = ctdb_vnn_iface_string(arp->vnn);
365
366         ret = ctdb_sys_send_arp(&arp->addr, iface);
367         if (ret != 0) {
368                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
369                                   iface, strerror(errno)));
370         }
371
372         tcparray = arp->tcparray;
373         if (tcparray) {
374                 for (i=0;i<tcparray->num;i++) {
375                         struct ctdb_connection *tcon;
376
377                         tcon = &tcparray->connections[i];
378                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
379                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
380                                 ctdb_addr_to_str(&tcon->src),
381                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
382                         ret = ctdb_sys_send_tcp(
383                                 &tcon->src,
384                                 &tcon->dst,
385                                 0, 0, 0);
386                         if (ret != 0) {
387                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
388                                         ctdb_addr_to_str(&tcon->src)));
389                         }
390                 }
391         }
392
393         arp->count++;
394
395         if (arp->count == CTDB_ARP_REPEAT) {
396                 talloc_free(arp);
397                 return;
398         }
399
400         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
401                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
402                          ctdb_control_send_arp, arp);
403 }
404
405 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
406                                        struct ctdb_vnn *vnn)
407 {
408         struct ctdb_takeover_arp *arp;
409         struct ctdb_tcp_array *tcparray;
410
411         if (!vnn->takeover_ctx) {
412                 vnn->takeover_ctx = talloc_new(vnn);
413                 if (!vnn->takeover_ctx) {
414                         return -1;
415                 }
416         }
417
418         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
419         if (!arp) {
420                 return -1;
421         }
422
423         arp->ctdb = ctdb;
424         arp->addr = vnn->public_address;
425         arp->vnn  = vnn;
426
427         tcparray = vnn->tcp_array;
428         if (tcparray) {
429                 /* add all of the known tcp connections for this IP to the
430                    list of tcp connections to send tickle acks for */
431                 arp->tcparray = talloc_steal(arp, tcparray);
432
433                 vnn->tcp_array = NULL;
434                 vnn->tcp_update_needed = true;
435         }
436
437         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
438                          timeval_zero(), ctdb_control_send_arp, arp);
439
440         return 0;
441 }
442
/* State carried from ctdb_do_takeip() to ctdb_do_takeip_callback() */
struct ctdb_do_takeip_state {
        /* Control request to reply to once the event script finishes;
         * NULL until the event has been queued successfully */
        struct ctdb_req_control_old *c;
        /* The public address being taken over */
        struct ctdb_vnn *vnn;
};
447
448 /*
449   called when takeip event finishes
450  */
451 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
452                                     void *private_data)
453 {
454         struct ctdb_do_takeip_state *state =
455                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
456         int32_t ret;
457         TDB_DATA data;
458
459         if (status != 0) {
460                 if (status == -ETIME) {
461                         ctdb_ban_self(ctdb);
462                 }
463                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
464                                  ctdb_addr_to_str(&state->vnn->public_address),
465                                  ctdb_vnn_iface_string(state->vnn)));
466                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
467
468                 talloc_free(state);
469                 return;
470         }
471
472         if (ctdb->do_checkpublicip) {
473
474         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
475         if (ret != 0) {
476                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
477                 talloc_free(state);
478                 return;
479         }
480
481         }
482
483         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
484         data.dsize = strlen((char *)data.dptr) + 1;
485         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
486
487         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
488
489
490         /* the control succeeded */
491         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
492         talloc_free(state);
493         return;
494 }
495
496 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
497 {
498         state->vnn->update_in_flight = false;
499         return 0;
500 }
501
502 /*
503   take over an ip address
504  */
505 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
506                               struct ctdb_req_control_old *c,
507                               struct ctdb_vnn *vnn)
508 {
509         int ret;
510         struct ctdb_do_takeip_state *state;
511
512         if (vnn->update_in_flight) {
513                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
514                                     "update for this IP already in flight\n",
515                                     ctdb_addr_to_str(&vnn->public_address),
516                                     vnn->public_netmask_bits));
517                 return -1;
518         }
519
520         ret = ctdb_vnn_assign_iface(ctdb, vnn);
521         if (ret != 0) {
522                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
523                                  "assign a usable interface\n",
524                                  ctdb_addr_to_str(&vnn->public_address),
525                                  vnn->public_netmask_bits));
526                 return -1;
527         }
528
529         state = talloc(vnn, struct ctdb_do_takeip_state);
530         CTDB_NO_MEMORY(ctdb, state);
531
532         state->c = NULL;
533         state->vnn   = vnn;
534
535         vnn->update_in_flight = true;
536         talloc_set_destructor(state, ctdb_takeip_destructor);
537
538         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
539                             ctdb_addr_to_str(&vnn->public_address),
540                             vnn->public_netmask_bits,
541                             ctdb_vnn_iface_string(vnn)));
542
543         ret = ctdb_event_script_callback(ctdb,
544                                          state,
545                                          ctdb_do_takeip_callback,
546                                          state,
547                                          CTDB_EVENT_TAKE_IP,
548                                          "%s %s %u",
549                                          ctdb_vnn_iface_string(vnn),
550                                          ctdb_addr_to_str(&vnn->public_address),
551                                          vnn->public_netmask_bits);
552
553         if (ret != 0) {
554                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
555                         ctdb_addr_to_str(&vnn->public_address),
556                         ctdb_vnn_iface_string(vnn)));
557                 talloc_free(state);
558                 return -1;
559         }
560
561         state->c = talloc_steal(ctdb, c);
562         return 0;
563 }
564
/* State carried from ctdb_do_updateip() to ctdb_do_updateip_callback() */
struct ctdb_do_updateip_state {
        /* Control request to reply to once the event script finishes;
         * NULL until the event has been queued successfully */
        struct ctdb_req_control_old *c;
        /* Interface the address was on before the update (as read from
         * vnn->iface when the update started) */
        struct ctdb_interface *old;
        /* The public address being moved */
        struct ctdb_vnn *vnn;
};
570
571 /*
572   called when updateip event finishes
573  */
574 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
575                                       void *private_data)
576 {
577         struct ctdb_do_updateip_state *state =
578                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
579         int32_t ret;
580
581         if (status != 0) {
582                 if (status == -ETIME) {
583                         ctdb_ban_self(ctdb);
584                 }
585                 DEBUG(DEBUG_ERR,
586                       ("Failed update of IP %s from interface %s to %s\n",
587                        ctdb_addr_to_str(&state->vnn->public_address),
588                        iface_string(state->old),
589                        ctdb_vnn_iface_string(state->vnn)));
590
591                 /*
592                  * All we can do is reset the old interface
593                  * and let the next run fix it
594                  */
595                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
596                 state->vnn->iface = state->old;
597                 state->vnn->iface->references++;
598
599                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
600                 talloc_free(state);
601                 return;
602         }
603
604         if (ctdb->do_checkpublicip) {
605
606         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
607         if (ret != 0) {
608                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
609                 talloc_free(state);
610                 return;
611         }
612
613         }
614
615         /* the control succeeded */
616         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
617         talloc_free(state);
618         return;
619 }
620
621 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
622 {
623         state->vnn->update_in_flight = false;
624         return 0;
625 }
626
627 /*
628   update (move) an ip address
629  */
630 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
631                                 struct ctdb_req_control_old *c,
632                                 struct ctdb_vnn *vnn)
633 {
634         int ret;
635         struct ctdb_do_updateip_state *state;
636         struct ctdb_interface *old = vnn->iface;
637         const char *old_name = iface_string(old);
638         const char *new_name;
639
640         if (vnn->update_in_flight) {
641                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
642                                     "update for this IP already in flight\n",
643                                     ctdb_addr_to_str(&vnn->public_address),
644                                     vnn->public_netmask_bits));
645                 return -1;
646         }
647
648         ctdb_vnn_unassign_iface(ctdb, vnn);
649         ret = ctdb_vnn_assign_iface(ctdb, vnn);
650         if (ret != 0) {
651                 DEBUG(DEBUG_ERR,("Update of IP %s/%u failed to "
652                                  "assign a usable interface (old iface '%s')\n",
653                                  ctdb_addr_to_str(&vnn->public_address),
654                                  vnn->public_netmask_bits,
655                                  old_name));
656                 return -1;
657         }
658
659         if (old == vnn->iface) {
660                 /* A benign update from one interface onto itself.
661                  * no need to run the eventscripts in this case, just return
662                  * success.
663                  */
664                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
665                 return 0;
666         }
667
668         state = talloc(vnn, struct ctdb_do_updateip_state);
669         CTDB_NO_MEMORY(ctdb, state);
670
671         state->c = NULL;
672         state->old = old;
673         state->vnn = vnn;
674
675         vnn->update_in_flight = true;
676         talloc_set_destructor(state, ctdb_updateip_destructor);
677
678         new_name = ctdb_vnn_iface_string(vnn);
679         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
680                             "interface %s to %s\n",
681                             ctdb_addr_to_str(&vnn->public_address),
682                             vnn->public_netmask_bits,
683                             old_name,
684                             new_name));
685
686         ret = ctdb_event_script_callback(ctdb,
687                                          state,
688                                          ctdb_do_updateip_callback,
689                                          state,
690                                          CTDB_EVENT_UPDATE_IP,
691                                          "%s %s %s %u",
692                                          old_name,
693                                          new_name,
694                                          ctdb_addr_to_str(&vnn->public_address),
695                                          vnn->public_netmask_bits);
696         if (ret != 0) {
697                 DEBUG(DEBUG_ERR,
698                       ("Failed update IP %s from interface %s to %s\n",
699                        ctdb_addr_to_str(&vnn->public_address),
700                        old_name, new_name));
701                 talloc_free(state);
702                 return -1;
703         }
704
705         state->c = talloc_steal(ctdb, c);
706         return 0;
707 }
708
709 /*
710   Find the vnn of the node that has a public ip address
711   returns -1 if the address is not known as a public address
712  */
713 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
714 {
715         struct ctdb_vnn *vnn;
716
717         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
718                 if (ctdb_same_ip(&vnn->public_address, addr)) {
719                         return vnn;
720                 }
721         }
722
723         return NULL;
724 }
725
726 /*
727   take over an ip address
728  */
729 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
730                                  struct ctdb_req_control_old *c,
731                                  TDB_DATA indata,
732                                  bool *async_reply)
733 {
734         int ret;
735         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
736         struct ctdb_vnn *vnn;
737         bool have_ip = false;
738         bool do_updateip = false;
739         bool do_takeip = false;
740         struct ctdb_interface *best_iface = NULL;
741
742         if (pip->pnn != ctdb->pnn) {
743                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
744                                  "with pnn %d, but we're node %d\n",
745                                  ctdb_addr_to_str(&pip->addr),
746                                  pip->pnn, ctdb->pnn));
747                 return -1;
748         }
749
750         /* update out vnn list */
751         vnn = find_public_ip_vnn(ctdb, &pip->addr);
752         if (vnn == NULL) {
753                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
754                         ctdb_addr_to_str(&pip->addr)));
755                 return 0;
756         }
757
758         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
759                 have_ip = ctdb_sys_have_ip(&pip->addr);
760         }
761         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
762         if (best_iface == NULL) {
763                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
764                                  "a usable interface (old %s, have_ip %d)\n",
765                                  ctdb_addr_to_str(&vnn->public_address),
766                                  vnn->public_netmask_bits,
767                                  ctdb_vnn_iface_string(vnn),
768                                  have_ip));
769                 return -1;
770         }
771
772         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
773                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
774                                   "and we have it on iface[%s], but it was assigned to node %d"
775                                   "and we are node %d, banning ourself\n",
776                                  ctdb_addr_to_str(&vnn->public_address),
777                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
778                 ctdb_ban_self(ctdb);
779                 return -1;
780         }
781
782         if (vnn->pnn == -1 && have_ip) {
783                 /* This will cause connections to be reset and
784                  * reestablished.  However, this is a very unusual
785                  * situation and doing this will completely repair the
786                  * inconsistency in the VNN.
787                  */
788                 DEBUG(DEBUG_WARNING,
789                       (__location__
790                        " Doing updateip for IP %s already on an interface\n",
791                        ctdb_addr_to_str(&vnn->public_address)));
792                 do_updateip = true;
793         }
794
795         if (vnn->iface) {
796                 if (vnn->iface != best_iface) {
797                         if (!vnn->iface->link_up) {
798                                 do_updateip = true;
799                         } else if (vnn->iface->references > (best_iface->references + 1)) {
800                                 /* only move when the rebalance gains something */
801                                         do_updateip = true;
802                         }
803                 }
804         }
805
806         if (!have_ip) {
807                 if (do_updateip) {
808                         ctdb_vnn_unassign_iface(ctdb, vnn);
809                         do_updateip = false;
810                 }
811                 do_takeip = true;
812         }
813
814         if (do_takeip) {
815                 ret = ctdb_do_takeip(ctdb, c, vnn);
816                 if (ret != 0) {
817                         return -1;
818                 }
819         } else if (do_updateip) {
820                 ret = ctdb_do_updateip(ctdb, c, vnn);
821                 if (ret != 0) {
822                         return -1;
823                 }
824         } else {
825                 /*
826                  * The interface is up and the kernel known the ip
827                  * => do nothing
828                  */
829                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
830                         ctdb_addr_to_str(&pip->addr),
831                         vnn->public_netmask_bits,
832                         ctdb_vnn_iface_string(vnn)));
833                 return 0;
834         }
835
836         /* tell ctdb_control.c that we will be replying asynchronously */
837         *async_reply = true;
838
839         return 0;
840 }
841
/*
 * Remove a public address from this node entirely: unlink the VNN,
 * drop its interface assignment, delete any interface that no other
 * VNN lists any more, and free the VNN.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        /* Unlink first so that the orphan check below, which walks
         * ctdb->vnn, does not count this VNN as a user */
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        talloc_free(vnn);
}
849
850 static struct ctdb_vnn *release_ip_post(struct ctdb_context *ctdb,
851                                         struct ctdb_vnn *vnn,
852                                         ctdb_sock_addr *addr)
853 {
854         TDB_DATA data;
855
856         /* Send a message to all clients of this node telling them
857          * that the cluster has been reconfigured and they should
858          * close any connections on this IP address
859          */
860         data.dptr = (uint8_t *)ctdb_addr_to_str(addr);
861         data.dsize = strlen((char *)data.dptr)+1;
862         DEBUG(DEBUG_INFO, ("Sending RELEASE_IP message for %s\n", data.dptr));
863         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
864
865         ctdb_vnn_unassign_iface(ctdb, vnn);
866
867         /* Process the IP if it has been marked for deletion */
868         if (vnn->delete_pending) {
869                 do_delete_ip(ctdb, vnn);
870                 return NULL;
871         }
872
873         return vnn;
874 }
875
/*
 * State carried from ctdb_control_release_ip() to
 * release_ip_callback() while the "releaseip" event runs.
 */
struct release_ip_callback_state {
        /* Control request to reply to when the event finishes */
        struct ctdb_req_control_old *c;
        /* Copy of the address being released */
        ctdb_sock_addr *addr;
        /* VNN being released; set to NULL if the VNN gets deleted */
        struct ctdb_vnn *vnn;
        /* PNN recorded in the VNN once the release has succeeded */
        uint32_t target_pnn;
};
882
883 /*
884   called when releaseip event finishes
885  */
886 static void release_ip_callback(struct ctdb_context *ctdb, int status,
887                                 void *private_data)
888 {
889         struct release_ip_callback_state *state =
890                 talloc_get_type(private_data, struct release_ip_callback_state);
891
892         if (status == -ETIME) {
893                 ctdb_ban_self(ctdb);
894         }
895
896         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
897                 if  (ctdb_sys_have_ip(state->addr)) {
898                         DEBUG(DEBUG_ERR,
899                               ("IP %s still hosted during release IP callback, failing\n",
900                                ctdb_addr_to_str(state->addr)));
901                         ctdb_request_control_reply(ctdb, state->c,
902                                                    NULL, -1, NULL);
903                         talloc_free(state);
904                         return;
905                 }
906         }
907
908         state->vnn->pnn = state->target_pnn;
909         state->vnn = release_ip_post(ctdb, state->vnn, state->addr);
910
911         /* the control succeeded */
912         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
913         talloc_free(state);
914 }
915
916 static int ctdb_releaseip_destructor(struct release_ip_callback_state *state)
917 {
918         if (state->vnn != NULL) {
919                 state->vnn->update_in_flight = false;
920         }
921         return 0;
922 }
923
924 /*
925   release an ip address
926  */
927 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
928                                 struct ctdb_req_control_old *c,
929                                 TDB_DATA indata, 
930                                 bool *async_reply)
931 {
932         int ret;
933         struct release_ip_callback_state *state;
934         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
935         struct ctdb_vnn *vnn;
936         const char *iface;
937
938         /* update our vnn list */
939         vnn = find_public_ip_vnn(ctdb, &pip->addr);
940         if (vnn == NULL) {
941                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
942                         ctdb_addr_to_str(&pip->addr)));
943                 return 0;
944         }
945
946         /* stop any previous arps */
947         talloc_free(vnn->takeover_ctx);
948         vnn->takeover_ctx = NULL;
949
950         /* RELEASE_IP controls are sent to all nodes that should not
951          * be hosting a particular IP.  This serves 2 purposes.  The
952          * first is to help resolve any inconsistencies.  If a node
953          * does unexpectly host an IP then it will be released.  The
954          * 2nd is to use a "redundant release" to tell non-takeover
955          * nodes where an IP is moving to.  This is how "ctdb ip" can
956          * report the (likely) location of an IP by only asking the
957          * local node.  Redundant releases need to update the PNN but
958          * are otherwise ignored.
959          */
960         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
961                 if (!ctdb_sys_have_ip(&pip->addr)) {
962                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
963                                 ctdb_addr_to_str(&pip->addr),
964                                 vnn->public_netmask_bits,
965                                 ctdb_vnn_iface_string(vnn)));
966                         vnn->pnn = pip->pnn;
967                         ctdb_vnn_unassign_iface(ctdb, vnn);
968                         return 0;
969                 }
970         } else {
971                 if (vnn->iface == NULL) {
972                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
973                                            ctdb_addr_to_str(&pip->addr),
974                                            vnn->public_netmask_bits));
975                         vnn->pnn = pip->pnn;
976                         return 0;
977                 }
978         }
979
980         /* There is a potential race between take_ip and us because we
981          * update the VNN via a callback that run when the
982          * eventscripts have been run.  Avoid the race by allowing one
983          * update to be in flight at a time.
984          */
985         if (vnn->update_in_flight) {
986                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
987                                     "update for this IP already in flight\n",
988                                     ctdb_addr_to_str(&vnn->public_address),
989                                     vnn->public_netmask_bits));
990                 return -1;
991         }
992
993         iface = ctdb_vnn_iface_string(vnn);
994
995         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
996                 ctdb_addr_to_str(&pip->addr),
997                 vnn->public_netmask_bits,
998                 iface,
999                 pip->pnn));
1000
1001         state = talloc(ctdb, struct release_ip_callback_state);
1002         if (state == NULL) {
1003                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1004                                __FILE__, __LINE__);
1005                 return -1;
1006         }
1007
1008         state->c = NULL;
1009         state->addr = talloc(state, ctdb_sock_addr);
1010         if (state->addr == NULL) {
1011                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1012                                __FILE__, __LINE__);
1013                 talloc_free(state);
1014                 return -1;
1015         }
1016         *state->addr = pip->addr;
1017         state->target_pnn = pip->pnn;
1018         state->vnn   = vnn;
1019
1020         vnn->update_in_flight = true;
1021         talloc_set_destructor(state, ctdb_releaseip_destructor);
1022
1023         ret = ctdb_event_script_callback(ctdb, 
1024                                          state, release_ip_callback, state,
1025                                          CTDB_EVENT_RELEASE_IP,
1026                                          "%s %s %u",
1027                                          iface,
1028                                          ctdb_addr_to_str(&pip->addr),
1029                                          vnn->public_netmask_bits);
1030         if (ret != 0) {
1031                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1032                         ctdb_addr_to_str(&pip->addr),
1033                         ctdb_vnn_iface_string(vnn)));
1034                 talloc_free(state);
1035                 return -1;
1036         }
1037
1038         /* tell the control that we will be reply asynchronously */
1039         *async_reply = true;
1040         state->c = talloc_steal(state, c);
1041         return 0;
1042 }
1043
1044 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1045                                    ctdb_sock_addr *addr,
1046                                    unsigned mask, const char *ifaces,
1047                                    bool check_address)
1048 {
1049         struct ctdb_vnn      *vnn;
1050         char *tmp;
1051         const char *iface;
1052
1053         /* Verify that we don't have an entry for this IP yet */
1054         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
1055                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1056                         DEBUG(DEBUG_ERR,
1057                               ("Duplicate public IP address '%s'\n",
1058                                ctdb_addr_to_str(addr)));
1059                         return -1;
1060                 }
1061         }
1062
1063         /* Create a new VNN structure for this IP address */
1064         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1065         if (vnn == NULL) {
1066                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1067                 return -1;
1068         }
1069         tmp = talloc_strdup(vnn, ifaces);
1070         if (tmp == NULL) {
1071                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1072                 talloc_free(vnn);
1073                 return -1;
1074         }
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 struct vnn_interface *vnn_iface;
1077                 struct ctdb_interface *i;
1078                 if (!ctdb_sys_check_iface_exists(iface)) {
1079                         DEBUG(DEBUG_ERR,
1080                               ("Unknown interface %s for public address %s\n",
1081                                iface, ctdb_addr_to_str(addr)));
1082                         talloc_free(vnn);
1083                         return -1;
1084                 }
1085
1086                 i = ctdb_add_local_iface(ctdb, iface);
1087                 if (i == NULL) {
1088                         DEBUG(DEBUG_ERR,
1089                               ("Failed to add interface '%s' "
1090                                "for public address %s\n",
1091                                iface, ctdb_addr_to_str(addr)));
1092                         talloc_free(vnn);
1093                         return -1;
1094                 }
1095
1096                 vnn_iface = talloc_zero(vnn, struct vnn_interface);
1097                 if (vnn_iface == NULL) {
1098                         DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1099                         talloc_free(vnn);
1100                         return -1;
1101                 }
1102
1103                 vnn_iface->iface = i;
1104                 DLIST_ADD_END(vnn->ifaces, vnn_iface);
1105         }
1106         talloc_free(tmp);
1107         vnn->public_address      = *addr;
1108         vnn->public_netmask_bits = mask;
1109         vnn->pnn                 = -1;
1110
1111         DLIST_ADD(ctdb->vnn, vnn);
1112
1113         return 0;
1114 }
1115
1116 /*
1117   setup the public address lists from a file
1118 */
1119 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1120 {
1121         char **lines;
1122         int nlines;
1123         int i;
1124
1125         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1126         if (lines == NULL) {
1127                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1128                 return -1;
1129         }
1130         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1131                 nlines--;
1132         }
1133
1134         for (i=0;i<nlines;i++) {
1135                 unsigned mask;
1136                 ctdb_sock_addr addr;
1137                 const char *addrstr;
1138                 const char *ifaces;
1139                 char *tok, *line;
1140
1141                 line = lines[i];
1142                 while ((*line == ' ') || (*line == '\t')) {
1143                         line++;
1144                 }
1145                 if (*line == '#') {
1146                         continue;
1147                 }
1148                 if (strcmp(line, "") == 0) {
1149                         continue;
1150                 }
1151                 tok = strtok(line, " \t");
1152                 addrstr = tok;
1153                 tok = strtok(NULL, " \t");
1154                 if (tok == NULL) {
1155                         if (NULL == ctdb->default_public_interface) {
1156                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1157                                          i+1));
1158                                 talloc_free(lines);
1159                                 return -1;
1160                         }
1161                         ifaces = ctdb->default_public_interface;
1162                 } else {
1163                         ifaces = tok;
1164                 }
1165
1166                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1167                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1168                         talloc_free(lines);
1169                         return -1;
1170                 }
1171                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1172                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1173                         talloc_free(lines);
1174                         return -1;
1175                 }
1176         }
1177
1178
1179         talloc_free(lines);
1180         return 0;
1181 }
1182
1183 static struct ctdb_public_ip_list *
1184 ctdb_fetch_remote_public_ips(struct ctdb_context *ctdb,
1185                              TALLOC_CTX *mem_ctx,
1186                              struct ctdb_node_map_old *nodemap,
1187                              uint32_t public_ip_flags)
1188 {
1189         int j, ret;
1190         struct ctdb_public_ip_list_old *ip_list;
1191         struct ctdb_public_ip_list *public_ips;
1192
1193         public_ips = talloc_zero_array(mem_ctx,
1194                                        struct ctdb_public_ip_list,
1195                                        nodemap->num);
1196         if (public_ips == NULL) {
1197                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1198                 return NULL;
1199         }
1200
1201         for (j = 0; j < nodemap->num; j++) {
1202                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1203                         continue;
1204                 }
1205
1206                 /* Retrieve the list of public IPs from the
1207                  * node. Flags says whether it is known or
1208                  * available. */
1209                 ret = ctdb_ctrl_get_public_ips_flags(
1210                         ctdb, TAKEOVER_TIMEOUT(), j, public_ips,
1211                         public_ip_flags, &ip_list);
1212                 if (ret != 0) {
1213                         DEBUG(DEBUG_ERR,
1214                               ("Failed to read public IPs from node: %u\n", j));
1215                         talloc_free(public_ips);
1216                         return NULL;
1217                 }
1218                 public_ips[j].num = ip_list->num;
1219                 if (ip_list->num == 0) {
1220                         talloc_free(ip_list);
1221                         continue;
1222                 }
1223                 public_ips[j].ip = talloc_zero_array(public_ips,
1224                                                      struct ctdb_public_ip,
1225                                                      ip_list->num);
1226                 if (public_ips[j].ip == NULL) {
1227                         DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1228                         talloc_free(public_ips);
1229                         return NULL;
1230                 }
1231                 memcpy(public_ips[j].ip, &ip_list->ips[0],
1232                        sizeof(struct ctdb_public_ip) * ip_list->num);
1233                 talloc_free(ip_list);
1234         }
1235
1236         return public_ips;
1237 }
1238
1239 static struct ctdb_node_map *
1240 ctdb_node_map_old_to_new(TALLOC_CTX *mem_ctx,
1241                          const struct ctdb_node_map_old *old)
1242 {
1243         struct ctdb_node_map *new;
1244
1245         new = talloc(mem_ctx, struct ctdb_node_map);
1246         if (new == NULL) {
1247                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1248                 return NULL;
1249         }
1250         new->num = old->num;
1251         new->node = talloc_zero_array(new,
1252                                       struct ctdb_node_and_flags, new->num);
1253         memcpy(new->node, &old->nodes[0],
1254                sizeof(struct ctdb_node_and_flags) * new->num);
1255
1256         return new;
1257 }
1258
1259
/*
 * Hand the node flags from nodemap to the IP allocation state.
 *
 * The old-style node map is converted to a temporary new-style one,
 * passed to ipalloc_set_node_flags() and freed again.
 *
 * Returns false if the conversion fails, true otherwise.
 */
static bool set_ipflags(struct ctdb_context *ctdb,
                        struct ipalloc_state *ipalloc_state,
                        struct ctdb_node_map_old *nodemap)
{
        struct ctdb_node_map *converted =
                ctdb_node_map_old_to_new(ipalloc_state, nodemap);

        if (converted != NULL) {
                ipalloc_set_node_flags(ipalloc_state, converted);
                talloc_free(converted);
                return true;
        }

        return false;
}
1277
1278 static enum ipalloc_algorithm
1279 determine_algorithm(const struct ctdb_tunable_list *tunables)
1280 {
1281         if (1 == tunables->lcp2_public_ip_assignment) {
1282                 return IPALLOC_LCP2;
1283         } else if (1 == tunables->deterministic_public_ips) {
1284                 return IPALLOC_DETERMINISTIC;
1285         } else {
1286                 return IPALLOC_NONDETERMINISTIC;
1287         }
1288 }
1289
/*
 * Per-node failure accounting for a takeover run, filled in by
 * takeover_run_fail_callback().
 */
struct takeover_callback_data {
        /* Number of entries in fail_count */
        uint32_t num_nodes;
        /* Failure counter for each node, indexed by PNN */
        unsigned int *fail_count;
};
1294
1295 static struct takeover_callback_data *
1296 takeover_callback_data_init(TALLOC_CTX *mem_ctx,
1297                             uint32_t num_nodes)
1298 {
1299         static struct takeover_callback_data *takeover_data;
1300
1301         takeover_data = talloc_zero(mem_ctx, struct takeover_callback_data);
1302         if (takeover_data == NULL) {
1303                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1304                 return NULL;
1305         }
1306
1307         takeover_data->fail_count = talloc_zero_array(takeover_data,
1308                                                       unsigned int, num_nodes);
1309         if (takeover_data->fail_count == NULL) {
1310                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1311                 talloc_free(takeover_data);
1312                 return NULL;
1313         }
1314
1315         takeover_data->num_nodes = num_nodes;
1316
1317         return takeover_data;
1318 }
1319
1320 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
1321                                        uint32_t node_pnn, int32_t res,
1322                                        TDB_DATA outdata, void *callback_data)
1323 {
1324         struct takeover_callback_data *cd =
1325                 talloc_get_type_abort(callback_data,
1326                                       struct takeover_callback_data);
1327
1328         if (node_pnn >= cd->num_nodes) {
1329                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
1330                 return;
1331         }
1332
1333         if (cd->fail_count[node_pnn] == 0) {
1334                 DEBUG(DEBUG_ERR,
1335                       ("Node %u failed the takeover run\n", node_pnn));
1336         }
1337
1338         cd->fail_count[node_pnn]++;
1339 }
1340
1341 static void takeover_run_process_failures(struct ctdb_context *ctdb,
1342                                           struct takeover_callback_data *tcd)
1343 {
1344         unsigned int max_fails = 0;
1345         uint32_t max_pnn = -1;
1346         uint32_t i;
1347
1348         for (i = 0; i < tcd->num_nodes; i++) {
1349                 if (tcd->fail_count[i] > max_fails) {
1350                         max_pnn = i;
1351                         max_fails = tcd->fail_count[i];
1352                 }
1353         }
1354
1355         if (max_fails > 0) {
1356                 int ret;
1357                 TDB_DATA data;
1358
1359                 DEBUG(DEBUG_ERR,
1360                       ("Sending banning credits to %u with fail count %u\n",
1361                        max_pnn, max_fails));
1362
1363                 data.dptr = (uint8_t *)&max_pnn;
1364                 data.dsize = sizeof(uint32_t);
1365                 ret = ctdb_client_send_message(ctdb,
1366                                                CTDB_BROADCAST_CONNECTED,
1367                                                CTDB_SRVID_BANNING,
1368                                                data);
1369                 if (ret != 0) {
1370                         DEBUG(DEBUG_ERR,
1371                               ("Failed to set banning credits for node %u\n",
1372                                max_pnn));
1373                 }
1374         }
1375 }
1376
/*
 * Recalculate the allocation of public IPs to nodes and have the
 * nodes host their allocated addresses.
 *
 * - If IP failover is disabled, skip straight to IPREALLOCATED
 * - Initialise IP allocation state.  Pass:
 *   + algorithm to be used;
 *   + the NoIPTakeover, NoIPFailback and NoIPHostOnAllDisabled
 *     settings, all taken from the tunables of this node; and
 *   + list of nodes to force rebalance (internal structure, currently
 *     no way to fetch, only used by LCP2 for nodes that have had new
 *     IP addresses added).
 * - Set node flags for IP allocation based on the node map
 * - Retrieve known and available IP addresses (done separately so
 *   values can be faked in unit testing)
 * - Use ipalloc_set_public_ips() to set known and available IP
 *   addresses for allocation
 * - If cluster can't host IP addresses then skip straight to
 *   IPREALLOCATED
 * - Run IP allocation algorithm
 * - Send RELEASE_IP to all nodes for IPs they should not host
 * - Send TAKEOVER_IP to all nodes for IPs they should host
 * - Send IPREALLOCATED to all connected nodes
 *
 * Returns 0 on success and -1 on failure.  If one of the RELEASE_IP,
 * TAKEOVER_IP or IPREALLOCATED stages fails then
 * takeover_run_process_failures() is called before returning.
 */
int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
                      uint32_t *force_rebalance_nodes)
{
        int i, ret;
        struct ctdb_public_ip ip;
        uint32_t *nodes;
        struct public_ip_list *all_ips, *tmp_ip;
        TDB_DATA data;
        struct timeval timeout;
        struct client_async_data *async_data;
        struct ctdb_client_control_state *state;
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        struct ipalloc_state *ipalloc_state;
        struct ctdb_public_ip_list *known_ips, *available_ips;
        struct takeover_callback_data *takeover_data;

        /* Initialise fail callback data to be used with
         * takeover_run_fail_callback().  A failure in any of the
         * following steps will cause an early return, so this can be
         * reused for each of those steps without re-initialising. */
        takeover_data = takeover_callback_data_init(tmp_ctx,
                                                    nodemap->num);
        if (takeover_data == NULL) {
                talloc_free(tmp_ctx);
                return -1;
        }

        /* Default timeout for early jump to IPREALLOCATED.  See below
         * for explanation of 3 times... */
        timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);

        /*
         * ip failover is completely disabled, just send out the
         * ipreallocated event.
         */
        if (ctdb->tunable.disable_ip_failover != 0) {
                goto ipreallocated;
        }

        /* Everything used for the allocation hangs off tmp_ctx */
        ipalloc_state = ipalloc_state_init(
                tmp_ctx, ctdb->num_nodes,
                determine_algorithm(&ctdb->tunable),
                (ctdb->tunable.no_ip_takeover != 0),
                (ctdb->tunable.no_ip_failback != 0),
                (ctdb->tunable.no_ip_host_on_all_disabled != 0),
                force_rebalance_nodes);
        if (ipalloc_state == NULL) {
                talloc_free(tmp_ctx);
                return -1;
        }

        if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
                DEBUG(DEBUG_ERR,
                      ("Failed to set IP flags - aborting takeover run\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

        /* Fetch lists of known and available public IPs from each
         * active node */
        known_ips = ctdb_fetch_remote_public_ips(ctdb, ipalloc_state,
                                                 nodemap, 0);
        if (known_ips == NULL) {
                DEBUG(DEBUG_ERR, ("Failed to read known public IPs\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
        available_ips = ctdb_fetch_remote_public_ips(
                ctdb, ipalloc_state, nodemap,
                CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE);
        if (available_ips == NULL) {
                DEBUG(DEBUG_ERR, ("Failed to read available public IPs\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

        ipalloc_set_public_ips(ipalloc_state, known_ips, available_ips);

        if (! ipalloc_can_host_ips(ipalloc_state)) {
                DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
                goto ipreallocated;
        }

        /* Do the IP reassignment calculations */
        all_ips = ipalloc(ipalloc_state);
        if (all_ips == NULL) {
                talloc_free(tmp_ctx);
                return -1;
        }

        /* Now tell all nodes to release any public IPs they should
         * not host.  This will be a NOOP on nodes that don't
         * currently hold the given IP.
         */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        async_data->fail_callback = takeover_run_fail_callback;
        async_data->callback_data = takeover_data;

        ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */

        /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
         * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
         * seconds.  However, RELEASE_IP can take longer due to TCP
         * connection killing, so sometimes needs more time.
         * Therefore, use a cumulative timeout of TakeoverTimeout * 3
         * seconds across all 3 stages.  No explicit expiry checks are
         * needed before each stage because tevent is smart enough to
         * fire the timeouts even if they are in the past.  Initialise
         * this here so it explicitly covers the stages we're
         * interested in but, in particular, not the time taken by the
         * ipalloc().
         */
        timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);

        /* Send a RELEASE_IP to all nodes that should not be hosting
         * each IP.  For each IP, all but one of these will be
         * redundant.  However, the redundant ones are used to tell
         * nodes which node should be hosting the IP so that commands
         * like "ctdb ip" can display a particular node's idea of who
         * is hosting what. */
        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */
                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }

                for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                        if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
                                /* This node should be serving this
                                   vnn so don't tell it to release the ip
                                */
                                continue;
                        }
                        ip.pnn  = tmp_ip->pnn;
                        ip.addr = tmp_ip->addr;

                        data.dsize = sizeof(ip);
                        data.dptr  = (uint8_t *)&ip;
                        state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                  0, CTDB_CONTROL_RELEASE_IP, 0,
                                                  data, async_data,
                                                  &timeout, NULL);
                        if (state == NULL) {
                                DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
                                talloc_free(tmp_ctx);
                                return -1;
                        }

                        ctdb_client_async_add(async_data, state);
                }
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,
                      ("Async control CTDB_CONTROL_RELEASE_IP failed\n"));
                goto fail;
        }
        talloc_free(async_data);


        /* For each IP, send a TAKEOVER_IP to the node that should be
         * hosting it.  Many of these will often be redundant (since
         * the allocation won't have changed) but they can be useful
         * to recover from inconsistencies. */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        async_data->fail_callback = takeover_run_fail_callback;
        async_data->callback_data = takeover_data;

        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        /* this IP won't be taken over */
                        continue;
                }

                ip.pnn  = tmp_ip->pnn;
                ip.addr = tmp_ip->addr;

                data.dsize = sizeof(ip);
                data.dptr  = (uint8_t *)&ip;
                state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                          0, CTDB_CONTROL_TAKEOVER_IP, 0,
                                          data, async_data, &timeout, NULL);
                if (state == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
                        talloc_free(tmp_ctx);
                        return -1;
                }

                ctdb_client_async_add(async_data, state);
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,
                      ("Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
                goto fail;
        }

ipreallocated:
        /*
         * Tell all nodes to run eventscripts to process the
         * "ipreallocated" event.  This can do a lot of things,
         * including restarting services to reconfigure them if public
         * IPs have moved.  Once upon a time this event only used to
         * update natgw.
         */
        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
        ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
                                        nodes, 0, timeout,
                                        false, tdb_null,
                                        NULL, takeover_run_fail_callback,
                                        takeover_data);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,
                      ("Async CTDB_CONTROL_IPREALLOCATED control failed\n"));
                goto fail;
        }

        talloc_free(tmp_ctx);
        return ret;

fail:
        /* Report the node with the most recorded failures before
         * giving up */
        takeover_run_process_failures(ctdb, takeover_data);
        talloc_free(tmp_ctx);
        return -1;
}
1630
1631
1632 /*
1633   destroy a ctdb_client_ip structure
1634  */
1635 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1636 {
1637         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1638                 ctdb_addr_to_str(&ip->addr),
1639                 ntohs(ip->addr.ip.sin_port),
1640                 ip->client_id));
1641
1642         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1643         return 0;
1644 }
1645
1646 /*
1647   called by a client to inform us of a TCP connection that it is managing
1648   that should tickled with an ACK when IP takeover is done
1649  */
1650 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1651                                 TDB_DATA indata)
1652 {
1653         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
1654         struct ctdb_connection *tcp_sock = NULL;
1655         struct ctdb_tcp_list *tcp;
1656         struct ctdb_connection t;
1657         int ret;
1658         TDB_DATA data;
1659         struct ctdb_client_ip *ip;
1660         struct ctdb_vnn *vnn;
1661         ctdb_sock_addr addr;
1662
1663         /* If we don't have public IPs, tickles are useless */
1664         if (ctdb->vnn == NULL) {
1665                 return 0;
1666         }
1667
1668         tcp_sock = (struct ctdb_connection *)indata.dptr;
1669
1670         addr = tcp_sock->src;
1671         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1672         addr = tcp_sock->dst;
1673         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
1674
1675         ZERO_STRUCT(addr);
1676         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
1677         vnn = find_public_ip_vnn(ctdb, &addr);
1678         if (vnn == NULL) {
1679                 switch (addr.sa.sa_family) {
1680                 case AF_INET:
1681                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1682                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1683                                         ctdb_addr_to_str(&addr)));
1684                         }
1685                         break;
1686                 case AF_INET6:
1687                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1688                                 ctdb_addr_to_str(&addr)));
1689                         break;
1690                 default:
1691                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1692                 }
1693
1694                 return 0;
1695         }
1696
1697         if (vnn->pnn != ctdb->pnn) {
1698                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1699                         ctdb_addr_to_str(&addr),
1700                         client_id, client->pid));
1701                 /* failing this call will tell smbd to die */
1702                 return -1;
1703         }
1704
1705         ip = talloc(client, struct ctdb_client_ip);
1706         CTDB_NO_MEMORY(ctdb, ip);
1707
1708         ip->ctdb      = ctdb;
1709         ip->addr      = addr;
1710         ip->client_id = client_id;
1711         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1712         DLIST_ADD(ctdb->client_ip_list, ip);
1713
1714         tcp = talloc(client, struct ctdb_tcp_list);
1715         CTDB_NO_MEMORY(ctdb, tcp);
1716
1717         tcp->connection.src = tcp_sock->src;
1718         tcp->connection.dst = tcp_sock->dst;
1719
1720         DLIST_ADD(client->tcp_list, tcp);
1721
1722         t.src = tcp_sock->src;
1723         t.dst = tcp_sock->dst;
1724
1725         data.dptr = (uint8_t *)&t;
1726         data.dsize = sizeof(t);
1727
1728         switch (addr.sa.sa_family) {
1729         case AF_INET:
1730                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1731                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
1732                         ctdb_addr_to_str(&tcp_sock->src),
1733                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1734                 break;
1735         case AF_INET6:
1736                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1737                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
1738                         ctdb_addr_to_str(&tcp_sock->src),
1739                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1740                 break;
1741         default:
1742                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1743         }
1744
1745
1746         /* tell all nodes about this tcp connection */
1747         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1748                                        CTDB_CONTROL_TCP_ADD,
1749                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1750         if (ret != 0) {
1751                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1752                 return -1;
1753         }
1754
1755         return 0;
1756 }
1757
1758 /*
1759   find a tcp address on a list
1760  */
1761 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
1762                                            struct ctdb_connection *tcp)
1763 {
1764         int i;
1765
1766         if (array == NULL) {
1767                 return NULL;
1768         }
1769
1770         for (i=0;i<array->num;i++) {
1771                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
1772                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
1773                         return &array->connections[i];
1774                 }
1775         }
1776         return NULL;
1777 }
1778
1779
1780
1781 /*
1782   called by a daemon to inform us of a TCP connection that one of its
1783   clients managing that should tickled with an ACK when IP takeover is
1784   done
1785  */
1786 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
1787 {
1788         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
1789         struct ctdb_tcp_array *tcparray;
1790         struct ctdb_connection tcp;
1791         struct ctdb_vnn *vnn;
1792
1793         /* If we don't have public IPs, tickles are useless */
1794         if (ctdb->vnn == NULL) {
1795                 return 0;
1796         }
1797
1798         vnn = find_public_ip_vnn(ctdb, &p->dst);
1799         if (vnn == NULL) {
1800                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1801                         ctdb_addr_to_str(&p->dst)));
1802
1803                 return -1;
1804         }
1805
1806
1807         tcparray = vnn->tcp_array;
1808
1809         /* If this is the first tickle */
1810         if (tcparray == NULL) {
1811                 tcparray = talloc(vnn, struct ctdb_tcp_array);
1812                 CTDB_NO_MEMORY(ctdb, tcparray);
1813                 vnn->tcp_array = tcparray;
1814
1815                 tcparray->num = 0;
1816                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
1817                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1818
1819                 tcparray->connections[tcparray->num].src = p->src;
1820                 tcparray->connections[tcparray->num].dst = p->dst;
1821                 tcparray->num++;
1822
1823                 if (tcp_update_needed) {
1824                         vnn->tcp_update_needed = true;
1825                 }
1826                 return 0;
1827         }
1828
1829
1830         /* Do we already have this tickle ?*/
1831         tcp.src = p->src;
1832         tcp.dst = p->dst;
1833         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
1834                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1835                         ctdb_addr_to_str(&tcp.dst),
1836                         ntohs(tcp.dst.ip.sin_port),
1837                         vnn->pnn));
1838                 return 0;
1839         }
1840
1841         /* A new tickle, we must add it to the array */
1842         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1843                                         struct ctdb_connection,
1844                                         tcparray->num+1);
1845         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1846
1847         tcparray->connections[tcparray->num].src = p->src;
1848         tcparray->connections[tcparray->num].dst = p->dst;
1849         tcparray->num++;
1850
1851         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1852                 ctdb_addr_to_str(&tcp.dst),
1853                 ntohs(tcp.dst.ip.sin_port),
1854                 vnn->pnn));
1855
1856         if (tcp_update_needed) {
1857                 vnn->tcp_update_needed = true;
1858         }
1859
1860         return 0;
1861 }
1862
1863
1864 static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
1865 {
1866         struct ctdb_connection *tcpp;
1867
1868         if (vnn == NULL) {
1869                 return;
1870         }
1871
1872         /* if the array is empty we cant remove it
1873            and we don't need to do anything
1874          */
1875         if (vnn->tcp_array == NULL) {
1876                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1877                         ctdb_addr_to_str(&conn->dst),
1878                         ntohs(conn->dst.ip.sin_port)));
1879                 return;
1880         }
1881
1882
1883         /* See if we know this connection
1884            if we don't know this connection  then we dont need to do anything
1885          */
1886         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1887         if (tcpp == NULL) {
1888                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1889                         ctdb_addr_to_str(&conn->dst),
1890                         ntohs(conn->dst.ip.sin_port)));
1891                 return;
1892         }
1893
1894
1895         /* We need to remove this entry from the array.
1896            Instead of allocating a new array and copying data to it
1897            we cheat and just copy the last entry in the existing array
1898            to the entry that is to be removed and just shring the 
1899            ->num field
1900          */
1901         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1902         vnn->tcp_array->num--;
1903
1904         /* If we deleted the last entry we also need to remove the entire array
1905          */
1906         if (vnn->tcp_array->num == 0) {
1907                 talloc_free(vnn->tcp_array);
1908                 vnn->tcp_array = NULL;
1909         }               
1910
1911         vnn->tcp_update_needed = true;
1912
1913         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1914                 ctdb_addr_to_str(&conn->src),
1915                 ntohs(conn->src.ip.sin_port)));
1916 }
1917
1918
1919 /*
1920   called by a daemon to inform us of a TCP connection that one of its
1921   clients used are no longer needed in the tickle database
1922  */
1923 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
1924 {
1925         struct ctdb_vnn *vnn;
1926         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
1927
1928         /* If we don't have public IPs, tickles are useless */
1929         if (ctdb->vnn == NULL) {
1930                 return 0;
1931         }
1932
1933         vnn = find_public_ip_vnn(ctdb, &conn->dst);
1934         if (vnn == NULL) {
1935                 DEBUG(DEBUG_ERR,
1936                       (__location__ " unable to find public address %s\n",
1937                        ctdb_addr_to_str(&conn->dst)));
1938                 return 0;
1939         }
1940
1941         ctdb_remove_connection(vnn, conn);
1942
1943         return 0;
1944 }
1945
1946
1947 /*
1948   Called when another daemon starts - causes all tickles for all
1949   public addresses we are serving to be sent to the new node on the
1950   next check.  This actually causes the next scheduled call to
1951   tdb_update_tcp_tickles() to update all nodes.  This is simple and
1952   doesn't require careful error handling.
1953  */
1954 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
1955 {
1956         struct ctdb_vnn *vnn;
1957
1958         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
1959                            (unsigned long) pnn));
1960
1961         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
1962                 vnn->tcp_update_needed = true;
1963         }
1964
1965         return 0;
1966 }
1967
1968
1969 /*
1970   called when a client structure goes away - hook to remove
1971   elements from the tcp_list in all daemons
1972  */
1973 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1974 {
1975         while (client->tcp_list) {
1976                 struct ctdb_vnn *vnn;
1977                 struct ctdb_tcp_list *tcp = client->tcp_list;
1978                 struct ctdb_connection *conn = &tcp->connection;
1979
1980                 DLIST_REMOVE(client->tcp_list, tcp);
1981
1982                 vnn = find_public_ip_vnn(client->ctdb,
1983                                          &conn->dst);
1984                 if (vnn == NULL) {
1985                         DEBUG(DEBUG_ERR,
1986                               (__location__ " unable to find public address %s\n",
1987                                ctdb_addr_to_str(&conn->dst)));
1988                         continue;
1989                 }
1990
1991                 /* If the IP address is hosted on this node then
1992                  * remove the connection. */
1993                 if (vnn->pnn == client->ctdb->pnn) {
1994                         ctdb_remove_connection(vnn, conn);
1995                 }
1996
1997                 /* Otherwise this function has been called because the
1998                  * server IP address has been released to another node
1999                  * and the client has exited.  This means that we
2000                  * should not delete the connection information.  The
2001                  * takeover node processes connections too. */
2002         }
2003 }
2004
2005
2006 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2007 {
2008         struct ctdb_vnn *vnn, *next;
2009         int count = 0;
2010
2011         if (ctdb->tunable.disable_ip_failover == 1) {
2012                 return;
2013         }
2014
2015         for (vnn = ctdb->vnn; vnn != NULL; vnn = next) {
2016                 /* vnn can be freed below in release_ip_post() */
2017                 next = vnn->next;
2018
2019                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2020                         ctdb_vnn_unassign_iface(ctdb, vnn);
2021                         continue;
2022                 }
2023
2024                 /* Don't allow multiple releases at once.  Some code,
2025                  * particularly ctdb_tickle_sentenced_connections() is
2026                  * not re-entrant */
2027                 if (vnn->update_in_flight) {
2028                         DEBUG(DEBUG_WARNING,
2029                               (__location__
2030                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
2031                                     ctdb_addr_to_str(&vnn->public_address),
2032                                     vnn->public_netmask_bits,
2033                                     ctdb_vnn_iface_string(vnn)));
2034                         continue;
2035                 }
2036                 vnn->update_in_flight = true;
2037
2038                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
2039                                     ctdb_addr_to_str(&vnn->public_address),
2040                                     vnn->public_netmask_bits,
2041                                     ctdb_vnn_iface_string(vnn)));
2042
2043                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2044                                        ctdb_vnn_iface_string(vnn),
2045                                        ctdb_addr_to_str(&vnn->public_address),
2046                                        vnn->public_netmask_bits);
2047                 /* releaseip timeouts are converted to success, so to
2048                  * detect failures just check if the IP address is
2049                  * still there...
2050                  */
2051                 if (ctdb_sys_have_ip(&vnn->public_address)) {
2052                         DEBUG(DEBUG_ERR,
2053                               (__location__
2054                                " IP address %s not released\n",
2055                                ctdb_addr_to_str(&vnn->public_address)));
2056                         vnn->update_in_flight = false;
2057                         continue;
2058                 }
2059
2060                 vnn = release_ip_post(ctdb, vnn, &vnn->public_address);
2061                 if (vnn != NULL) {
2062                         vnn->update_in_flight = false;
2063                 }
2064                 count++;
2065         }
2066
2067         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
2068 }
2069
2070
2071 /*
2072   get list of public IPs
2073  */
2074 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2075                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
2076 {
2077         int i, num, len;
2078         struct ctdb_public_ip_list_old *ips;
2079         struct ctdb_vnn *vnn;
2080         bool only_available = false;
2081
2082         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2083                 only_available = true;
2084         }
2085
2086         /* count how many public ip structures we have */
2087         num = 0;
2088         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2089                 num++;
2090         }
2091
2092         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2093                 num*sizeof(struct ctdb_public_ip);
2094         ips = talloc_zero_size(outdata, len);
2095         CTDB_NO_MEMORY(ctdb, ips);
2096
2097         i = 0;
2098         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2099                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2100                         continue;
2101                 }
2102                 ips->ips[i].pnn  = vnn->pnn;
2103                 ips->ips[i].addr = vnn->public_address;
2104                 i++;
2105         }
2106         ips->num = i;
2107         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2108                 i*sizeof(struct ctdb_public_ip);
2109
2110         outdata->dsize = len;
2111         outdata->dptr  = (uint8_t *)ips;
2112
2113         return 0;
2114 }
2115
2116
2117 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2118                                         struct ctdb_req_control_old *c,
2119                                         TDB_DATA indata,
2120                                         TDB_DATA *outdata)
2121 {
2122         int i, num, len;
2123         ctdb_sock_addr *addr;
2124         struct ctdb_public_ip_info_old *info;
2125         struct ctdb_vnn *vnn;
2126         struct vnn_interface *iface;
2127
2128         addr = (ctdb_sock_addr *)indata.dptr;
2129
2130         vnn = find_public_ip_vnn(ctdb, addr);
2131         if (vnn == NULL) {
2132                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2133                                  "'%s'not a public address\n",
2134                                  ctdb_addr_to_str(addr)));
2135                 return -1;
2136         }
2137
2138         /* count how many public ip structures we have */
2139         num = 0;
2140         for (iface = vnn->ifaces; iface != NULL; iface = iface->next) {
2141                 num++;
2142         }
2143
2144         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2145                 num*sizeof(struct ctdb_iface);
2146         info = talloc_zero_size(outdata, len);
2147         CTDB_NO_MEMORY(ctdb, info);
2148
2149         info->ip.addr = vnn->public_address;
2150         info->ip.pnn = vnn->pnn;
2151         info->active_idx = 0xFFFFFFFF;
2152
2153         i = 0;
2154         for (iface = vnn->ifaces; iface != NULL; iface = iface->next) {
2155                 struct ctdb_interface *cur;
2156
2157                 cur = iface->iface;
2158                 if (vnn->iface == cur) {
2159                         info->active_idx = i;
2160                 }
2161                 strncpy(info->ifaces[i].name, cur->name,
2162                         sizeof(info->ifaces[i].name));
2163                 info->ifaces[i].name[sizeof(info->ifaces[i].name)-1] = '\0';
2164                 info->ifaces[i].link_state = cur->link_up;
2165                 info->ifaces[i].references = cur->references;
2166
2167                 i++;
2168         }
2169         info->num = i;
2170         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2171                 i*sizeof(struct ctdb_iface);
2172
2173         outdata->dsize = len;
2174         outdata->dptr  = (uint8_t *)info;
2175
2176         return 0;
2177 }
2178
2179 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2180                                 struct ctdb_req_control_old *c,
2181                                 TDB_DATA *outdata)
2182 {
2183         int i, num, len;
2184         struct ctdb_iface_list_old *ifaces;
2185         struct ctdb_interface *cur;
2186
2187         /* count how many public ip structures we have */
2188         num = 0;
2189         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2190                 num++;
2191         }
2192
2193         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2194                 num*sizeof(struct ctdb_iface);
2195         ifaces = talloc_zero_size(outdata, len);
2196         CTDB_NO_MEMORY(ctdb, ifaces);
2197
2198         i = 0;
2199         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2200                 strncpy(ifaces->ifaces[i].name, cur->name,
2201                         sizeof(ifaces->ifaces[i].name));
2202                 ifaces->ifaces[i].name[sizeof(ifaces->ifaces[i].name)-1] = '\0';
2203                 ifaces->ifaces[i].link_state = cur->link_up;
2204                 ifaces->ifaces[i].references = cur->references;
2205                 i++;
2206         }
2207         ifaces->num = i;
2208         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2209                 i*sizeof(struct ctdb_iface);
2210
2211         outdata->dsize = len;
2212         outdata->dptr  = (uint8_t *)ifaces;
2213
2214         return 0;
2215 }
2216
2217 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2218                                     struct ctdb_req_control_old *c,
2219                                     TDB_DATA indata)
2220 {
2221         struct ctdb_iface *info;
2222         struct ctdb_interface *iface;
2223         bool link_up = false;
2224
2225         info = (struct ctdb_iface *)indata.dptr;
2226
2227         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2228                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2229                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2230                                   len, len, info->name));
2231                 return -1;
2232         }
2233
2234         switch (info->link_state) {
2235         case 0:
2236                 link_up = false;
2237                 break;
2238         case 1:
2239                 link_up = true;
2240                 break;
2241         default:
2242                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2243                                   (unsigned int)info->link_state));
2244                 return -1;
2245         }
2246
2247         if (info->references != 0) {
2248                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2249                                   (unsigned int)info->references));
2250                 return -1;
2251         }
2252
2253         iface = ctdb_find_iface(ctdb, info->name);
2254         if (iface == NULL) {
2255                 return -1;
2256         }
2257
2258         if (link_up == iface->link_up) {
2259                 return 0;
2260         }
2261
2262         DEBUG(DEBUG_ERR,
2263               ("iface[%s] has changed it's link status %s => %s\n",
2264                iface->name,
2265                iface->link_up?"up":"down",
2266                link_up?"up":"down"));
2267
2268         iface->link_up = link_up;
2269         return 0;
2270 }
2271
2272
2273 /*
2274   called by a daemon to inform us of the entire list of TCP tickles for
2275   a particular public address.
2276   this control should only be sent by the node that is currently serving
2277   that public address.
2278  */
2279 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2280 {
2281         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
2282         struct ctdb_tcp_array *tcparray;
2283         struct ctdb_vnn *vnn;
2284
2285         /* We must at least have tickles.num or else we cant verify the size
2286            of the received data blob
2287          */
2288         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
2289                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
2290                 return -1;
2291         }
2292
2293         /* verify that the size of data matches what we expect */
2294         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
2295                          + sizeof(struct ctdb_connection) * list->num) {
2296                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
2297                 return -1;
2298         }
2299
2300         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
2301                            ctdb_addr_to_str(&list->addr)));
2302
2303         vnn = find_public_ip_vnn(ctdb, &list->addr);
2304         if (vnn == NULL) {
2305                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
2306                         ctdb_addr_to_str(&list->addr)));
2307
2308                 return 1;
2309         }
2310
2311         if (vnn->pnn == ctdb->pnn) {
2312                 DEBUG(DEBUG_INFO,
2313                       ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
2314                        ctdb_addr_to_str(&list->addr)));
2315                 return 0;
2316         }
2317
2318         /* remove any old ticklelist we might have */
2319         talloc_free(vnn->tcp_array);
2320         vnn->tcp_array = NULL;
2321
2322         tcparray = talloc(vnn, struct ctdb_tcp_array);
2323         CTDB_NO_MEMORY(ctdb, tcparray);
2324
2325         tcparray->num = list->num;
2326
2327         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
2328         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2329
2330         memcpy(tcparray->connections, &list->connections[0],
2331                sizeof(struct ctdb_connection)*tcparray->num);
2332
2333         /* We now have a new fresh tickle list array for this vnn */
2334         vnn->tcp_array = tcparray;
2335
2336         return 0;
2337 }
2338
2339 /*
2340   called to return the full list of tickles for the puclic address associated 
2341   with the provided vnn
2342  */
2343 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2344 {
2345         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2346         struct ctdb_tickle_list_old *list;
2347         struct ctdb_tcp_array *tcparray;
2348         int num, i;
2349         struct ctdb_vnn *vnn;
2350         unsigned port;
2351
2352         vnn = find_public_ip_vnn(ctdb, addr);
2353         if (vnn == NULL) {
2354                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
2355                         ctdb_addr_to_str(addr)));
2356
2357                 return 1;
2358         }
2359
2360         port = ctdb_addr_to_port(addr);
2361
2362         tcparray = vnn->tcp_array;
2363         num = 0;
2364         if (tcparray != NULL) {
2365                 if (port == 0) {
2366                         /* All connections */
2367                         num = tcparray->num;
2368                 } else {
2369                         /* Count connections for port */
2370                         for (i = 0; i < tcparray->num; i++) {
2371                                 if (port == ctdb_addr_to_port(&tcparray->connections[i].dst)) {
2372                                         num++;
2373                                 }
2374                         }
2375                 }
2376         }
2377
2378         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
2379                         + sizeof(struct ctdb_connection) * num;
2380
2381         outdata->dptr  = talloc_size(outdata, outdata->dsize);
2382         CTDB_NO_MEMORY(ctdb, outdata->dptr);
2383         list = (struct ctdb_tickle_list_old *)outdata->dptr;
2384
2385         list->addr = *addr;
2386         list->num = num;
2387
2388         if (num == 0) {
2389                 return 0;
2390         }
2391
2392         num = 0;
2393         for (i = 0; i < tcparray->num; i++) {
2394                 if (port == 0 || \
2395                     port == ctdb_addr_to_port(&tcparray->connections[i].dst)) {
2396                         list->connections[num] = tcparray->connections[i];
2397                         num++;
2398                 }
2399         }
2400
2401         return 0;
2402 }
2403
2404
2405 /*
2406   set the list of all tcp tickles for a public address
2407  */
2408 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
2409                                             ctdb_sock_addr *addr,
2410                                             struct ctdb_tcp_array *tcparray)
2411 {
2412         int ret, num;
2413         TDB_DATA data;
2414         struct ctdb_tickle_list_old *list;
2415
2416         if (tcparray) {
2417                 num = tcparray->num;
2418         } else {
2419                 num = 0;
2420         }
2421
2422         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
2423                         sizeof(struct ctdb_connection) * num;
2424         data.dptr = talloc_size(ctdb, data.dsize);
2425         CTDB_NO_MEMORY(ctdb, data.dptr);
2426
2427         list = (struct ctdb_tickle_list_old *)data.dptr;
2428         list->addr = *addr;
2429         list->num = num;
2430         if (tcparray) {
2431                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
2432         }
2433
2434         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
2435                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2436                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2437         if (ret != 0) {
2438                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2439                 return -1;
2440         }
2441
2442         talloc_free(data.dptr);
2443
2444         return ret;
2445 }
2446
2447
2448 /*
2449   perform tickle updates if required
2450  */
2451 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
2452                                     struct tevent_timer *te,
2453                                     struct timeval t, void *private_data)
2454 {
2455         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2456         int ret;
2457         struct ctdb_vnn *vnn;
2458
2459         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2460                 /* we only send out updates for public addresses that 
2461                    we have taken over
2462                  */
2463                 if (ctdb->pnn != vnn->pnn) {
2464                         continue;
2465                 }
2466                 /* We only send out the updates if we need to */
2467                 if (!vnn->tcp_update_needed) {
2468                         continue;
2469                 }
2470                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
2471                                                        &vnn->public_address,
2472                                                        vnn->tcp_array);
2473                 if (ret != 0) {
2474                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2475                                 ctdb_addr_to_str(&vnn->public_address)));
2476                 } else {
2477                         DEBUG(DEBUG_INFO,
2478                               ("Sent tickle update for public address %s\n",
2479                                ctdb_addr_to_str(&vnn->public_address)));
2480                         vnn->tcp_update_needed = false;
2481                 }
2482         }
2483
2484         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
2485                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2486                          ctdb_update_tcp_tickles, ctdb);
2487 }
2488
2489 /*
2490   start periodic update of tcp tickles
2491  */
2492 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2493 {
2494         ctdb->tickle_update_context = talloc_new(ctdb);
2495
2496         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
2497                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2498                          ctdb_update_tcp_tickles, ctdb);
2499 }
2500
2501
2502
2503
/* State for one series of gratuitous ARPs, passed as private data to
 * the send_gratious_arp() timer handler */
struct control_gratious_arp {
        struct ctdb_context *ctdb;      /* daemon context; supplies the event loop */
        ctdb_sock_addr addr;            /* address to announce */
        const char *iface;              /* name of the interface to send on */
        int count;                      /* number of send attempts made so far */
};
2510
2511 /*
2512   send a control_gratuitous arp
2513  */
2514 static void send_gratious_arp(struct tevent_context *ev,
2515                               struct tevent_timer *te,
2516                               struct timeval t, void *private_data)
2517 {
2518         int ret;
2519         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2520                                                         struct control_gratious_arp);
2521
2522         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2523         if (ret != 0) {
2524                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2525                                  arp->iface, strerror(errno)));
2526         }
2527
2528
2529         arp->count++;
2530         if (arp->count == CTDB_ARP_REPEAT) {
2531                 talloc_free(arp);
2532                 return;
2533         }
2534
2535         tevent_add_timer(arp->ctdb->ev, arp,
2536                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
2537                          send_gratious_arp, arp);
2538 }
2539
2540
2541 /*
2542   send a gratious arp 
2543  */
2544 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2545 {
2546         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
2547         struct control_gratious_arp *arp;
2548
2549         /* verify the size of indata */
2550         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
2551                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2552                                  (unsigned)indata.dsize, 
2553                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
2554                 return -1;
2555         }
2556         if (indata.dsize != 
2557                 ( offsetof(struct ctdb_addr_info_old, iface)
2558                 + gratious_arp->len ) ){
2559
2560                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2561                         "but should be %u bytes\n", 
2562                          (unsigned)indata.dsize, 
2563                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
2564                 return -1;
2565         }
2566
2567
2568         arp = talloc(ctdb, struct control_gratious_arp);
2569         CTDB_NO_MEMORY(ctdb, arp);
2570
2571         arp->ctdb  = ctdb;
2572         arp->addr   = gratious_arp->addr;
2573         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2574         CTDB_NO_MEMORY(ctdb, arp->iface);
2575         arp->count = 0;
2576
2577         tevent_add_timer(arp->ctdb->ev, arp,
2578                          timeval_zero(), send_gratious_arp, arp);
2579
2580         return 0;
2581 }
2582
2583 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2584 {
2585         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
2586         int ret;
2587
2588         /* verify the size of indata */
2589         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
2590                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
2591                 return -1;
2592         }
2593         if (indata.dsize != 
2594                 ( offsetof(struct ctdb_addr_info_old, iface)
2595                 + pub->len ) ){
2596
2597                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2598                         "but should be %u bytes\n", 
2599                          (unsigned)indata.dsize, 
2600                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
2601                 return -1;
2602         }
2603
2604         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
2605
2606         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
2607
2608         if (ret != 0) {
2609                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2610                 return -1;
2611         }
2612
2613         return 0;
2614 }
2615
2616 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2617 {
2618         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
2619         struct ctdb_vnn *vnn;
2620
2621         /* verify the size of indata */
2622         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
2623                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
2624                 return -1;
2625         }
2626         if (indata.dsize != 
2627                 ( offsetof(struct ctdb_addr_info_old, iface)
2628                 + pub->len ) ){
2629
2630                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2631                         "but should be %u bytes\n", 
2632                          (unsigned)indata.dsize, 
2633                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
2634                 return -1;
2635         }
2636
2637         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
2638
2639         /* walk over all public addresses until we find a match */
2640         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2641                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2642                         if (vnn->pnn == ctdb->pnn) {
2643                                 /* This IP is currently being hosted.
2644                                  * Defer the deletion until the next
2645                                  * takeover run. "ctdb reloadips" will
2646                                  * always cause a takeover run.  "ctdb
2647                                  * delip" will now need an explicit
2648                                  * "ctdb ipreallocated" afterwards. */
2649                                 vnn->delete_pending = true;
2650                         } else {
2651                                 /* This IP is not hosted on the
2652                                  * current node so just delete it
2653                                  * now. */
2654                                 do_delete_ip(ctdb, vnn);
2655                         }
2656
2657                         return 0;
2658                 }
2659         }
2660
2661         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
2662                          ctdb_addr_to_str(&pub->addr)));
2663         return -1;
2664 }
2665
2666
/* Private data for ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
        /* the control request that is replied to when the event completes */
        struct ctdb_req_control_old *c;
};
2670
2671 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
2672                                         int status, void *p)
2673 {
2674         struct ipreallocated_callback_state *state =
2675                 talloc_get_type(p, struct ipreallocated_callback_state);
2676
2677         if (status != 0) {
2678                 DEBUG(DEBUG_ERR,
2679                       (" \"ipreallocated\" event script failed (status %d)\n",
2680                        status));
2681                 if (status == -ETIME) {
2682                         ctdb_ban_self(ctdb);
2683                 }
2684         }
2685
2686         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
2687         talloc_free(state);
2688 }
2689
2690 /* A control to run the ipreallocated event */
2691 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
2692                                    struct ctdb_req_control_old *c,
2693                                    bool *async_reply)
2694 {
2695         int ret;
2696         struct ipreallocated_callback_state *state;
2697
2698         state = talloc(ctdb, struct ipreallocated_callback_state);
2699         CTDB_NO_MEMORY(ctdb, state);
2700
2701         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
2702
2703         ret = ctdb_event_script_callback(ctdb, state,
2704                                          ctdb_ipreallocated_callback, state,
2705                                          CTDB_EVENT_IPREALLOCATED,
2706                                          "%s", "");
2707
2708         if (ret != 0) {
2709                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
2710                 talloc_free(state);
2711                 return -1;
2712         }
2713
2714         /* tell the control that we will be reply asynchronously */
2715         state->c    = talloc_steal(state, c);
2716         *async_reply = true;
2717
2718         return 0;
2719 }
2720
2721
/* State of one in-progress "reload public IPs" operation */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;      /* daemon context */
        struct ctdb_req_control_old *c; /* pending control; replied to by the destructor */
        int status;                     /* status sent in the reply */
        int fd[2];                      /* pipe: child writes fd[1], parent reads fd[0] */
        pid_t child;                    /* pid of the forked reloadips child */
        struct tevent_fd *fde;          /* read event on fd[0] */
};
2730
2731 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
2732 {
2733         if (h == h->ctdb->reload_ips) {
2734                 h->ctdb->reload_ips = NULL;
2735         }
2736         if (h->c != NULL) {
2737                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
2738                 h->c = NULL;
2739         }
2740         ctdb_kill(h->ctdb, h->child, SIGKILL);
2741         return 0;
2742 }
2743
2744 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
2745                                          struct tevent_timer *te,
2746                                          struct timeval t, void *private_data)
2747 {
2748         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
2749
2750         talloc_free(h);
2751 }
2752
2753 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
2754                                          struct tevent_fd *fde,
2755                                          uint16_t flags, void *private_data)
2756 {
2757         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
2758
2759         char res;
2760         int ret;
2761
2762         ret = sys_read(h->fd[0], &res, 1);
2763         if (ret < 1 || res != 0) {
2764                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
2765                 res = 1;
2766         }
2767         h->status = res;
2768
2769         talloc_free(h);
2770 }
2771
2772 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
2773 {
2774         TALLOC_CTX *mem_ctx = talloc_new(NULL);
2775         struct ctdb_public_ip_list_old *ips;
2776         struct ctdb_vnn *vnn;
2777         struct client_async_data *async_data;
2778         struct timeval timeout;
2779         TDB_DATA data;
2780         struct ctdb_client_control_state *state;
2781         bool first_add;
2782         int i, ret;
2783
2784         CTDB_NO_MEMORY(ctdb, mem_ctx);
2785
2786         /* Read IPs from local node */
2787         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
2788                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
2789         if (ret != 0) {
2790                 DEBUG(DEBUG_ERR,
2791                       ("Unable to fetch public IPs from local node\n"));
2792                 talloc_free(mem_ctx);
2793                 return -1;
2794         }
2795
2796         /* Read IPs file - this is safe since this is a child process */
2797         ctdb->vnn = NULL;
2798         if (ctdb_set_public_addresses(ctdb, false) != 0) {
2799                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
2800                 talloc_free(mem_ctx);
2801                 return -1;
2802         }
2803
2804         async_data = talloc_zero(mem_ctx, struct client_async_data);
2805         CTDB_NO_MEMORY(ctdb, async_data);
2806
2807         /* Compare IPs between node and file for IPs to be deleted */
2808         for (i = 0; i < ips->num; i++) {
2809                 /* */
2810                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
2811                         if (ctdb_same_ip(&vnn->public_address,
2812                                          &ips->ips[i].addr)) {
2813                                 /* IP is still in file */
2814                                 break;
2815                         }
2816                 }
2817
2818                 if (vnn == NULL) {
2819                         /* Delete IP ips->ips[i] */
2820                         struct ctdb_addr_info_old *pub;
2821
2822                         DEBUG(DEBUG_NOTICE,
2823                               ("IP %s no longer configured, deleting it\n",
2824                                ctdb_addr_to_str(&ips->ips[i].addr)));
2825
2826                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
2827                         CTDB_NO_MEMORY(ctdb, pub);
2828
2829                         pub->addr  = ips->ips[i].addr;
2830                         pub->mask  = 0;
2831                         pub->len   = 0;
2832
2833                         timeout = TAKEOVER_TIMEOUT();
2834
2835                         data.dsize = offsetof(struct ctdb_addr_info_old,
2836                                               iface) + pub->len;
2837                         data.dptr = (uint8_t *)pub;
2838
2839                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
2840                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
2841                                                   0, data, async_data,
2842                                                   &timeout, NULL);
2843                         if (state == NULL) {
2844                                 DEBUG(DEBUG_ERR,
2845                                       (__location__
2846                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
2847                                 goto failed;
2848                         }
2849
2850                         ctdb_client_async_add(async_data, state);
2851                 }
2852         }
2853
2854         /* Compare IPs between node and file for IPs to be added */
2855         first_add = true;
2856         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
2857                 for (i = 0; i < ips->num; i++) {
2858                         if (ctdb_same_ip(&vnn->public_address,
2859                                          &ips->ips[i].addr)) {
2860                                 /* IP already on node */
2861                                 break;
2862                         }
2863                 }
2864                 if (i == ips->num) {
2865                         /* Add IP ips->ips[i] */
2866                         struct ctdb_addr_info_old *pub;
2867                         const char *ifaces = NULL;
2868                         uint32_t len;
2869                         struct vnn_interface *iface = NULL;
2870
2871                         DEBUG(DEBUG_NOTICE,
2872                               ("New IP %s configured, adding it\n",
2873                                ctdb_addr_to_str(&vnn->public_address)));
2874                         if (first_add) {
2875                                 uint32_t pnn = ctdb_get_pnn(ctdb);
2876
2877                                 data.dsize = sizeof(pnn);
2878                                 data.dptr  = (uint8_t *)&pnn;
2879
2880                                 ret = ctdb_client_send_message(
2881                                         ctdb,
2882                                         CTDB_BROADCAST_CONNECTED,
2883                                         CTDB_SRVID_REBALANCE_NODE,
2884                                         data);
2885                                 if (ret != 0) {
2886                                         DEBUG(DEBUG_WARNING,
2887                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
2888                                 }
2889
2890                                 first_add = false;
2891                         }
2892
2893                         ifaces = vnn->ifaces->iface->name;
2894                         iface = vnn->ifaces->next;
2895                         while (iface != NULL) {
2896                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
2897                                                          iface->iface->name);
2898                                 iface = iface->next;
2899                         }
2900
2901                         len   = strlen(ifaces) + 1;
2902                         pub = talloc_zero_size(mem_ctx,
2903                                                offsetof(struct ctdb_addr_info_old, iface) + len);
2904                         CTDB_NO_MEMORY(ctdb, pub);
2905
2906                         pub->addr  = vnn->public_address;
2907                         pub->mask  = vnn->public_netmask_bits;
2908                         pub->len   = len;
2909                         memcpy(&pub->iface[0], ifaces, pub->len);
2910
2911                         timeout = TAKEOVER_TIMEOUT();
2912
2913                         data.dsize = offsetof(struct ctdb_addr_info_old,
2914                                               iface) + pub->len;
2915                         data.dptr = (uint8_t *)pub;
2916
2917                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
2918                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
2919                                                   0, data, async_data,
2920                                                   &timeout, NULL);
2921                         if (state == NULL) {
2922                                 DEBUG(DEBUG_ERR,
2923                                       (__location__
2924                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
2925                                 goto failed;
2926                         }
2927
2928                         ctdb_client_async_add(async_data, state);
2929                 }
2930         }
2931
2932         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2933                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
2934                 goto failed;
2935         }
2936
2937         talloc_free(mem_ctx);
2938         return 0;
2939
2940 failed:
2941         talloc_free(mem_ctx);
2942         return -1;
2943 }
2944
2945 /* This control is sent to force the node to re-read the public addresses file
2946    and drop any addresses we should nnot longer host, and add new addresses
2947    that we are now able to host
2948 */
2949 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
2950 {
2951         struct ctdb_reloadips_handle *h;
2952         pid_t parent = getpid();
2953
2954         if (ctdb->reload_ips != NULL) {
2955                 talloc_free(ctdb->reload_ips);
2956                 ctdb->reload_ips = NULL;
2957         }
2958
2959         h = talloc(ctdb, struct ctdb_reloadips_handle);
2960         CTDB_NO_MEMORY(ctdb, h);
2961         h->ctdb     = ctdb;
2962         h->c        = NULL;
2963         h->status   = -1;
2964         
2965         if (pipe(h->fd) == -1) {
2966                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
2967                 talloc_free(h);
2968                 return -1;
2969         }
2970
2971         h->child = ctdb_fork(ctdb);
2972         if (h->child == (pid_t)-1) {
2973                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
2974                 close(h->fd[0]);
2975                 close(h->fd[1]);
2976                 talloc_free(h);
2977                 return -1;
2978         }
2979
2980         /* child process */
2981         if (h->child == 0) {
2982                 signed char res = 0;
2983
2984                 close(h->fd[0]);
2985
2986                 prctl_set_comment("ctdb_reloadips");
2987                 if (switch_from_server_to_client(ctdb) != 0) {
2988                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
2989                         res = -1;
2990                 } else {
2991                         res = ctdb_reloadips_child(ctdb);
2992                         if (res != 0) {
2993                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
2994                         }
2995                 }
2996
2997                 sys_write(h->fd[1], &res, 1);
2998                 ctdb_wait_for_process_to_exit(parent);
2999                 _exit(0);
3000         }
3001
3002         h->c             = talloc_steal(h, c);
3003
3004         close(h->fd[1]);
3005         set_close_on_exec(h->fd[0]);
3006
3007         talloc_set_destructor(h, ctdb_reloadips_destructor);
3008
3009
3010         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
3011                                ctdb_reloadips_child_handler, (void *)h);
3012         tevent_fd_set_auto_close(h->fde);
3013
3014         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
3015                          ctdb_reloadips_timeout_event, h);
3016
3017         /* we reply later */
3018         *async_reply = true;
3019         return 0;
3020 }