RELOADIPS: simplify the reloadips code a bit
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
32
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT   3
35
/*
  one local network interface that public addresses can be hosted on
 */
struct ctdb_iface {
        /* linkage for the list hanging off ctdb->ifaces */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;

        /* interface name; lookups compare it with strcmp() */
        const char *name;

        /* only interfaces with this flag set are chosen for an address */
        bool link_up;

        /* number of public addresses currently assigned to this
           interface: bumped in ctdb_vnn_assign_iface(), dropped in
           ctdb_vnn_unassign_iface() */
        uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         i->link_up = false;
69
70         DLIST_ADD(ctdb->ifaces, i);
71
72         return 0;
73 }
74
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
76                                           const char *iface)
77 {
78         struct ctdb_iface *i;
79
80         /* Verify that we dont have an entry for this ip yet */
81         for (i=ctdb->ifaces;i;i=i->next) {
82                 if (strcmp(i->name, iface) == 0) {
83                         return i;
84                 }
85         }
86
87         return NULL;
88 }
89
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
91                                               struct ctdb_vnn *vnn)
92 {
93         int i;
94         struct ctdb_iface *cur = NULL;
95         struct ctdb_iface *best = NULL;
96
97         for (i=0; vnn->ifaces[i]; i++) {
98
99                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
100                 if (cur == NULL) {
101                         continue;
102                 }
103
104                 if (!cur->link_up) {
105                         continue;
106                 }
107
108                 if (best == NULL) {
109                         best = cur;
110                         continue;
111                 }
112
113                 if (cur->references < best->references) {
114                         best = cur;
115                         continue;
116                 }
117         }
118
119         return best;
120 }
121
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123                                      struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *best = NULL;
126
127         if (vnn->iface) {
128                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129                                    "still assigned to iface '%s'\n",
130                                    ctdb_addr_to_str(&vnn->public_address),
131                                    ctdb_vnn_iface_string(vnn)));
132                 return 0;
133         }
134
135         best = ctdb_vnn_best_iface(ctdb, vnn);
136         if (best == NULL) {
137                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138                                   "cannot assign to iface any iface\n",
139                                   ctdb_addr_to_str(&vnn->public_address)));
140                 return -1;
141         }
142
143         vnn->iface = best;
144         best->references++;
145         vnn->pnn = ctdb->pnn;
146
147         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148                            "now assigned to iface '%s' refs[%d]\n",
149                            ctdb_addr_to_str(&vnn->public_address),
150                            ctdb_vnn_iface_string(vnn),
151                            best->references));
152         return 0;
153 }
154
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156                                     struct ctdb_vnn *vnn)
157 {
158         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159                            "now unassigned (old iface '%s' refs[%d])\n",
160                            ctdb_addr_to_str(&vnn->public_address),
161                            ctdb_vnn_iface_string(vnn),
162                            vnn->iface?vnn->iface->references:0));
163         if (vnn->iface) {
164                 vnn->iface->references--;
165         }
166         vnn->iface = NULL;
167         if (vnn->pnn == ctdb->pnn) {
168                 vnn->pnn = -1;
169         }
170 }
171
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173                                struct ctdb_vnn *vnn)
174 {
175         int i;
176
177         if (vnn->iface && vnn->iface->link_up) {
178                 return true;
179         }
180
181         for (i=0; vnn->ifaces[i]; i++) {
182                 struct ctdb_iface *cur;
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (cur->link_up) {
190                         return true;
191                 }
192         }
193
194         return false;
195 }
196
/*
  state of one series of gratuitous arp / tcp tickle announcements,
  created by ctdb_announce_vnn_iface() and driven by
  ctdb_control_send_arp()
 */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        /* announcement rounds sent so far; the state is freed once
           this reaches CTDB_ARP_REPEAT */
        uint32_t count;
        /* the public address being announced */
        ctdb_sock_addr addr;
        /* connections to send tickle acks for, taken over from
           vnn->tcp_array; may be NULL */
        struct ctdb_tcp_array *tcparray;
        /* the public address entry this announcement belongs to */
        struct ctdb_vnn *vnn;
};
204
205
206 /*
207   lists of tcp endpoints
208  */
209 struct ctdb_tcp_list {
210         struct ctdb_tcp_list *prev, *next;
211         struct ctdb_tcp_connection connection;
212 };
213
214 /*
215   list of clients to kill on IP release
216  */
217 struct ctdb_client_ip {
218         struct ctdb_client_ip *prev, *next;
219         struct ctdb_context *ctdb;
220         ctdb_sock_addr addr;
221         uint32_t client_id;
222 };
223
224
225 /*
226   send a gratuitous arp
227  */
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
229                                   struct timeval t, void *private_data)
230 {
231         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
232                                                         struct ctdb_takeover_arp);
233         int i, ret;
234         struct ctdb_tcp_array *tcparray;
235         const char *iface = ctdb_vnn_iface_string(arp->vnn);
236
237         ret = ctdb_sys_send_arp(&arp->addr, iface);
238         if (ret != 0) {
239                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240                                   iface, strerror(errno)));
241         }
242
243         tcparray = arp->tcparray;
244         if (tcparray) {
245                 for (i=0;i<tcparray->num;i++) {
246                         struct ctdb_tcp_connection *tcon;
247
248                         tcon = &tcparray->connections[i];
249                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
251                                 ctdb_addr_to_str(&tcon->src_addr),
252                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253                         ret = ctdb_sys_send_tcp(
254                                 &tcon->src_addr, 
255                                 &tcon->dst_addr,
256                                 0, 0, 0);
257                         if (ret != 0) {
258                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259                                         ctdb_addr_to_str(&tcon->src_addr)));
260                         }
261                 }
262         }
263
264         arp->count++;
265
266         if (arp->count == CTDB_ARP_REPEAT) {
267                 talloc_free(arp);
268                 return;
269         }
270
271         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
272                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
273                         ctdb_control_send_arp, arp);
274 }
275
276 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
277                                        struct ctdb_vnn *vnn)
278 {
279         struct ctdb_takeover_arp *arp;
280         struct ctdb_tcp_array *tcparray;
281
282         if (!vnn->takeover_ctx) {
283                 vnn->takeover_ctx = talloc_new(vnn);
284                 if (!vnn->takeover_ctx) {
285                         return -1;
286                 }
287         }
288
289         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
290         if (!arp) {
291                 return -1;
292         }
293
294         arp->ctdb = ctdb;
295         arp->addr = vnn->public_address;
296         arp->vnn  = vnn;
297
298         tcparray = vnn->tcp_array;
299         if (tcparray) {
300                 /* add all of the known tcp connections for this IP to the
301                    list of tcp connections to send tickle acks for */
302                 arp->tcparray = talloc_steal(arp, tcparray);
303
304                 vnn->tcp_array = NULL;
305                 vnn->tcp_update_needed = true;
306         }
307
308         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
309                         timeval_zero(), ctdb_control_send_arp, arp);
310
311         return 0;
312 }
313
/*
  state carried from ctdb_control_release_ip() to release_ip_callback()
 */
struct takeover_callback_state {
        /* the control request that is answered from the callback */
        struct ctdb_req_control *c;
        /* copy of the address being released, allocated on this state */
        ctdb_sock_addr *addr;
        /* the public address entry being released */
        struct ctdb_vnn *vnn;
};
319
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
        /* the control request that is answered from the callback */
        struct ctdb_req_control *c;
        /* the public address entry being taken over */
        struct ctdb_vnn *vnn;
};
324
325 /*
326   called when takeip event finishes
327  */
328 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
329                                     void *private_data)
330 {
331         struct ctdb_do_takeip_state *state =
332                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
333         int32_t ret;
334         TDB_DATA data;
335
336         if (status != 0) {
337                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
338         
339                 if (status == -ETIME) {
340                         ctdb_ban_self(ctdb);
341                 }
342                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
343                                  ctdb_addr_to_str(&state->vnn->public_address),
344                                  ctdb_vnn_iface_string(state->vnn)));
345                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
346
347                 node->flags |= NODE_FLAGS_UNHEALTHY;
348                 talloc_free(state);
349                 return;
350         }
351
352         if (ctdb->do_checkpublicip) {
353
354         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
355         if (ret != 0) {
356                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
357                 talloc_free(state);
358                 return;
359         }
360
361         }
362
363         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
364         data.dsize = strlen((char *)data.dptr) + 1;
365         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
366
367         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
368
369
370         /* the control succeeded */
371         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
372         talloc_free(state);
373         return;
374 }
375
376 /*
377   take over an ip address
378  */
379 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
380                               struct ctdb_req_control *c,
381                               struct ctdb_vnn *vnn)
382 {
383         int ret;
384         struct ctdb_do_takeip_state *state;
385
386         ret = ctdb_vnn_assign_iface(ctdb, vnn);
387         if (ret != 0) {
388                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
389                                  "assign a usable interface\n",
390                                  ctdb_addr_to_str(&vnn->public_address),
391                                  vnn->public_netmask_bits));
392                 return -1;
393         }
394
395         state = talloc(vnn, struct ctdb_do_takeip_state);
396         CTDB_NO_MEMORY(ctdb, state);
397
398         state->c = talloc_steal(ctdb, c);
399         state->vnn   = vnn;
400
401         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
402                             ctdb_addr_to_str(&vnn->public_address),
403                             vnn->public_netmask_bits,
404                             ctdb_vnn_iface_string(vnn)));
405
406         ret = ctdb_event_script_callback(ctdb,
407                                          state,
408                                          ctdb_do_takeip_callback,
409                                          state,
410                                          false,
411                                          CTDB_EVENT_TAKE_IP,
412                                          "%s %s %u",
413                                          ctdb_vnn_iface_string(vnn),
414                                          ctdb_addr_to_str(&vnn->public_address),
415                                          vnn->public_netmask_bits);
416
417         if (ret != 0) {
418                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
419                         ctdb_addr_to_str(&vnn->public_address),
420                         ctdb_vnn_iface_string(vnn)));
421                 talloc_free(state);
422                 return -1;
423         }
424
425         return 0;
426 }
427
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
        /* the control request that is answered from the callback */
        struct ctdb_req_control *c;
        /* interface the address was on before the move; put back if
           the updateip event fails */
        struct ctdb_iface *old;
        /* the public address entry being moved */
        struct ctdb_vnn *vnn;
};
433
434 /*
435   called when updateip event finishes
436  */
437 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
438                                       void *private_data)
439 {
440         struct ctdb_do_updateip_state *state =
441                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
442         int32_t ret;
443
444         if (status != 0) {
445                 if (status == -ETIME) {
446                         ctdb_ban_self(ctdb);
447                 }
448                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
449                         ctdb_addr_to_str(&state->vnn->public_address),
450                         state->old->name,
451                         ctdb_vnn_iface_string(state->vnn)));
452
453                 /*
454                  * All we can do is reset the old interface
455                  * and let the next run fix it
456                  */
457                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
458                 state->vnn->iface = state->old;
459                 state->vnn->iface->references++;
460
461                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
462                 talloc_free(state);
463                 return;
464         }
465
466         if (ctdb->do_checkpublicip) {
467
468         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469         if (ret != 0) {
470                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471                 talloc_free(state);
472                 return;
473         }
474
475         }
476
477         /* the control succeeded */
478         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
479         talloc_free(state);
480         return;
481 }
482
483 /*
484   update (move) an ip address
485  */
486 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
487                                 struct ctdb_req_control *c,
488                                 struct ctdb_vnn *vnn)
489 {
490         int ret;
491         struct ctdb_do_updateip_state *state;
492         struct ctdb_iface *old = vnn->iface;
493         const char *new_name;
494
495         ctdb_vnn_unassign_iface(ctdb, vnn);
496         ret = ctdb_vnn_assign_iface(ctdb, vnn);
497         if (ret != 0) {
498                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
499                                  "assin a usable interface (old iface '%s')\n",
500                                  ctdb_addr_to_str(&vnn->public_address),
501                                  vnn->public_netmask_bits,
502                                  old->name));
503                 return -1;
504         }
505
506         new_name = ctdb_vnn_iface_string(vnn);
507         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
508                 /* A benign update from one interface onto itself.
509                  * no need to run the eventscripts in this case, just return
510                  * success.
511                  */
512                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
513                 return 0;
514         }
515
516         state = talloc(vnn, struct ctdb_do_updateip_state);
517         CTDB_NO_MEMORY(ctdb, state);
518
519         state->c = talloc_steal(ctdb, c);
520         state->old = old;
521         state->vnn = vnn;
522
523         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
524                             "interface %s to %s\n",
525                             ctdb_addr_to_str(&vnn->public_address),
526                             vnn->public_netmask_bits,
527                             old->name,
528                             new_name));
529
530         ret = ctdb_event_script_callback(ctdb,
531                                          state,
532                                          ctdb_do_updateip_callback,
533                                          state,
534                                          false,
535                                          CTDB_EVENT_UPDATE_IP,
536                                          "%s %s %s %u",
537                                          state->old->name,
538                                          new_name,
539                                          ctdb_addr_to_str(&vnn->public_address),
540                                          vnn->public_netmask_bits);
541         if (ret != 0) {
542                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
543                                  ctdb_addr_to_str(&vnn->public_address),
544                                  old->name, new_name));
545                 talloc_free(state);
546                 return -1;
547         }
548
549         return 0;
550 }
551
552 /*
553   Find the vnn of the node that has a public ip address
554   returns -1 if the address is not known as a public address
555  */
556 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
557 {
558         struct ctdb_vnn *vnn;
559
560         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
561                 if (ctdb_same_ip(&vnn->public_address, addr)) {
562                         return vnn;
563                 }
564         }
565
566         return NULL;
567 }
568
569 /*
570   take over an ip address
571  */
572 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
573                                  struct ctdb_req_control *c,
574                                  TDB_DATA indata,
575                                  bool *async_reply)
576 {
577         int ret;
578         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
579         struct ctdb_vnn *vnn;
580         bool have_ip = false;
581         bool do_updateip = false;
582         bool do_takeip = false;
583         struct ctdb_iface *best_iface = NULL;
584
585         if (pip->pnn != ctdb->pnn) {
586                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
587                                  "with pnn %d, but we're node %d\n",
588                                  ctdb_addr_to_str(&pip->addr),
589                                  pip->pnn, ctdb->pnn));
590                 return -1;
591         }
592
593         /* update out vnn list */
594         vnn = find_public_ip_vnn(ctdb, &pip->addr);
595         if (vnn == NULL) {
596                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
597                         ctdb_addr_to_str(&pip->addr)));
598                 return 0;
599         }
600
601         if (ctdb->do_checkpublicip) {
602                 have_ip = ctdb_sys_have_ip(&pip->addr);
603         }
604         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
605         if (best_iface == NULL) {
606                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
607                                  "a usable interface (old %s, have_ip %d)\n",
608                                  ctdb_addr_to_str(&vnn->public_address),
609                                  vnn->public_netmask_bits,
610                                  ctdb_vnn_iface_string(vnn),
611                                  have_ip));
612                 return -1;
613         }
614
615         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
616                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
617                 have_ip = false;
618         }
619
620
621         if (vnn->iface == NULL && have_ip) {
622                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
623                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
624                                  ctdb_addr_to_str(&vnn->public_address)));
625                 return 0;
626         }
627
628         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
629                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
630                                   "and we have it on iface[%s], but it was assigned to node %d"
631                                   "and we are node %d, banning ourself\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
634                 ctdb_ban_self(ctdb);
635                 return -1;
636         }
637
638         if (vnn->pnn == -1 && have_ip) {
639                 vnn->pnn = ctdb->pnn;
640                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
641                                   "and we already have it on iface[%s], update local daemon\n",
642                                  ctdb_addr_to_str(&vnn->public_address),
643                                   ctdb_vnn_iface_string(vnn)));
644                 return 0;
645         }
646
647         if (vnn->iface) {
648                 if (vnn->iface->link_up) {
649                         /* only move when the rebalance gains something */
650                         if (vnn->iface->references > (best_iface->references + 1)) {
651                                 do_updateip = true;
652                         }
653                 } else if (vnn->iface != best_iface) {
654                         do_updateip = true;
655                 }
656         }
657
658         if (!have_ip) {
659                 if (do_updateip) {
660                         ctdb_vnn_unassign_iface(ctdb, vnn);
661                         do_updateip = false;
662                 }
663                 do_takeip = true;
664         }
665
666         if (do_takeip) {
667                 ret = ctdb_do_takeip(ctdb, c, vnn);
668                 if (ret != 0) {
669                         return -1;
670                 }
671         } else if (do_updateip) {
672                 ret = ctdb_do_updateip(ctdb, c, vnn);
673                 if (ret != 0) {
674                         return -1;
675                 }
676         } else {
677                 /*
678                  * The interface is up and the kernel known the ip
679                  * => do nothing
680                  */
681                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
682                         ctdb_addr_to_str(&pip->addr),
683                         vnn->public_netmask_bits,
684                         ctdb_vnn_iface_string(vnn)));
685                 return 0;
686         }
687
688         /* tell ctdb_control.c that we will be replying asynchronously */
689         *async_reply = true;
690
691         return 0;
692 }
693
694 /*
695   takeover an ip address old v4 style
696  */
697 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
698                                 struct ctdb_req_control *c,
699                                 TDB_DATA indata, 
700                                 bool *async_reply)
701 {
702         TDB_DATA data;
703         
704         data.dsize = sizeof(struct ctdb_public_ip);
705         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
706         CTDB_NO_MEMORY(ctdb, data.dptr);
707         
708         memcpy(data.dptr, indata.dptr, indata.dsize);
709         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
710 }
711
712 /*
713   kill any clients that are registered with a IP that is being released
714  */
715 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
716 {
717         struct ctdb_client_ip *ip;
718
719         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
720                 ctdb_addr_to_str(addr)));
721
722         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
723                 ctdb_sock_addr tmp_addr;
724
725                 tmp_addr = ip->addr;
726                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
727                         ip->client_id,
728                         ctdb_addr_to_str(&ip->addr)));
729
730                 if (ctdb_same_ip(&tmp_addr, addr)) {
731                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
732                                                                      ip->client_id, 
733                                                                      struct ctdb_client);
734                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
735                                 ip->client_id,
736                                 ctdb_addr_to_str(&ip->addr),
737                                 client->pid));
738
739                         if (client->pid != 0) {
740                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
741                                         (unsigned)client->pid,
742                                         ctdb_addr_to_str(addr),
743                                         ip->client_id));
744                                 kill(client->pid, SIGKILL);
745                         }
746                 }
747         }
748 }
749
750 /*
751   called when releaseip event finishes
752  */
753 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
754                                 void *private_data)
755 {
756         struct takeover_callback_state *state = 
757                 talloc_get_type(private_data, struct takeover_callback_state);
758         TDB_DATA data;
759
760         if (status == -ETIME) {
761                 ctdb_ban_self(ctdb);
762         }
763
764         /* send a message to all clients of this node telling them
765            that the cluster has been reconfigured and they should
766            release any sockets on this IP */
767         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
768         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
769         data.dsize = strlen((char *)data.dptr)+1;
770
771         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
772
773         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
774
775         /* kill clients that have registered with this IP */
776         release_kill_clients(ctdb, state->addr);
777
778         ctdb_vnn_unassign_iface(ctdb, state->vnn);
779
780         /* the control succeeded */
781         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
782         talloc_free(state);
783 }
784
785 /*
786   release an ip address
787  */
788 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
789                                 struct ctdb_req_control *c,
790                                 TDB_DATA indata, 
791                                 bool *async_reply)
792 {
793         int ret;
794         struct takeover_callback_state *state;
795         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
796         struct ctdb_vnn *vnn;
797
798         /* update our vnn list */
799         vnn = find_public_ip_vnn(ctdb, &pip->addr);
800         if (vnn == NULL) {
801                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
802                         ctdb_addr_to_str(&pip->addr)));
803                 return 0;
804         }
805         vnn->pnn = pip->pnn;
806
807         /* stop any previous arps */
808         talloc_free(vnn->takeover_ctx);
809         vnn->takeover_ctx = NULL;
810
811         if (ctdb->do_checkpublicip) {
812
813                 if (!ctdb_sys_have_ip(&pip->addr)) {
814                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
815                                 ctdb_addr_to_str(&pip->addr),
816                                 vnn->public_netmask_bits,
817                                 ctdb_vnn_iface_string(vnn)));
818                         ctdb_vnn_unassign_iface(ctdb, vnn);
819                         return 0;
820                 }
821
822                 if (vnn->iface == NULL) {
823                         DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
824                                          "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
825                                          ctdb_addr_to_str(&vnn->public_address)));
826                         return 0;
827                 }
828
829         } else if (vnn->iface == NULL) {
830                 DEBUG(DEBUG_ERR, ("No interface found for IP %s.\n",
831                                      ctdb_addr_to_str(&vnn->public_address)));
832                 return 0;
833         }
834
835         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
836                 ctdb_addr_to_str(&pip->addr),
837                 vnn->public_netmask_bits, 
838                 ctdb_vnn_iface_string(vnn),
839                 pip->pnn));
840
841         state = talloc(ctdb, struct takeover_callback_state);
842         CTDB_NO_MEMORY(ctdb, state);
843
844         state->c = talloc_steal(state, c);
845         state->addr = talloc(state, ctdb_sock_addr);       
846         CTDB_NO_MEMORY(ctdb, state->addr);
847         *state->addr = pip->addr;
848         state->vnn   = vnn;
849
850         ret = ctdb_event_script_callback(ctdb, 
851                                          state, release_ip_callback, state,
852                                          false,
853                                          CTDB_EVENT_RELEASE_IP,
854                                          "%s %s %u",
855                                          ctdb_vnn_iface_string(vnn),
856                                          ctdb_addr_to_str(&pip->addr),
857                                          vnn->public_netmask_bits);
858         if (ret != 0) {
859                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
860                         ctdb_addr_to_str(&pip->addr),
861                         ctdb_vnn_iface_string(vnn)));
862                 talloc_free(state);
863                 return -1;
864         }
865
866         /* tell the control that we will be reply asynchronously */
867         *async_reply = true;
868         return 0;
869 }
870
871 /*
872   release an ip address old v4 style
873  */
874 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
875                                 struct ctdb_req_control *c,
876                                 TDB_DATA indata, 
877                                 bool *async_reply)
878 {
879         TDB_DATA data;
880         
881         data.dsize = sizeof(struct ctdb_public_ip);
882         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
883         CTDB_NO_MEMORY(ctdb, data.dptr);
884         
885         memcpy(data.dptr, indata.dptr, indata.dsize);
886         return ctdb_control_release_ip(ctdb, c, data, async_reply);
887 }
888
889
890 static int ctdb_add_public_address(struct ctdb_context *ctdb,
891                                    ctdb_sock_addr *addr,
892                                    unsigned mask, const char *ifaces,
893                                    bool check_address)
894 {
895         struct ctdb_vnn      *vnn;
896         uint32_t num = 0;
897         char *tmp;
898         const char *iface;
899         int i;
900         int ret;
901
902         tmp = strdup(ifaces);
903         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
904                 if (!ctdb_sys_check_iface_exists(iface)) {
905                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
906                         free(tmp);
907                         return -1;
908                 }
909         }
910         free(tmp);
911
912         /* Verify that we dont have an entry for this ip yet */
913         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
914                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
915                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
916                                 ctdb_addr_to_str(addr)));
917                         return -1;
918                 }               
919         }
920
921         /* create a new vnn structure for this ip address */
922         vnn = talloc_zero(ctdb, struct ctdb_vnn);
923         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
924         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
925         tmp = talloc_strdup(vnn, ifaces);
926         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
927         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
928                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
929                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
930                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
931                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
932                 num++;
933         }
934         talloc_free(tmp);
935         vnn->ifaces[num] = NULL;
936         vnn->public_address      = *addr;
937         vnn->public_netmask_bits = mask;
938         vnn->pnn                 = -1;
939         if (check_address) {
940                 if (ctdb_sys_have_ip(addr)) {
941                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
942                         vnn->pnn = ctdb->pnn;
943                 }
944         }
945
946         for (i=0; vnn->ifaces[i]; i++) {
947                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
948                 if (ret != 0) {
949                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
950                                            "for public_address[%s]\n",
951                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
952                         talloc_free(vnn);
953                         return -1;
954                 }
955                 if (i == 0) {
956                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
957                 }
958         }
959
960         DLIST_ADD(ctdb->vnn, vnn);
961
962         return 0;
963 }
964
965 /*
966   setup the event script directory
967 */
968 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
969 {
970         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
971         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
972         return 0;
973 }
974
975 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
976                                   struct timeval t, void *private_data)
977 {
978         struct ctdb_context *ctdb = talloc_get_type(private_data, 
979                                                         struct ctdb_context);
980         struct ctdb_vnn *vnn;
981
982         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
983                 int i;
984
985                 for (i=0; vnn->ifaces[i] != NULL; i++) {
986                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
987                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
988                                         vnn->ifaces[i],
989                                         ctdb_addr_to_str(&vnn->public_address)));
990                         }
991                 }
992         }
993
994         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
995                 timeval_current_ofs(30, 0), 
996                 ctdb_check_interfaces_event, ctdb);
997 }
998
999
1000 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1001 {
1002         if (ctdb->check_public_ifaces_ctx != NULL) {
1003                 talloc_free(ctdb->check_public_ifaces_ctx);
1004                 ctdb->check_public_ifaces_ctx = NULL;
1005         }
1006
1007         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1008         if (ctdb->check_public_ifaces_ctx == NULL) {
1009                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1010         }
1011
1012         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1013                 timeval_current_ofs(30, 0), 
1014                 ctdb_check_interfaces_event, ctdb);
1015
1016         return 0;
1017 }
1018
1019
1020 /*
1021   setup the public address lists from a file
1022 */
1023 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1024 {
1025         char **lines;
1026         int nlines;
1027         int i;
1028
1029         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1030         if (lines == NULL) {
1031                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1032                 return -1;
1033         }
1034         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1035                 nlines--;
1036         }
1037
1038         for (i=0;i<nlines;i++) {
1039                 unsigned mask;
1040                 ctdb_sock_addr addr;
1041                 const char *addrstr;
1042                 const char *ifaces;
1043                 char *tok, *line;
1044
1045                 line = lines[i];
1046                 while ((*line == ' ') || (*line == '\t')) {
1047                         line++;
1048                 }
1049                 if (*line == '#') {
1050                         continue;
1051                 }
1052                 if (strcmp(line, "") == 0) {
1053                         continue;
1054                 }
1055                 tok = strtok(line, " \t");
1056                 addrstr = tok;
1057                 tok = strtok(NULL, " \t");
1058                 if (tok == NULL) {
1059                         if (NULL == ctdb->default_public_interface) {
1060                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1061                                          i+1));
1062                                 talloc_free(lines);
1063                                 return -1;
1064                         }
1065                         ifaces = ctdb->default_public_interface;
1066                 } else {
1067                         ifaces = tok;
1068                 }
1069
1070                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1071                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1072                         talloc_free(lines);
1073                         return -1;
1074                 }
1075                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1076                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1077                         talloc_free(lines);
1078                         return -1;
1079                 }
1080         }
1081
1082
1083         talloc_free(lines);
1084         return 0;
1085 }
1086
1087 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1088                               const char *iface,
1089                               const char *ip)
1090 {
1091         struct ctdb_vnn *svnn;
1092         struct ctdb_iface *cur = NULL;
1093         bool ok;
1094         int ret;
1095
1096         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1097         CTDB_NO_MEMORY(ctdb, svnn);
1098
1099         svnn->ifaces = talloc_array(svnn, const char *, 2);
1100         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1101         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1102         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1103         svnn->ifaces[1] = NULL;
1104
1105         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1106         if (!ok) {
1107                 talloc_free(svnn);
1108                 return -1;
1109         }
1110
1111         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1112         if (ret != 0) {
1113                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1114                                    "for single_ip[%s]\n",
1115                                    svnn->ifaces[0],
1116                                    ctdb_addr_to_str(&svnn->public_address)));
1117                 talloc_free(svnn);
1118                 return -1;
1119         }
1120
1121         /* assume the single public ip interface is initially "good" */
1122         cur = ctdb_find_iface(ctdb, iface);
1123         if (cur == NULL) {
1124                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1125                 return -1;
1126         }
1127         cur->link_up = true;
1128
1129         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1130         if (ret != 0) {
1131                 talloc_free(svnn);
1132                 return -1;
1133         }
1134
1135         ctdb->single_ip_vnn = svnn;
1136         return 0;
1137 }
1138
1139 /* Given a physical node, return the number of
1140    public addresses that is currently assigned to this node.
1141 */
1142 static int node_ip_coverage(struct ctdb_context *ctdb, 
1143         int32_t pnn,
1144         struct ctdb_public_ip_list *ips)
1145 {
1146         int num=0;
1147
1148         for (;ips;ips=ips->next) {
1149                 if (ips->pnn == pnn) {
1150                         num++;
1151                 }
1152         }
1153         return num;
1154 }
1155
1156
1157 /* Check if this is a public ip known to the node, i.e. can that
1158    node takeover this ip ?
1159 */
1160 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1161                 struct ctdb_public_ip_list *ip)
1162 {
1163         struct ctdb_all_public_ips *public_ips;
1164         int i;
1165
1166         public_ips = ctdb->nodes[pnn]->available_public_ips;
1167
1168         if (public_ips == NULL) {
1169                 return -1;
1170         }
1171
1172         for (i=0;i<public_ips->num;i++) {
1173                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1174                         /* yes, this node can serve this public ip */
1175                         return 0;
1176                 }
1177         }
1178
1179         return -1;
1180 }
1181
1182
1183 /* search the node lists list for a node to takeover this ip.
1184    pick the node that currently are serving the least number of ips
1185    so that the ips get spread out evenly.
1186 */
1187 static int find_takeover_node(struct ctdb_context *ctdb, 
1188                 struct ctdb_node_map *nodemap, uint32_t mask, 
1189                 struct ctdb_public_ip_list *ip,
1190                 struct ctdb_public_ip_list *all_ips)
1191 {
1192         int pnn, min=0, num;
1193         int i;
1194
1195         pnn    = -1;
1196         for (i=0;i<nodemap->num;i++) {
1197                 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1198                         /* This node is not allowed to takeover any addresses
1199                         */
1200                         continue;
1201                 }
1202
1203                 if (nodemap->nodes[i].flags & mask) {
1204                         /* This node is not healty and can not be used to serve
1205                            a public address 
1206                         */
1207                         continue;
1208                 }
1209
1210                 /* verify that this node can serve this ip */
1211                 if (can_node_serve_ip(ctdb, i, ip)) {
1212                         /* no it couldnt   so skip to the next node */
1213                         continue;
1214                 }
1215
1216                 num = node_ip_coverage(ctdb, i, all_ips);
1217                 /* was this the first node we checked ? */
1218                 if (pnn == -1) {
1219                         pnn = i;
1220                         min  = num;
1221                 } else {
1222                         if (num < min) {
1223                                 pnn = i;
1224                                 min  = num;
1225                         }
1226                 }
1227         }       
1228         if (pnn == -1) {
1229                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1230                         ctdb_addr_to_str(&ip->addr)));
1231
1232                 return -1;
1233         }
1234
1235         ip->pnn = pnn;
1236         return 0;
1237 }
1238
1239 #define IP_KEYLEN       4
1240 static uint32_t *ip_key(ctdb_sock_addr *ip)
1241 {
1242         static uint32_t key[IP_KEYLEN];
1243
1244         bzero(key, sizeof(key));
1245
1246         switch (ip->sa.sa_family) {
1247         case AF_INET:
1248                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1249                 break;
1250         case AF_INET6: {
1251                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1252                 key[0]  = htonl(s6_a32[0]);
1253                 key[1]  = htonl(s6_a32[1]);
1254                 key[2]  = htonl(s6_a32[2]);
1255                 key[3]  = htonl(s6_a32[3]);
1256                 break;
1257         }
1258         default:
1259                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1260                 return key;
1261         }
1262
1263         return key;
1264 }
1265
1266 static void *add_ip_callback(void *parm, void *data)
1267 {
1268         struct ctdb_public_ip_list *this_ip = parm; 
1269         struct ctdb_public_ip_list *prev_ip = data; 
1270
1271         if (prev_ip == NULL) {
1272                 return parm;
1273         }
1274         if (this_ip->pnn == -1) {
1275                 this_ip->pnn = prev_ip->pnn;
1276         }
1277
1278         return parm;
1279 }
1280
1281 static int getips_count_callback(void *param, void *data)
1282 {
1283         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1284         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1285
1286         new_ip->next = *ip_list;
1287         *ip_list     = new_ip;
1288         return 0;
1289 }
1290
1291 static struct ctdb_public_ip_list *
1292 create_merged_ip_list(struct ctdb_context *ctdb)
1293 {
1294         int i, j;
1295         struct ctdb_public_ip_list *ip_list;
1296         struct ctdb_all_public_ips *public_ips;
1297
1298         if (ctdb->ip_tree != NULL) {
1299                 talloc_free(ctdb->ip_tree);
1300                 ctdb->ip_tree = NULL;
1301         }
1302         ctdb->ip_tree = trbt_create(ctdb, 0);
1303
1304         for (i=0;i<ctdb->num_nodes;i++) {
1305                 public_ips = ctdb->nodes[i]->known_public_ips;
1306
1307                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1308                         continue;
1309                 }
1310
1311                 /* there were no public ips for this node */
1312                 if (public_ips == NULL) {
1313                         continue;
1314                 }               
1315
1316                 for (j=0;j<public_ips->num;j++) {
1317                         struct ctdb_public_ip_list *tmp_ip; 
1318
1319                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1320                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1321                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1322                         tmp_ip->addr = public_ips->ips[j].addr;
1323                         tmp_ip->next = NULL;
1324
1325                         trbt_insertarray32_callback(ctdb->ip_tree,
1326                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1327                                 add_ip_callback,
1328                                 tmp_ip);
1329                 }
1330         }
1331
1332         ip_list = NULL;
1333         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1334
1335         return ip_list;
1336 }
1337
1338 /* 
1339  * This is the length of the longtest common prefix between the IPs.
1340  * It is calculated by XOR-ing the 2 IPs together and counting the
1341  * number of leading zeroes.  The implementation means that all
1342  * addresses end up being 128 bits long.
1343  *
1344  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1345  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1346  * lots of nodes and IP addresses?
1347  */
1348 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1349 {
1350         uint32_t ip1_k[IP_KEYLEN];
1351         uint32_t *t;
1352         int i;
1353         uint32_t x;
1354
1355         uint32_t distance = 0;
1356
1357         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1358         t = ip_key(ip2);
1359         for (i=0; i<IP_KEYLEN; i++) {
1360                 x = ip1_k[i] ^ t[i];
1361                 if (x == 0) {
1362                         distance += 32;
1363                 } else {
1364                         /* Count number of leading zeroes. 
1365                          * FIXME? This could be optimised...
1366                          */
1367                         while ((x & (1 << 31)) == 0) {
1368                                 x <<= 1;
1369                                 distance += 1;
1370                         }
1371                 }
1372         }
1373
1374         return distance;
1375 }
1376
1377 /* Calculate the IP distance for the given IP relative to IPs on the
1378    given node.  The ips argument is generally the all_ips variable
1379    used in the main part of the algorithm.
1380  */
1381 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1382                                   struct ctdb_public_ip_list *ips,
1383                                   int pnn)
1384 {
1385         struct ctdb_public_ip_list *t;
1386         uint32_t d;
1387
1388         uint32_t sum = 0;
1389
1390         for (t=ips; t != NULL; t=t->next) {
1391                 if (t->pnn != pnn) {
1392                         continue;
1393                 }
1394
1395                 /* Optimisation: We never calculate the distance
1396                  * between an address and itself.  This allows us to
1397                  * calculate the effect of removing an address from a
1398                  * node by simply calculating the distance between
1399                  * that address and all of the exitsing addresses.
1400                  * Moreover, we assume that we're only ever dealing
1401                  * with addresses from all_ips so we can identify an
1402                  * address via a pointer rather than doing a more
1403                  * expensive address comparison. */
1404                 if (&(t->addr) == ip) {
1405                         continue;
1406                 }
1407
1408                 d = ip_distance(ip, &(t->addr));
1409                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1410         }
1411
1412         return sum;
1413 }
1414
1415 /* Return the LCP2 imbalance metric for addresses currently assigned
1416    to the given node.
1417  */
1418 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1419 {
1420         struct ctdb_public_ip_list *t;
1421
1422         uint32_t imbalance = 0;
1423
1424         for (t=all_ips; t!=NULL; t=t->next) {
1425                 if (t->pnn != pnn) {
1426                         continue;
1427                 }
1428                 /* Pass the rest of the IPs rather than the whole
1429                    all_ips input list.
1430                 */
1431                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1432         }
1433
1434         return imbalance;
1435 }
1436
1437 /* Allocate any unassigned IPs just by looping through the IPs and
1438  * finding the best node for each.
1439  */
1440 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1441                                       struct ctdb_node_map *nodemap,
1442                                       uint32_t mask,
1443                                       struct ctdb_public_ip_list *all_ips)
1444 {
1445         struct ctdb_public_ip_list *tmp_ip;
1446
1447         /* loop over all ip's and find a physical node to cover for 
1448            each unassigned ip.
1449         */
1450         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1451                 if (tmp_ip->pnn == -1) {
1452                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1453                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1454                                         ctdb_addr_to_str(&tmp_ip->addr)));
1455                         }
1456                 }
1457         }
1458 }
1459
1460 /* Basic non-deterministic rebalancing algorithm.
1461  */
1462 static bool basic_failback(struct ctdb_context *ctdb,
1463                            struct ctdb_node_map *nodemap,
1464                            uint32_t mask,
1465                            struct ctdb_public_ip_list *all_ips,
1466                            int num_ips,
1467                            int *retries)
1468 {
1469         int i;
1470         int maxnode, maxnum=0, minnode, minnum=0, num;
1471         struct ctdb_public_ip_list *tmp_ip;
1472
1473         /* for each ip address, loop over all nodes that can serve
1474            this ip and make sure that the difference between the node
1475            serving the most and the node serving the least ip's are
1476            not greater than 1.
1477         */
1478         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1479                 if (tmp_ip->pnn == -1) {
1480                         continue;
1481                 }
1482
1483                 /* Get the highest and lowest number of ips's served by any 
1484                    valid node which can serve this ip.
1485                 */
1486                 maxnode = -1;
1487                 minnode = -1;
1488                 for (i=0;i<nodemap->num;i++) {
1489                         if (nodemap->nodes[i].flags & mask) {
1490                                 continue;
1491                         }
1492
1493                         /* Only check nodes that are allowed to takeover an ip */
1494                         if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1495                                 continue;
1496                         }
1497
1498                         /* only check nodes that can actually serve this ip */
1499                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1500                                 /* no it couldnt   so skip to the next node */
1501                                 continue;
1502                         }
1503
1504                         num = node_ip_coverage(ctdb, i, all_ips);
1505                         if (maxnode == -1) {
1506                                 maxnode = i;
1507                                 maxnum  = num;
1508                         } else {
1509                                 if (num > maxnum) {
1510                                         maxnode = i;
1511                                         maxnum  = num;
1512                                 }
1513                         }
1514                         if (minnode == -1) {
1515                                 minnode = i;
1516                                 minnum  = num;
1517                         } else {
1518                                 if (num < minnum) {
1519                                         minnode = i;
1520                                         minnum  = num;
1521                                 }
1522                         }
1523                 }
1524                 if (maxnode == -1) {
1525                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1526                                 ctdb_addr_to_str(&tmp_ip->addr)));
1527
1528                         continue;
1529                 }
1530
1531                 /* If we want deterministic IPs then dont try to reallocate 
1532                    them to spread out the load.
1533                 */
1534                 if (1 == ctdb->tunable.deterministic_public_ips) {
1535                         continue;
1536                 }
1537
1538                 /* if the spread between the smallest and largest coverage by
1539                    a node is >=2 we steal one of the ips from the node with
1540                    most coverage to even things out a bit.
1541                    try to do this a limited number of times since we dont
1542                    want to spend too much time balancing the ip coverage.
1543                 */
1544                 if ( (maxnum > minnum+1)
1545                      && (*retries < (num_ips + 5)) ){
1546                         struct ctdb_public_ip_list *tmp;
1547
1548                         /* mark one of maxnode's vnn's as unassigned and try
1549                            again
1550                         */
1551                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1552                                 if (tmp->pnn == maxnode) {
1553                                         tmp->pnn = -1;
1554                                         (*retries)++;
1555                                         return true;
1556                                 }
1557                         }
1558                 }
1559         }
1560
1561         return false;
1562 }
1563
/* One entry in the list of nodes for which a rebalance is forced */
struct ctdb_rebalancenodes {
        struct ctdb_rebalancenodes *next;  /* next entry, NULL at the end */
        uint32_t pnn;                      /* node to be rebalanced */
};

/* Head of the list of nodes waiting for a forced rebalance; empty
   (NULL) at start-up, as every object with static storage is. */
static struct ctdb_rebalancenodes *force_rebalance_list;
1569
1570
1571 /* set this flag to force the node to be rebalanced even if it just didnt
1572    become healthy again.
1573 */
1574 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1575 {
1576         struct ctdb_rebalancenodes *rebalance;
1577
1578         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1579                 if (rebalance->pnn == pnn) {
1580                         return;
1581                 }
1582         }
1583
1584         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1585         rebalance->pnn = pnn;
1586         rebalance->next = force_rebalance_list;
1587         force_rebalance_list = rebalance;
1588 }
1589
1590 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1591  * that we can unit test it.
1592  */
1593 static void lcp2_init(struct ctdb_context * tmp_ctx,
1594                struct ctdb_node_map * nodemap,
1595                uint32_t mask,
1596                struct ctdb_public_ip_list *all_ips,
1597                uint32_t **lcp2_imbalances,
1598                bool **newly_healthy)
1599 {
1600         int i;
1601         struct ctdb_public_ip_list *tmp_ip;
1602
1603         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1604         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1605         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1606         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1607
1608         for (i=0;i<nodemap->num;i++) {
1609                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1610                 /* First step: is the node "healthy"? */
1611                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1612         }
1613
1614         /* 2nd step: if a ndoe has IPs assigned then it must have been
1615          * healthy before, so we remove it from consideration... */
1616         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1617                 if (tmp_ip->pnn != -1) {
1618                         (*newly_healthy)[tmp_ip->pnn] = false;
1619                 }
1620         }
1621
1622         /* 3rd step: if a node is forced to re-balance then
1623            we allow failback onto the node */
1624         while (force_rebalance_list != NULL) {
1625                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1626
1627                 if (force_rebalance_list->pnn <= nodemap->num) {
1628                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1629                 }
1630
1631                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1632                 talloc_free(force_rebalance_list);
1633                 force_rebalance_list = next;
1634         }
1635 }
1636
1637 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1638  * the IP/node combination that will cost the least.
1639  */
1640 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1641                               struct ctdb_node_map *nodemap,
1642                               uint32_t mask,
1643                               struct ctdb_public_ip_list *all_ips,
1644                               uint32_t *lcp2_imbalances)
1645 {
1646         struct ctdb_public_ip_list *tmp_ip;
1647         int dstnode;
1648
1649         int minnode;
1650         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1651         struct ctdb_public_ip_list *minip;
1652
1653         bool should_loop = true;
1654         bool have_unassigned = true;
1655
1656         while (have_unassigned && should_loop) {
1657                 should_loop = false;
1658
1659                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1660                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1661
1662                 minnode = -1;
1663                 mindsum = 0;
1664                 minip = NULL;
1665
1666                 /* loop over each unassigned ip. */
1667                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1668                         if (tmp_ip->pnn != -1) {
1669                                 continue;
1670                         }
1671
1672                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1673                                 /* Only check nodes that are allowed to takeover an ip */
1674                                 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1675                                         continue;
1676                                 }
1677
1678                                 /* only check nodes that can actually serve this ip */
1679                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1680                                         /* no it couldnt   so skip to the next node */
1681                                         continue;
1682                                 }
1683                                 if (nodemap->nodes[dstnode].flags & mask) {
1684                                         continue;
1685                                 }
1686
1687                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1688                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1689                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1690                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1691                                                    dstnode,
1692                                                    dstimbl - lcp2_imbalances[dstnode]));
1693
1694
1695                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1696                                         minnode = dstnode;
1697                                         minimbl = dstimbl;
1698                                         mindsum = dstdsum;
1699                                         minip = tmp_ip;
1700                                         should_loop = true;
1701                                 }
1702                         }
1703                 }
1704
1705                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1706
1707                 /* If we found one then assign it to the given node. */
1708                 if (minnode != -1) {
1709                         minip->pnn = minnode;
1710                         lcp2_imbalances[minnode] = minimbl;
1711                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1712                                           ctdb_addr_to_str(&(minip->addr)),
1713                                           minnode,
1714                                           mindsum));
1715                 }
1716
1717                 /* There might be a better way but at least this is clear. */
1718                 have_unassigned = false;
1719                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1720                         if (tmp_ip->pnn == -1) {
1721                                 have_unassigned = true;
1722                         }
1723                 }
1724         }
1725
1726         /* We know if we have an unassigned addresses so we might as
1727          * well optimise.
1728          */
1729         if (have_unassigned) {
1730                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1731                         if (tmp_ip->pnn == -1) {
1732                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1733                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1734                         }
1735                 }
1736         }
1737 }
1738
1739 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1740  * to move IPs from, determines the best IP/destination node
1741  * combination to move from the source node.
1742  */
1743 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1744                                     struct ctdb_node_map *nodemap,
1745                                     struct ctdb_public_ip_list *all_ips,
1746                                     int srcnode,
1747                                     uint32_t candimbl,
1748                                     uint32_t *lcp2_imbalances,
1749                                     bool *newly_healthy)
1750 {
1751         int dstnode, mindstnode;
1752         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1753         uint32_t minsrcimbl, mindstimbl;
1754         struct ctdb_public_ip_list *minip;
1755         struct ctdb_public_ip_list *tmp_ip;
1756
1757         /* Find an IP and destination node that best reduces imbalance. */
1758         minip = NULL;
1759         minsrcimbl = 0;
1760         mindstnode = -1;
1761         mindstimbl = 0;
1762
1763         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1764         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1765
1766         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1767                 /* Only consider addresses on srcnode. */
1768                 if (tmp_ip->pnn != srcnode) {
1769                         continue;
1770                 }
1771
1772                 /* What is this IP address costing the source node? */
1773                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1774                 srcimbl = candimbl - srcdsum;
1775
1776                 /* Consider this IP address would cost each potential
1777                  * destination node.  Destination nodes are limited to
1778                  * those that are newly healthy, since we don't want
1779                  * to do gratuitous failover of IPs just to make minor
1780                  * balance improvements.
1781                  */
1782                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1783                         if (! newly_healthy[dstnode]) {
1784                                 continue;
1785                         }
1786
1787                         /* Only check nodes that are allowed to takeover an ip */
1788                         if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1789                                 continue;
1790                         }
1791
1792                         /* only check nodes that can actually serve this ip */
1793                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1794                                 /* no it couldnt   so skip to the next node */
1795                                 continue;
1796                         }
1797
1798                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1799                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1800                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1801                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1802                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1803                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1804
1805                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1806                             ((mindstnode == -1) ||                              \
1807                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1808
1809                                 minip = tmp_ip;
1810                                 minsrcimbl = srcimbl;
1811                                 mindstnode = dstnode;
1812                                 mindstimbl = dstimbl;
1813                         }
1814                 }
1815         }
1816         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1817
1818         if (mindstnode != -1) {
1819                 /* We found a move that makes things better... */
1820                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1821                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1822                                   ctdb_addr_to_str(&(minip->addr)),
1823                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1824
1825
1826                 lcp2_imbalances[srcnode] = srcimbl;
1827                 lcp2_imbalances[mindstnode] = mindstimbl;
1828                 minip->pnn = mindstnode;
1829
1830                 return true;
1831         }
1832
1833         return false;
1834         
1835 }
1836
/* Pairs a node number with its imbalance so both travel together
 * through a sort.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders entries
 * by descending imbalance: returns -1 when a has the larger imbalance,
 * 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        return (rhs->imbalance > lhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
1855
1856 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1857  * node with the highest LCP2 imbalance, and then determines the best
1858  * IP/destination node combination to move from the source node.
1859  */
1860 static bool lcp2_failback(struct ctdb_context *ctdb,
1861                           struct ctdb_node_map *nodemap,
1862                           uint32_t mask,
1863                           struct ctdb_public_ip_list *all_ips,
1864                           uint32_t *lcp2_imbalances,
1865                           bool *newly_healthy)
1866 {
1867         int i, num_newly_healthy;
1868         struct lcp2_imbalance_pnn * lips;
1869         bool ret;
1870
1871         /* It is only worth continuing if we have suitable target
1872          * nodes to transfer IPs to.  This check is much cheaper than
1873          * continuing on...
1874          */
1875         num_newly_healthy = 0;
1876         for (i = 0; i < nodemap->num; i++) {
1877                 if (newly_healthy[i]) {
1878                         num_newly_healthy++;
1879                 }
1880         }
1881         if (num_newly_healthy == 0) {
1882                 return false;
1883         }
1884
1885         /* Put the imbalances and nodes into an array, sort them and
1886          * iterate through candidates.  Usually the 1st one will be
1887          * used, so this doesn't cost much...
1888          */
1889         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
1890         for (i = 0; i < nodemap->num; i++) {
1891                 lips[i].imbalance = lcp2_imbalances[i];
1892                 lips[i].pnn = i;
1893         }
1894         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
1895               lcp2_cmp_imbalance_pnn);
1896
1897         ret = false;
1898         for (i = 0; i < nodemap->num; i++) {
1899                 /* This means that all nodes had 0 or 1 addresses, so
1900                  * can't be imbalanced.
1901                  */
1902                 if (lips[i].imbalance == 0) {
1903                         break;
1904                 }
1905
1906                 if (lcp2_failback_candidate(ctdb,
1907                                             nodemap,
1908                                             all_ips,
1909                                             lips[i].pnn,
1910                                             lips[i].imbalance,
1911                                             lcp2_imbalances,
1912                                             newly_healthy)) {
1913                         ret = true;
1914                         break;
1915                 }
1916         }
1917
1918         talloc_free(lips);
1919         return ret;
1920 }
1921
1922 /* The calculation part of the IP allocation algorithm. */
1923 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1924                                    struct ctdb_node_map *nodemap,
1925                                    struct ctdb_public_ip_list **all_ips_p)
1926 {
1927         int i, num_healthy, retries, num_ips;
1928         uint32_t mask;
1929         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1930         uint32_t *lcp2_imbalances;
1931         bool *newly_healthy;
1932
1933         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1934
1935         /* Count how many completely healthy nodes we have */
1936         num_healthy = 0;
1937         for (i=0;i<nodemap->num;i++) {
1938                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1939                         num_healthy++;
1940                 }
1941         }
1942
1943         if (num_healthy > 0) {
1944                 /* We have healthy nodes, so only consider them for 
1945                    serving public addresses
1946                 */
1947                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1948         } else {
1949                 /* We didnt have any completely healthy nodes so
1950                    use "disabled" nodes as a fallback
1951                 */
1952                 mask = NODE_FLAGS_INACTIVE;
1953         }
1954
1955         /* since nodes only know about those public addresses that
1956            can be served by that particular node, no single node has
1957            a full list of all public addresses that exist in the cluster.
1958            Walk over all node structures and create a merged list of
1959            all public addresses that exist in the cluster.
1960
1961            keep the tree of ips around as ctdb->ip_tree
1962         */
1963         all_ips = create_merged_ip_list(ctdb);
1964         *all_ips_p = all_ips; /* minimal code changes */
1965
1966         /* Count how many ips we have */
1967         num_ips = 0;
1968         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1969                 num_ips++;
1970         }
1971
1972         /* If we want deterministic ip allocations, i.e. that the ip addresses
1973            will always be allocated the same way for a specific set of
1974            available/unavailable nodes.
1975         */
1976         if (1 == ctdb->tunable.deterministic_public_ips) {              
1977                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1978                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1979                         tmp_ip->pnn = i%nodemap->num;
1980                 }
1981         }
1982
1983
1984         /* mark all public addresses with a masked node as being served by
1985            node -1
1986         */
1987         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1988                 if (tmp_ip->pnn == -1) {
1989                         continue;
1990                 }
1991                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1992                         tmp_ip->pnn = -1;
1993                 }
1994         }
1995
1996         /* verify that the assigned nodes can serve that public ip
1997            and set it to -1 if not
1998         */
1999         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2000                 if (tmp_ip->pnn == -1) {
2001                         continue;
2002                 }
2003                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2004                         /* this node can not serve this ip. */
2005                         tmp_ip->pnn = -1;
2006                 }
2007         }
2008
2009         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2010                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2011         }
2012
2013         /* now we must redistribute all public addresses with takeover node
2014            -1 among the nodes available
2015         */
2016         retries = 0;
2017 try_again:
2018         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2019                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2020         } else {
2021                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2022         }
2023
2024         /* If we dont want ips to fail back after a node becomes healthy
2025            again, we wont even try to reallocat the ip addresses so that
2026            they are evenly spread out.
2027            This can NOT be used at the same time as DeterministicIPs !
2028         */
2029         if (1 == ctdb->tunable.no_ip_failback) {
2030                 if (1 == ctdb->tunable.deterministic_public_ips) {
2031                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
2032                 }
2033                 goto finished;
2034         }
2035
2036
2037         /* now, try to make sure the ip adresses are evenly distributed
2038            across the node.
2039         */
2040         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2041                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
2042                         goto try_again;
2043                 }
2044         } else {
2045                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
2046                         goto try_again;
2047                 }
2048         }
2049
2050         /* finished distributing the public addresses, now just send the 
2051            info out to the nodes */
2052 finished:
2053         /* at this point ->pnn is the node which will own each IP
2054            or -1 if there is no node that can cover this ip
2055         */
2056
2057         return;
2058 }
2059
2060 static void noiptakeover_cb(struct ctdb_context *ctdb, uint32_t pnn, int32_t res, TDB_DATA outdata, void *callback)
2061 {
2062         struct ctdb_node_map *nodemap = (struct ctdb_node_map *)callback;
2063
2064         if (res != 0) {
2065                 DEBUG(DEBUG_ERR,("Failure to read NoIPTakeover tunable from remote node %d\n", pnn));
2066                 return;
2067         }
2068
2069         if (outdata.dsize != sizeof(uint32_t)) {
2070                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading NoIPTakeover tunable from node %d. Expected %d bytes but received %d bytes\n", pnn, (int)sizeof(uint32_t), (int)outdata.dsize));
2071                 return;
2072         }
2073
2074         if (pnn >= nodemap->num) {
2075                 DEBUG(DEBUG_ERR,("Got NoIPTakeover reply from node %d but nodemap only has %d entries\n", pnn, nodemap->num));
2076                 return;
2077         }
2078
2079         if (*(uint32_t *)outdata.dptr != 0) {
2080                 nodemap->nodes[pnn].flags |= NODE_FLAGS_NOIPTAKEOVER;
2081         }
2082 }
2083
2084 /*
2085   make any IP alias changes for public addresses that are necessary 
2086  */
2087 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2088 {
2089         int i;
2090         struct ctdb_public_ip ip;
2091         struct ctdb_public_ipv4 ipv4;
2092         struct ctdb_control_get_tunable *t;
2093         uint32_t *nodes;
2094         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2095         TDB_DATA data;
2096         struct timeval timeout;
2097         struct client_async_data *async_data;
2098         struct ctdb_client_control_state *state;
2099         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2100
2101         /*
2102          * ip failover is completely disabled, just send out the 
2103          * ipreallocated event.
2104          */
2105         if (ctdb->tunable.disable_ip_failover != 0) {
2106                 goto ipreallocated;
2107         }
2108
2109
2110         /* assume all nodes do support failback */
2111         for (i=0;i<nodemap->num;i++) {
2112                 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
2113         }
2114         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen("NoIPTakeover") + 1;
2115         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2116         t = (struct ctdb_control_get_tunable *)data.dptr;
2117         t->length = strlen("NoIPTakeover")+1;
2118         memcpy(t->name, "NoIPTakeover", t->length);
2119         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2120         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2121                                       nodes, 0, TAKEOVER_TIMEOUT(),
2122                                       false, data,
2123                                       noiptakeover_cb, NULL,
2124                                       nodemap) != 0) {
2125                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get noiptakeover tunable failed\n"));
2126         }
2127         talloc_free(nodes);
2128         talloc_free(data.dptr);
2129
2130
2131         ZERO_STRUCT(ip);
2132
2133         /* Do the IP reassignment calculations */
2134         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2135
2136         /* now tell all nodes to delete any alias that they should not
2137            have.  This will be a NOOP on nodes that don't currently
2138            hold the given alias */
2139         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2140         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2141
2142         for (i=0;i<nodemap->num;i++) {
2143                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2144                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2145                         continue;
2146                 }
2147
2148                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2149                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2150                                 /* This node should be serving this
2151                                    vnn so dont tell it to release the ip
2152                                 */
2153                                 continue;
2154                         }
2155                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2156                                 ipv4.pnn = tmp_ip->pnn;
2157                                 ipv4.sin = tmp_ip->addr.ip;
2158
2159                                 timeout = TAKEOVER_TIMEOUT();
2160                                 data.dsize = sizeof(ipv4);
2161                                 data.dptr  = (uint8_t *)&ipv4;
2162                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2163                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2164                                                 data, async_data,
2165                                                 &timeout, NULL);
2166                         } else {
2167                                 ip.pnn  = tmp_ip->pnn;
2168                                 ip.addr = tmp_ip->addr;
2169
2170                                 timeout = TAKEOVER_TIMEOUT();
2171                                 data.dsize = sizeof(ip);
2172                                 data.dptr  = (uint8_t *)&ip;
2173                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2174                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2175                                                 data, async_data,
2176                                                 &timeout, NULL);
2177                         }
2178
2179                         if (state == NULL) {
2180                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2181                                 talloc_free(tmp_ctx);
2182                                 return -1;
2183                         }
2184                 
2185                         ctdb_client_async_add(async_data, state);
2186                 }
2187         }
2188         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2189                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2190                 talloc_free(tmp_ctx);
2191                 return -1;
2192         }
2193         talloc_free(async_data);
2194
2195
2196         /* tell all nodes to get their own IPs */
2197         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2198         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2199         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2200                 if (tmp_ip->pnn == -1) {
2201                         /* this IP won't be taken over */
2202                         continue;
2203                 }
2204
2205                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2206                         ipv4.pnn = tmp_ip->pnn;
2207                         ipv4.sin = tmp_ip->addr.ip;
2208
2209                         timeout = TAKEOVER_TIMEOUT();
2210                         data.dsize = sizeof(ipv4);
2211                         data.dptr  = (uint8_t *)&ipv4;
2212                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2213                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2214                                         data, async_data,
2215                                         &timeout, NULL);
2216                 } else {
2217                         ip.pnn  = tmp_ip->pnn;
2218                         ip.addr = tmp_ip->addr;
2219
2220                         timeout = TAKEOVER_TIMEOUT();
2221                         data.dsize = sizeof(ip);
2222                         data.dptr  = (uint8_t *)&ip;
2223                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2224                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2225                                         data, async_data,
2226                                         &timeout, NULL);
2227                 }
2228                 if (state == NULL) {
2229                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2230                         talloc_free(tmp_ctx);
2231                         return -1;
2232                 }
2233                 
2234                 ctdb_client_async_add(async_data, state);
2235         }
2236         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2237                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2238                 talloc_free(tmp_ctx);
2239                 return -1;
2240         }
2241
2242 ipreallocated:
2243         /* tell all nodes to update natwg */
2244         /* send the flags update natgw on all connected nodes */
2245         data.dptr  = discard_const("ipreallocated");
2246         data.dsize = strlen((char *)data.dptr) + 1; 
2247         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2248         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2249                                       nodes, 0, TAKEOVER_TIMEOUT(),
2250                                       false, data,
2251                                       NULL, NULL,
2252                                       NULL) != 0) {
2253                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2254         }
2255
2256         talloc_free(tmp_ctx);
2257         return 0;
2258 }
2259
2260
2261 /*
2262   destroy a ctdb_client_ip structure
2263  */
2264 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2265 {
2266         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2267                 ctdb_addr_to_str(&ip->addr),
2268                 ntohs(ip->addr.ip.sin_port),
2269                 ip->client_id));
2270
2271         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2272         return 0;
2273 }
2274
2275 /*
2276   called by a client to inform us of a TCP connection that it is managing
2277   that should be tickled with an ACK when IP takeover is done
2278   we handle both the old ipv4 style of packets as well as the new ipv4/6
2279   pdus.
2280  */
2281 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2282                                 TDB_DATA indata)
2283 {
2284         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2285         struct ctdb_control_tcp *old_addr = NULL;
2286         struct ctdb_control_tcp_addr new_addr;
2287         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2288         struct ctdb_tcp_list *tcp;
2289         struct ctdb_tcp_connection t;
2290         int ret;
2291         TDB_DATA data;
2292         struct ctdb_client_ip *ip;
2293         struct ctdb_vnn *vnn;
2294         ctdb_sock_addr addr;
2295
2296         switch (indata.dsize) {
2297         case sizeof(struct ctdb_control_tcp):
2298                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2299                 ZERO_STRUCT(new_addr);
2300                 tcp_sock = &new_addr;
2301                 tcp_sock->src.ip  = old_addr->src;
2302                 tcp_sock->dest.ip = old_addr->dest;
2303                 break;
2304         case sizeof(struct ctdb_control_tcp_addr):
2305                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2306                 break;
2307         default:
2308                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2309                                  "to ctdb_control_tcp_client. size was %d but "
2310                                  "only allowed sizes are %lu and %lu\n",
2311                                  (int)indata.dsize,
2312                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2313                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2314                 return -1;
2315         }
2316
2317         addr = tcp_sock->src;
2318         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2319         addr = tcp_sock->dest;
2320         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2321
2322         ZERO_STRUCT(addr);
2323         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2324         vnn = find_public_ip_vnn(ctdb, &addr);
2325         if (vnn == NULL) {
2326                 switch (addr.sa.sa_family) {
2327                 case AF_INET:
2328                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2329                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2330                                         ctdb_addr_to_str(&addr)));
2331                         }
2332                         break;
2333                 case AF_INET6:
2334                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2335                                 ctdb_addr_to_str(&addr)));
2336                         break;
2337                 default:
2338                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2339                 }
2340
2341                 return 0;
2342         }
2343
2344         if (vnn->pnn != ctdb->pnn) {
2345                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2346                         ctdb_addr_to_str(&addr),
2347                         client_id, client->pid));
2348                 /* failing this call will tell smbd to die */
2349                 return -1;
2350         }
2351
2352         ip = talloc(client, struct ctdb_client_ip);
2353         CTDB_NO_MEMORY(ctdb, ip);
2354
2355         ip->ctdb      = ctdb;
2356         ip->addr      = addr;
2357         ip->client_id = client_id;
2358         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2359         DLIST_ADD(ctdb->client_ip_list, ip);
2360
2361         tcp = talloc(client, struct ctdb_tcp_list);
2362         CTDB_NO_MEMORY(ctdb, tcp);
2363
2364         tcp->connection.src_addr = tcp_sock->src;
2365         tcp->connection.dst_addr = tcp_sock->dest;
2366
2367         DLIST_ADD(client->tcp_list, tcp);
2368
2369         t.src_addr = tcp_sock->src;
2370         t.dst_addr = tcp_sock->dest;
2371
2372         data.dptr = (uint8_t *)&t;
2373         data.dsize = sizeof(t);
2374
2375         switch (addr.sa.sa_family) {
2376         case AF_INET:
2377                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2378                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2379                         ctdb_addr_to_str(&tcp_sock->src),
2380                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2381                 break;
2382         case AF_INET6:
2383                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2384                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2385                         ctdb_addr_to_str(&tcp_sock->src),
2386                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2387                 break;
2388         default:
2389                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2390         }
2391
2392
2393         /* tell all nodes about this tcp connection */
2394         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2395                                        CTDB_CONTROL_TCP_ADD,
2396                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2397         if (ret != 0) {
2398                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2399                 return -1;
2400         }
2401
2402         return 0;
2403 }
2404
2405 /*
2406   find a tcp address on a list
2407  */
2408 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2409                                            struct ctdb_tcp_connection *tcp)
2410 {
2411         int i;
2412
2413         if (array == NULL) {
2414                 return NULL;
2415         }
2416
2417         for (i=0;i<array->num;i++) {
2418                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2419                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2420                         return &array->connections[i];
2421                 }
2422         }
2423         return NULL;
2424 }
2425
2426
2427
2428 /*
2429   called by a daemon to inform us of a TCP connection that one of its
2430   clients managing that should tickled with an ACK when IP takeover is
2431   done
2432  */
2433 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2434 {
2435         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2436         struct ctdb_tcp_array *tcparray;
2437         struct ctdb_tcp_connection tcp;
2438         struct ctdb_vnn *vnn;
2439
2440         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2441         if (vnn == NULL) {
2442                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2443                         ctdb_addr_to_str(&p->dst_addr)));
2444
2445                 return -1;
2446         }
2447
2448
2449         tcparray = vnn->tcp_array;
2450
2451         /* If this is the first tickle */
2452         if (tcparray == NULL) {
2453                 tcparray = talloc_size(ctdb->nodes, 
2454                         offsetof(struct ctdb_tcp_array, connections) +
2455                         sizeof(struct ctdb_tcp_connection) * 1);
2456                 CTDB_NO_MEMORY(ctdb, tcparray);
2457                 vnn->tcp_array = tcparray;
2458
2459                 tcparray->num = 0;
2460                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2461                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2462
2463                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2464                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2465                 tcparray->num++;
2466
2467                 if (tcp_update_needed) {
2468                         vnn->tcp_update_needed = true;
2469                 }
2470                 return 0;
2471         }
2472
2473
2474         /* Do we already have this tickle ?*/
2475         tcp.src_addr = p->src_addr;
2476         tcp.dst_addr = p->dst_addr;
2477         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2478                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2479                         ctdb_addr_to_str(&tcp.dst_addr),
2480                         ntohs(tcp.dst_addr.ip.sin_port),
2481                         vnn->pnn));
2482                 return 0;
2483         }
2484
2485         /* A new tickle, we must add it to the array */
2486         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2487                                         struct ctdb_tcp_connection,
2488                                         tcparray->num+1);
2489         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2490
2491         vnn->tcp_array = tcparray;
2492         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2493         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2494         tcparray->num++;
2495                                 
2496         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2497                 ctdb_addr_to_str(&tcp.dst_addr),
2498                 ntohs(tcp.dst_addr.ip.sin_port),
2499                 vnn->pnn));
2500
2501         if (tcp_update_needed) {
2502                 vnn->tcp_update_needed = true;
2503         }
2504
2505         return 0;
2506 }
2507
2508
2509 /*
2510   called by a daemon to inform us of a TCP connection that one of its
2511   clients managing that should tickled with an ACK when IP takeover is
2512   done
2513  */
2514 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2515 {
2516         struct ctdb_tcp_connection *tcpp;
2517         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2518
2519         if (vnn == NULL) {
2520                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2521                         ctdb_addr_to_str(&conn->dst_addr)));
2522                 return;
2523         }
2524
2525         /* if the array is empty we cant remove it
2526            and we dont need to do anything
2527          */
2528         if (vnn->tcp_array == NULL) {
2529                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2530                         ctdb_addr_to_str(&conn->dst_addr),
2531                         ntohs(conn->dst_addr.ip.sin_port)));
2532                 return;
2533         }
2534
2535
2536         /* See if we know this connection
2537            if we dont know this connection  then we dont need to do anything
2538          */
2539         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2540         if (tcpp == NULL) {
2541                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2542                         ctdb_addr_to_str(&conn->dst_addr),
2543                         ntohs(conn->dst_addr.ip.sin_port)));
2544                 return;
2545         }
2546
2547
2548         /* We need to remove this entry from the array.
2549            Instead of allocating a new array and copying data to it
2550            we cheat and just copy the last entry in the existing array
2551            to the entry that is to be removed and just shring the 
2552            ->num field
2553          */
2554         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2555         vnn->tcp_array->num--;
2556
2557         /* If we deleted the last entry we also need to remove the entire array
2558          */
2559         if (vnn->tcp_array->num == 0) {
2560                 talloc_free(vnn->tcp_array);
2561                 vnn->tcp_array = NULL;
2562         }               
2563
2564         vnn->tcp_update_needed = true;
2565
2566         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2567                 ctdb_addr_to_str(&conn->src_addr),
2568                 ntohs(conn->src_addr.ip.sin_port)));
2569 }
2570
2571
2572 /*
2573   called by a daemon to inform us of a TCP connection that one of its
2574   clients used are no longer needed in the tickle database
2575  */
2576 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2577 {
2578         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2579
2580         ctdb_remove_tcp_connection(ctdb, conn);
2581
2582         return 0;
2583 }
2584
2585
/*
  called when a daemon restarts.  The intention is to send all tickles
  for all public addresses we are serving to the new node immediately;
  that is not implemented yet, so this is currently a placeholder that
  ignores both arguments and always returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2595
2596
2597 /*
2598   called when a client structure goes away - hook to remove
2599   elements from the tcp_list in all daemons
2600  */
2601 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2602 {
2603         while (client->tcp_list) {
2604                 struct ctdb_tcp_list *tcp = client->tcp_list;
2605                 DLIST_REMOVE(client->tcp_list, tcp);
2606                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2607         }
2608 }
2609
2610
2611 /*
2612   release all IPs on shutdown
2613  */
2614 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2615 {
2616         struct ctdb_vnn *vnn;
2617
2618         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2619                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2620                         ctdb_vnn_unassign_iface(ctdb, vnn);
2621                         continue;
2622                 }
2623                 if (!vnn->iface) {
2624                         continue;
2625                 }
2626                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2627                                   ctdb_vnn_iface_string(vnn),
2628                                   ctdb_addr_to_str(&vnn->public_address),
2629                                   vnn->public_netmask_bits);
2630                 release_kill_clients(ctdb, &vnn->public_address);
2631                 ctdb_vnn_unassign_iface(ctdb, vnn);
2632         }
2633 }
2634
2635
2636 /*
2637   get list of public IPs
2638  */
2639 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2640                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2641 {
2642         int i, num, len;
2643         struct ctdb_all_public_ips *ips;
2644         struct ctdb_vnn *vnn;
2645         bool only_available = false;
2646
2647         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2648                 only_available = true;
2649         }
2650
2651         /* count how many public ip structures we have */
2652         num = 0;
2653         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2654                 num++;
2655         }
2656
2657         len = offsetof(struct ctdb_all_public_ips, ips) + 
2658                 num*sizeof(struct ctdb_public_ip);
2659         ips = talloc_zero_size(outdata, len);
2660         CTDB_NO_MEMORY(ctdb, ips);
2661
2662         i = 0;
2663         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2664                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2665                         continue;
2666                 }
2667                 ips->ips[i].pnn  = vnn->pnn;
2668                 ips->ips[i].addr = vnn->public_address;
2669                 i++;
2670         }
2671         ips->num = i;
2672         len = offsetof(struct ctdb_all_public_ips, ips) +
2673                 i*sizeof(struct ctdb_public_ip);
2674
2675         outdata->dsize = len;
2676         outdata->dptr  = (uint8_t *)ips;
2677
2678         return 0;
2679 }
2680
2681
2682 /*
2683   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2684  */
2685 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2686                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2687 {
2688         int i, num, len;
2689         struct ctdb_all_public_ipsv4 *ips;
2690         struct ctdb_vnn *vnn;
2691
2692         /* count how many public ip structures we have */
2693         num = 0;
2694         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2695                 if (vnn->public_address.sa.sa_family != AF_INET) {
2696                         continue;
2697                 }
2698                 num++;
2699         }
2700
2701         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2702                 num*sizeof(struct ctdb_public_ipv4);
2703         ips = talloc_zero_size(outdata, len);
2704         CTDB_NO_MEMORY(ctdb, ips);
2705
2706         outdata->dsize = len;
2707         outdata->dptr  = (uint8_t *)ips;
2708
2709         ips->num = num;
2710         i = 0;
2711         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2712                 if (vnn->public_address.sa.sa_family != AF_INET) {
2713                         continue;
2714                 }
2715                 ips->ips[i].pnn = vnn->pnn;
2716                 ips->ips[i].sin = vnn->public_address.ip;
2717                 i++;
2718         }
2719
2720         return 0;
2721 }
2722
2723 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2724                                         struct ctdb_req_control *c,
2725                                         TDB_DATA indata,
2726                                         TDB_DATA *outdata)
2727 {
2728         int i, num, len;
2729         ctdb_sock_addr *addr;
2730         struct ctdb_control_public_ip_info *info;
2731         struct ctdb_vnn *vnn;
2732
2733         addr = (ctdb_sock_addr *)indata.dptr;
2734
2735         vnn = find_public_ip_vnn(ctdb, addr);
2736         if (vnn == NULL) {
2737                 /* if it is not a public ip   it could be our 'single ip' */
2738                 if (ctdb->single_ip_vnn) {
2739                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2740                                 vnn = ctdb->single_ip_vnn;
2741                         }
2742                 }
2743         }
2744         if (vnn == NULL) {
2745                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2746                                  "'%s'not a public address\n",
2747                                  ctdb_addr_to_str(addr)));
2748                 return -1;
2749         }
2750
2751         /* count how many public ip structures we have */
2752         num = 0;
2753         for (;vnn->ifaces[num];) {
2754                 num++;
2755         }
2756
2757         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2758                 num*sizeof(struct ctdb_control_iface_info);
2759         info = talloc_zero_size(outdata, len);
2760         CTDB_NO_MEMORY(ctdb, info);
2761
2762         info->ip.addr = vnn->public_address;
2763         info->ip.pnn = vnn->pnn;
2764         info->active_idx = 0xFFFFFFFF;
2765
2766         for (i=0; vnn->ifaces[i]; i++) {
2767                 struct ctdb_iface *cur;
2768
2769                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2770                 if (cur == NULL) {
2771                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2772                                            vnn->ifaces[i]));
2773                         return -1;
2774                 }
2775                 if (vnn->iface == cur) {
2776                         info->active_idx = i;
2777                 }
2778                 strcpy(info->ifaces[i].name, cur->name);
2779                 info->ifaces[i].link_state = cur->link_up;
2780                 info->ifaces[i].references = cur->references;
2781         }
2782         info->num = i;
2783         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2784                 i*sizeof(struct ctdb_control_iface_info);
2785
2786         outdata->dsize = len;
2787         outdata->dptr  = (uint8_t *)info;
2788
2789         return 0;
2790 }
2791
2792 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2793                                 struct ctdb_req_control *c,
2794                                 TDB_DATA *outdata)
2795 {
2796         int i, num, len;
2797         struct ctdb_control_get_ifaces *ifaces;
2798         struct ctdb_iface *cur;
2799
2800         /* count how many public ip structures we have */
2801         num = 0;
2802         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2803                 num++;
2804         }
2805
2806         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2807                 num*sizeof(struct ctdb_control_iface_info);
2808         ifaces = talloc_zero_size(outdata, len);
2809         CTDB_NO_MEMORY(ctdb, ifaces);
2810
2811         i = 0;
2812         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2813                 strcpy(ifaces->ifaces[i].name, cur->name);
2814                 ifaces->ifaces[i].link_state = cur->link_up;
2815                 ifaces->ifaces[i].references = cur->references;
2816                 i++;
2817         }
2818         ifaces->num = i;
2819         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2820                 i*sizeof(struct ctdb_control_iface_info);
2821
2822         outdata->dsize = len;
2823         outdata->dptr  = (uint8_t *)ifaces;
2824
2825         return 0;
2826 }
2827
2828 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2829                                     struct ctdb_req_control *c,
2830                                     TDB_DATA indata)
2831 {
2832         struct ctdb_control_iface_info *info;
2833         struct ctdb_iface *iface;
2834         bool link_up = false;
2835
2836         info = (struct ctdb_control_iface_info *)indata.dptr;
2837
2838         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2839                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2840                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2841                                   len, len, info->name));
2842                 return -1;
2843         }
2844
2845         switch (info->link_state) {
2846         case 0:
2847                 link_up = false;
2848                 break;
2849         case 1:
2850                 link_up = true;
2851                 break;
2852         default:
2853                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2854                                   (unsigned int)info->link_state));
2855                 return -1;
2856         }
2857
2858         if (info->references != 0) {
2859                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2860                                   (unsigned int)info->references));
2861                 return -1;
2862         }
2863
2864         iface = ctdb_find_iface(ctdb, info->name);
2865         if (iface == NULL) {
2866                 return -1;
2867         }
2868
2869         if (link_up == iface->link_up) {
2870                 return 0;
2871         }
2872
2873         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2874               ("iface[%s] has changed it's link status %s => %s\n",
2875                iface->name,
2876                iface->link_up?"up":"down",
2877                link_up?"up":"down"));
2878
2879         iface->link_up = link_up;
2880         return 0;
2881 }
2882
2883
2884 /* 
2885    structure containing the listening socket and the list of tcp connections
2886    that the ctdb daemon is to kill
2887 */
2888 struct ctdb_kill_tcp {
2889         struct ctdb_vnn *vnn;
2890         struct ctdb_context *ctdb;
2891         int capture_fd;
2892         struct fd_event *fde;
2893         trbt_tree_t *connections;
2894         void *private_data;
2895 };
2896
2897 /*
2898   a tcp connection that is to be killed
2899  */
2900 struct ctdb_killtcp_con {
2901         ctdb_sock_addr src_addr;
2902         ctdb_sock_addr dst_addr;
2903         int count;
2904         struct ctdb_kill_tcp *killtcp;
2905 };
2906
2907 /* this function is used to create a key to represent this socketpair
2908    in the killtcp tree.
2909    this key is used to insert and lookup matching socketpairs that are
2910    to be tickled and RST
2911 */
2912 #define KILLTCP_KEYLEN  10
2913 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2914 {
2915         static uint32_t key[KILLTCP_KEYLEN];
2916
2917         bzero(key, sizeof(key));
2918
2919         if (src->sa.sa_family != dst->sa.sa_family) {
2920                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2921                 return key;
2922         }
2923         
2924         switch (src->sa.sa_family) {
2925         case AF_INET:
2926                 key[0]  = dst->ip.sin_addr.s_addr;
2927                 key[1]  = src->ip.sin_addr.s_addr;
2928                 key[2]  = dst->ip.sin_port;
2929                 key[3]  = src->ip.sin_port;
2930                 break;
2931         case AF_INET6: {
2932                 uint32_t *dst6_addr32 =
2933                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2934                 uint32_t *src6_addr32 =
2935                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2936                 key[0]  = dst6_addr32[3];
2937                 key[1]  = src6_addr32[3];
2938                 key[2]  = dst6_addr32[2];
2939                 key[3]  = src6_addr32[2];
2940                 key[4]  = dst6_addr32[1];
2941                 key[5]  = src6_addr32[1];
2942                 key[6]  = dst6_addr32[0];
2943                 key[7]  = src6_addr32[0];
2944                 key[8]  = dst->ip6.sin6_port;
2945                 key[9]  = src->ip6.sin6_port;
2946                 break;
2947         }
2948         default:
2949                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2950                 return key;
2951         }
2952
2953         return key;
2954 }
2955
2956 /*
2957   called when we get a read event on the raw socket
2958  */
2959 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2960                                 uint16_t flags, void *private_data)
2961 {
2962         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2963         struct ctdb_killtcp_con *con;
2964         ctdb_sock_addr src, dst;
2965         uint32_t ack_seq, seq;
2966
2967         if (!(flags & EVENT_FD_READ)) {
2968                 return;
2969         }
2970
2971         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2972                                 killtcp->private_data,
2973                                 &src, &dst,
2974                                 &ack_seq, &seq) != 0) {
2975                 /* probably a non-tcp ACK packet */
2976                 return;
2977         }
2978
2979         /* check if we have this guy in our list of connections
2980            to kill
2981         */
2982         con = trbt_lookuparray32(killtcp->connections, 
2983                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2984         if (con == NULL) {
2985                 /* no this was some other packet we can just ignore */
2986                 return;
2987         }
2988
2989         /* This one has been tickled !
2990            now reset him and remove him from the list.
2991          */
2992         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2993                 ntohs(con->dst_addr.ip.sin_port),
2994                 ctdb_addr_to_str(&con->src_addr),
2995                 ntohs(con->src_addr.ip.sin_port)));
2996
2997         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2998         talloc_free(con);
2999 }
3000
3001
3002 /* when traversing the list of all tcp connections to send tickle acks to
3003    (so that we can capture the ack coming back and kill the connection
3004     by a RST)
3005    this callback is called for each connection we are currently trying to kill
3006 */
3007 static int tickle_connection_traverse(void *param, void *data)
3008 {
3009         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3010
3011         /* have tried too many times, just give up */
3012         if (con->count >= 5) {
3013                 /* can't delete in traverse: reparent to delete_cons */
3014                 talloc_steal(param, con);
3015                 return 0;
3016         }
3017
3018         /* othervise, try tickling it again */
3019         con->count++;
3020         ctdb_sys_send_tcp(
3021                 (ctdb_sock_addr *)&con->dst_addr,
3022                 (ctdb_sock_addr *)&con->src_addr,
3023                 0, 0, 0);
3024         return 0;
3025 }
3026
3027
3028 /* 
3029    called every second until all sentenced connections have been reset
3030  */
3031 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3032                                               struct timeval t, void *private_data)
3033 {
3034         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3035         void *delete_cons = talloc_new(NULL);
3036
3037         /* loop over all connections sending tickle ACKs */
3038         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3039
3040         /* now we've finished traverse, it's safe to do deletion. */
3041         talloc_free(delete_cons);
3042
3043         /* If there are no more connections to kill we can remove the
3044            entire killtcp structure
3045          */
3046         if ( (killtcp->connections == NULL) || 
3047              (killtcp->connections->root == NULL) ) {
3048                 talloc_free(killtcp);
3049                 return;
3050         }
3051
3052         /* try tickling them again in a seconds time
3053          */
3054         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3055                         ctdb_tickle_sentenced_connections, killtcp);
3056 }
3057
3058 /*
3059   destroy the killtcp structure
3060  */
3061 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3062 {
3063         struct ctdb_vnn *tmpvnn;
3064
3065         /* verify that this vnn is still active */
3066         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3067                 if (tmpvnn == killtcp->vnn) {
3068                         break;
3069                 }
3070         }
3071
3072         if (tmpvnn == NULL) {
3073                 return 0;
3074         }
3075
3076         if (killtcp->vnn->killtcp != killtcp) {
3077                 return 0;
3078         }
3079
3080         killtcp->vnn->killtcp = NULL;
3081
3082         return 0;
3083 }
3084
3085
/* insert callback for the killtcp tree: nothing fancy here, the new
   connection structure ('parm') unconditionally replaces whatever was
   stored before ('data').

   the old one is not even freed if it did exist, that one is
   talloc_stolen by the same node in the tree anyway and will be
   deleted when the new data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;

        return replacement;
}
3097
3098 /*
3099   add a tcp socket to the list of connections we want to RST
3100  */
3101 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3102                                        ctdb_sock_addr *s,
3103                                        ctdb_sock_addr *d)
3104 {
3105         ctdb_sock_addr src, dst;
3106         struct ctdb_kill_tcp *killtcp;
3107         struct ctdb_killtcp_con *con;
3108         struct ctdb_vnn *vnn;
3109
3110         ctdb_canonicalize_ip(s, &src);
3111         ctdb_canonicalize_ip(d, &dst);
3112
3113         vnn = find_public_ip_vnn(ctdb, &dst);
3114         if (vnn == NULL) {
3115                 vnn = find_public_ip_vnn(ctdb, &src);
3116         }
3117         if (vnn == NULL) {
3118                 /* if it is not a public ip   it could be our 'single ip' */
3119                 if (ctdb->single_ip_vnn) {
3120                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3121                                 vnn = ctdb->single_ip_vnn;
3122                         }
3123                 }
3124         }
3125         if (vnn == NULL) {
3126                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3127                 return -1;
3128         }
3129
3130         killtcp = vnn->killtcp;
3131         
3132         /* If this is the first connection to kill we must allocate
3133            a new structure
3134          */
3135         if (killtcp == NULL) {
3136                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3137                 CTDB_NO_MEMORY(ctdb, killtcp);
3138
3139                 killtcp->vnn         = vnn;
3140                 killtcp->ctdb        = ctdb;
3141                 killtcp->capture_fd  = -1;
3142                 killtcp->connections = trbt_create(killtcp, 0);
3143
3144                 vnn->killtcp         = killtcp;
3145                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3146         }
3147
3148
3149
3150         /* create a structure that describes this connection we want to
3151            RST and store it in killtcp->connections
3152         */
3153         con = talloc(killtcp, struct ctdb_killtcp_con);
3154         CTDB_NO_MEMORY(ctdb, con);
3155         con->src_addr = src;
3156         con->dst_addr = dst;
3157         con->count    = 0;
3158         con->killtcp  = killtcp;
3159
3160
3161         trbt_insertarray32_callback(killtcp->connections,
3162                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3163                         add_killtcp_callback, con);
3164
3165         /* 
3166            If we dont have a socket to listen on yet we must create it
3167          */
3168         if (killtcp->capture_fd == -1) {
3169                 const char *iface = ctdb_vnn_iface_string(vnn);
3170                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3171                 if (killtcp->capture_fd == -1) {
3172                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3173                                           "socket on iface '%s' for killtcp (%s)\n",
3174                                           iface, strerror(errno)));
3175                         goto failed;
3176                 }
3177         }
3178
3179
3180         if (killtcp->fde == NULL) {
3181                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3182                                             EVENT_FD_READ,
3183                                             capture_tcp_handler, killtcp);
3184                 tevent_fd_set_auto_close(killtcp->fde);
3185
3186                 /* We also need to set up some events to tickle all these connections
3187                    until they are all reset
3188                 */
3189                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3190                                 ctdb_tickle_sentenced_connections, killtcp);
3191         }
3192
3193         /* tickle him once now */
3194         ctdb_sys_send_tcp(
3195                 &con->dst_addr,
3196                 &con->src_addr,
3197                 0, 0, 0);
3198
3199         return 0;
3200
3201 failed:
3202         talloc_free(vnn->killtcp);
3203         vnn->killtcp = NULL;
3204         return -1;
3205 }
3206
3207 /*
3208   kill a TCP connection.
3209  */
3210 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3211 {
3212         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3213
3214         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3215 }
3216
3217 /*
3218   called by a daemon to inform us of the entire list of TCP tickles for
3219   a particular public address.
3220   this control should only be sent by the node that is currently serving
3221   that public address.
3222  */
3223 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3224 {
3225         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3226         struct ctdb_tcp_array *tcparray;
3227         struct ctdb_vnn *vnn;
3228
3229         /* We must at least have tickles.num or else we cant verify the size
3230            of the received data blob
3231          */
3232         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3233                                         tickles.connections)) {
3234                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3235                 return -1;
3236         }
3237
3238         /* verify that the size of data matches what we expect */
3239         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3240                                 tickles.connections)
3241                          + sizeof(struct ctdb_tcp_connection)
3242                                  * list->tickles.num) {
3243                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3244                 return -1;
3245         }       
3246
3247         vnn = find_public_ip_vnn(ctdb, &list->addr);
3248         if (vnn == NULL) {
3249                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3250                         ctdb_addr_to_str(&list->addr)));
3251
3252                 return 1;
3253         }
3254
3255         /* remove any old ticklelist we might have */
3256         talloc_free(vnn->tcp_array);
3257         vnn->tcp_array = NULL;
3258
3259         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3260         CTDB_NO_MEMORY(ctdb, tcparray);
3261
3262         tcparray->num = list->tickles.num;
3263
3264         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3265         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3266
3267         memcpy(tcparray->connections, &list->tickles.connections[0], 
3268                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3269
3270         /* We now have a new fresh tickle list array for this vnn */
3271         vnn->tcp_array = talloc_steal(vnn, tcparray);
3272         
3273         return 0;
3274 }
3275
3276 /*
3277   called to return the full list of tickles for the puclic address associated 
3278   with the provided vnn
3279  */
3280 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3281 {
3282         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3283         struct ctdb_control_tcp_tickle_list *list;
3284         struct ctdb_tcp_array *tcparray;
3285         int num;
3286         struct ctdb_vnn *vnn;
3287
3288         vnn = find_public_ip_vnn(ctdb, addr);
3289         if (vnn == NULL) {
3290                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3291                         ctdb_addr_to_str(addr)));
3292
3293                 return 1;
3294         }
3295
3296         tcparray = vnn->tcp_array;
3297         if (tcparray) {
3298                 num = tcparray->num;
3299         } else {
3300                 num = 0;
3301         }
3302
3303         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3304                                 tickles.connections)
3305                         + sizeof(struct ctdb_tcp_connection) * num;
3306
3307         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3308         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3309         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3310
3311         list->addr = *addr;
3312         list->tickles.num = num;
3313         if (num) {
3314                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3315                         sizeof(struct ctdb_tcp_connection) * num);
3316         }
3317
3318         return 0;
3319 }
3320
3321
3322 /*
3323   set the list of all tcp tickles for a public address
3324  */
3325 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3326                               struct timeval timeout, uint32_t destnode, 
3327                               ctdb_sock_addr *addr,
3328                               struct ctdb_tcp_array *tcparray)
3329 {
3330         int ret, num;
3331         TDB_DATA data;
3332         struct ctdb_control_tcp_tickle_list *list;
3333
3334         if (tcparray) {
3335                 num = tcparray->num;
3336         } else {
3337                 num = 0;
3338         }
3339
3340         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3341                                 tickles.connections) +
3342                         sizeof(struct ctdb_tcp_connection) * num;
3343         data.dptr = talloc_size(ctdb, data.dsize);
3344         CTDB_NO_MEMORY(ctdb, data.dptr);
3345
3346         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3347         list->addr = *addr;
3348         list->tickles.num = num;
3349         if (tcparray) {
3350                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3351         }
3352
3353         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3354                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3355                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3356         if (ret != 0) {
3357                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3358                 return -1;
3359         }
3360
3361         talloc_free(data.dptr);
3362
3363         return ret;
3364 }
3365
3366
3367 /*
3368   perform tickle updates if required
3369  */
3370 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3371                                 struct timed_event *te, 
3372                                 struct timeval t, void *private_data)
3373 {
3374         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3375         int ret;
3376         struct ctdb_vnn *vnn;
3377
3378         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3379                 /* we only send out updates for public addresses that 
3380                    we have taken over
3381                  */
3382                 if (ctdb->pnn != vnn->pnn) {
3383                         continue;
3384                 }
3385                 /* We only send out the updates if we need to */
3386                 if (!vnn->tcp_update_needed) {
3387                         continue;
3388                 }
3389                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3390                                 TAKEOVER_TIMEOUT(),
3391                                 CTDB_BROADCAST_CONNECTED,
3392                                 &vnn->public_address,
3393                                 vnn->tcp_array);
3394                 if (ret != 0) {
3395                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3396                                 ctdb_addr_to_str(&vnn->public_address)));
3397                 }
3398         }
3399
3400         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3401                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3402                              ctdb_update_tcp_tickles, ctdb);
3403 }               
3404         
3405
3406 /*
3407   start periodic update of tcp tickles
3408  */
3409 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3410 {
3411         ctdb->tickle_update_context = talloc_new(ctdb);
3412
3413         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3414                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3415                              ctdb_update_tcp_tickles, ctdb);
3416 }
3417
3418
3419
3420
3421 struct control_gratious_arp {
3422         struct ctdb_context *ctdb;
3423         ctdb_sock_addr addr;
3424         const char *iface;
3425         int count;
3426 };
3427
3428 /*
3429   send a control_gratuitous arp
3430  */
3431 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3432                                   struct timeval t, void *private_data)
3433 {
3434         int ret;
3435         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3436                                                         struct control_gratious_arp);
3437
3438         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3439         if (ret != 0) {
3440                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3441                                  arp->iface, strerror(errno)));
3442         }
3443
3444
3445         arp->count++;
3446         if (arp->count == CTDB_ARP_REPEAT) {
3447                 talloc_free(arp);
3448                 return;
3449         }
3450
3451         event_add_timed(arp->ctdb->ev, arp, 
3452                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3453                         send_gratious_arp, arp);
3454 }
3455
3456
3457 /*
3458   send a gratious arp 
3459  */
3460 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3461 {
3462         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3463         struct control_gratious_arp *arp;
3464
3465         /* verify the size of indata */
3466         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3467                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3468                                  (unsigned)indata.dsize, 
3469                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3470                 return -1;
3471         }
3472         if (indata.dsize != 
3473                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3474                 + gratious_arp->len ) ){
3475
3476                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3477                         "but should be %u bytes\n", 
3478                          (unsigned)indata.dsize, 
3479                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3480                 return -1;
3481         }
3482
3483
3484         arp = talloc(ctdb, struct control_gratious_arp);
3485         CTDB_NO_MEMORY(ctdb, arp);
3486
3487         arp->ctdb  = ctdb;
3488         arp->addr   = gratious_arp->addr;
3489         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3490         CTDB_NO_MEMORY(ctdb, arp->iface);
3491         arp->count = 0;
3492         
3493         event_add_timed(arp->ctdb->ev, arp, 
3494                         timeval_zero(), send_gratious_arp, arp);
3495
3496         return 0;
3497 }
3498
3499 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3500 {
3501         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3502         int ret;
3503
3504         /* verify the size of indata */
3505         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3506                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3507                 return -1;
3508         }
3509         if (indata.dsize != 
3510                 ( offsetof(struct ctdb_control_ip_iface, iface)
3511                 + pub->len ) ){
3512
3513                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3514                         "but should be %u bytes\n", 
3515                          (unsigned)indata.dsize, 
3516                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3517                 return -1;
3518         }
3519
3520         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3521
3522         if (ret != 0) {
3523                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3524                 return -1;
3525         }
3526
3527         return 0;
3528 }
3529
3530 /*
3531   called when releaseip event finishes for del_public_address
3532  */
3533 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
3534                                 void *private_data)
3535 {
3536         talloc_free(private_data);
3537 }
3538
3539 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3540 {
3541         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3542         struct ctdb_vnn *vnn;
3543         int ret;
3544
3545         /* verify the size of indata */
3546         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3547                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3548                 return -1;
3549         }
3550         if (indata.dsize != 
3551                 ( offsetof(struct ctdb_control_ip_iface, iface)
3552                 + pub->len ) ){
3553
3554                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3555                         "but should be %u bytes\n", 
3556                          (unsigned)indata.dsize, 
3557                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3558                 return -1;
3559         }
3560
3561         /* walk over all public addresses until we find a match */
3562         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3563                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3564                         TALLOC_CTX *mem_ctx;
3565
3566                         DLIST_REMOVE(ctdb->vnn, vnn);
3567                         if (vnn->pnn != ctdb->pnn) {
3568                                 if (vnn->iface != NULL) {
3569                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3570                                 }
3571                                 talloc_free(vnn);
3572                                 return 0;
3573                         }
3574                         vnn->pnn = -1;
3575
3576                         mem_ctx = talloc_new(ctdb);
3577                         talloc_steal(mem_ctx, vnn);
3578                         ret = ctdb_event_script_callback(ctdb, 
3579                                          mem_ctx, delete_ip_callback, mem_ctx,
3580                                          false,
3581                                          CTDB_EVENT_RELEASE_IP,
3582                                          "%s %s %u",
3583                                          ctdb_vnn_iface_string(vnn),
3584                                          ctdb_addr_to_str(&vnn->public_address),
3585                                          vnn->public_netmask_bits);
3586                         if (vnn->iface != NULL) {
3587                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3588                         }
3589                         if (ret != 0) {
3590                                 return -1;
3591                         }
3592                         return 0;
3593                 }
3594         }
3595
3596         return -1;
3597 }
3598
3599 /* This function is called from the recovery daemon to verify that a remote
3600    node has the expected ip allocation.
3601    This is verified against ctdb->ip_tree
3602 */
3603 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3604 {
3605         struct ctdb_public_ip_list *tmp_ip; 
3606         int i;
3607
3608         if (ctdb->ip_tree == NULL) {
3609                 /* dont know the expected allocation yet, assume remote node
3610                    is correct. */
3611                 return 0;
3612         }
3613
3614         if (ips == NULL) {
3615                 return 0;
3616         }
3617
3618         for (i=0; i<ips->num; i++) {
3619                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3620                 if (tmp_ip == NULL) {
3621                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3622                         return -1;
3623                 }
3624
3625                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3626                         continue;
3627                 }
3628
3629                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3630                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3631                         return -1;
3632                 }
3633         }
3634
3635         return 0;
3636 }
3637
3638 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3639 {
3640         struct ctdb_public_ip_list *tmp_ip; 
3641
3642         if (ctdb->ip_tree == NULL) {
3643                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3644                 return -1;
3645         }
3646
3647         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3648         if (tmp_ip == NULL) {
3649                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3650                 return -1;
3651         }
3652
3653         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3654         tmp_ip->pnn = ip->pnn;
3655
3656         return 0;
3657 }
3658
3659
/*
  state for one in-flight reloadips request; freeing it replies to the
  pending control and kills the child (see ctdb_reloadips_destructor)
 */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;	/* daemon context that owns this handle */
	struct ctdb_req_control *c;	/* control to reply to, NULL if none pending */
	int status;			/* status sent back in the control reply */
	int fd[2];			/* pipe: child writes fd[1], parent reads fd[0] */
	pid_t child;			/* pid of the forked reloadips child */
	struct fd_event *fde;		/* read event on fd[0] */
};
3668
3669 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3670 {
3671         if (h == h->ctdb->reload_ips) {
3672                 h->ctdb->reload_ips = NULL;
3673         }
3674         if (h->c != NULL) {
3675                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3676                 h->c = NULL;
3677         }
3678         kill(h->child, SIGKILL);
3679         return 0;
3680 }
3681
3682 static void ctdb_reloadips_timeout_event(struct event_context *ev,
3683                                 struct timed_event *te,
3684                                 struct timeval t, void *private_data)
3685 {
3686         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3687
3688         talloc_free(h);
3689 }       
3690
3691 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
3692                              uint16_t flags, void *private_data)
3693 {
3694         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3695
3696         char res;
3697         int ret;
3698
3699         ret = read(h->fd[0], &res, 1);
3700         if (ret < 1 || res != 0) {
3701                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3702                 res = 1;
3703         }
3704         h->status = res;
3705
3706         talloc_free(h);
3707 }
3708
3709 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3710 {
3711         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3712         struct ctdb_all_public_ips *ips;
3713         struct ctdb_vnn *vnn;
3714         int i, ret;
3715
3716         /* read the ip allocation from the local node */
3717         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
3718         if (ret != 0) {
3719                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
3720                 talloc_free(mem_ctx);
3721                 return -1;
3722         }
3723
3724         /* re-read the public ips file */
3725         ctdb->vnn = NULL;
3726         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3727                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3728                 talloc_free(mem_ctx);
3729                 return -1;
3730         }               
3731
3732
3733         /* check the previous list of ips and scan for ips that have been
3734            dropped.
3735          */
3736         for (i = 0; i < ips->num; i++) {
3737                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3738                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3739                                 break;
3740                         }
3741                 }
3742
3743                 /* we need to delete this ip, no longer available on this node */
3744                 if (vnn == NULL) {
3745                         struct ctdb_control_ip_iface pub;
3746
3747                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3748                         pub.addr  = ips->ips[i].addr;
3749                         pub.mask  = 0;
3750                         pub.len   = 0;
3751
3752                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3753                         if (ret != 0) {
3754                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3755                                 return -1;
3756                         }
3757                 }
3758         }
3759
3760
3761         /* loop over all new ones and check the ones we need to add */
3762         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3763                 for (i = 0; i < ips->num; i++) {
3764                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3765                                 break;
3766                         }
3767                 }
3768                 if (i == ips->num) {
3769                         struct ctdb_control_ip_iface pub;
3770                         char *ifaces = NULL;
3771                         int iface = 0;
3772
3773                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
3774
3775                         pub.addr  = vnn->public_address;
3776                         pub.mask  = vnn->public_netmask_bits;
3777
3778
3779                         ifaces = vnn->ifaces[0];
3780                         iface = 1;
3781                         while (vnn->ifaces[iface] != NULL) {
3782                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
3783                                 iface++;
3784                         }
3785                         pub.len   = strlen(ifaces)+1;
3786                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
3787
3788                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3789                         if (ret != 0) {
3790                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
3791                                 return -1;
3792                         }
3793                 }
3794         }
3795
3796         return 0;
3797 }
3798
3799 /* This control is sent to force the node to re-read the public addresses file
3800    and drop any addresses we should nnot longer host, and add new addresses
3801    that we are now able to host
3802 */
3803 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
3804 {
3805         struct ctdb_reloadips_handle *h;
3806         pid_t parent = getpid();
3807
3808         if (ctdb->reload_ips != NULL) {
3809                 talloc_free(ctdb->reload_ips);
3810                 ctdb->reload_ips = NULL;
3811         }
3812
3813         h = talloc(ctdb, struct ctdb_reloadips_handle);
3814         CTDB_NO_MEMORY(ctdb, h);
3815         h->ctdb     = ctdb;
3816         h->c        = NULL;
3817         h->status   = -1;
3818         
3819         if (pipe(h->fd) == -1) {
3820                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3821                 talloc_free(h);
3822                 return -1;
3823         }
3824
3825         h->child = ctdb_fork(ctdb);
3826         if (h->child == (pid_t)-1) {
3827                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
3828                 close(h->fd[0]);
3829                 close(h->fd[1]);
3830                 talloc_free(h);
3831                 return -1;
3832         }
3833
3834         /* child process */
3835         if (h->child == 0) {
3836                 signed char res = 0;
3837
3838                 close(h->fd[0]);
3839                 debug_extra = talloc_asprintf(NULL, "reloadips:");
3840
3841                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
3842                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
3843                         res = -1;
3844                 } else {
3845                         res = ctdb_reloadips_child(ctdb);
3846                         if (res != 0) {
3847                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
3848                         }
3849                 }
3850
3851                 write(h->fd[1], &res, 1);
3852                 /* make sure we die when our parent dies */
3853                 while (kill(parent, 0) == 0 || errno != ESRCH) {
3854                         sleep(5);
3855                 }
3856                 _exit(0);
3857         }
3858
3859         h->c             = talloc_steal(h, c);
3860
3861         close(h->fd[1]);
3862         set_close_on_exec(h->fd[0]);
3863
3864         talloc_set_destructor(h, ctdb_reloadips_destructor);
3865
3866
3867         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
3868                         EVENT_FD_READ, ctdb_reloadips_child_handler,
3869                         (void *)h);
3870         tevent_fd_set_auto_close(h->fde);
3871
3872         event_add_timed(ctdb->ev, h,
3873                         timeval_current_ofs(120, 0),
3874                         ctdb_reloadips_timeout_event, h);
3875
3876         /* we reply later */
3877         *async_reply = True;
3878         return 0;
3879 }