ctdb/daemon: Make delete IP wait until the IP is released
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
32
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT   3
35
/*
 * Pair of boolean flags consumed by the IP allocation algorithms.
 */
struct ctdb_ipflags {
	bool noiptakeover;
	bool noiphost;
};
41
/*
 * A local network interface, kept on the doubly-linked list headed by
 * ctdb->ifaces.
 */
struct ctdb_iface {
	/* list linkage */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name; lookups compare it with strcmp() */
	const char *name;
	/* only interfaces with link_up set are picked for an address */
	bool link_up;
	/* count of public addresses currently assigned to this interface */
	uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->delete_pending) {
264                 return false;
265         }
266
267         if (vnn->iface && vnn->iface->link_up) {
268                 return true;
269         }
270
271         for (i=0; vnn->ifaces[i]; i++) {
272                 struct ctdb_iface *cur;
273
274                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
275                 if (cur == NULL) {
276                         continue;
277                 }
278
279                 if (cur->link_up) {
280                         return true;
281                 }
282         }
283
284         return false;
285 }
286
287 struct ctdb_takeover_arp {
288         struct ctdb_context *ctdb;
289         uint32_t count;
290         ctdb_sock_addr addr;
291         struct ctdb_tcp_array *tcparray;
292         struct ctdb_vnn *vnn;
293 };
294
295
296 /*
297   lists of tcp endpoints
298  */
299 struct ctdb_tcp_list {
300         struct ctdb_tcp_list *prev, *next;
301         struct ctdb_tcp_connection connection;
302 };
303
304 /*
305   list of clients to kill on IP release
306  */
307 struct ctdb_client_ip {
308         struct ctdb_client_ip *prev, *next;
309         struct ctdb_context *ctdb;
310         ctdb_sock_addr addr;
311         uint32_t client_id;
312 };
313
314
315 /*
316   send a gratuitous arp
317  */
318 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
319                                   struct timeval t, void *private_data)
320 {
321         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
322                                                         struct ctdb_takeover_arp);
323         int i, ret;
324         struct ctdb_tcp_array *tcparray;
325         const char *iface = ctdb_vnn_iface_string(arp->vnn);
326
327         ret = ctdb_sys_send_arp(&arp->addr, iface);
328         if (ret != 0) {
329                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
330                                   iface, strerror(errno)));
331         }
332
333         tcparray = arp->tcparray;
334         if (tcparray) {
335                 for (i=0;i<tcparray->num;i++) {
336                         struct ctdb_tcp_connection *tcon;
337
338                         tcon = &tcparray->connections[i];
339                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
340                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
341                                 ctdb_addr_to_str(&tcon->src_addr),
342                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
343                         ret = ctdb_sys_send_tcp(
344                                 &tcon->src_addr, 
345                                 &tcon->dst_addr,
346                                 0, 0, 0);
347                         if (ret != 0) {
348                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
349                                         ctdb_addr_to_str(&tcon->src_addr)));
350                         }
351                 }
352         }
353
354         arp->count++;
355
356         if (arp->count == CTDB_ARP_REPEAT) {
357                 talloc_free(arp);
358                 return;
359         }
360
361         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
362                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
363                         ctdb_control_send_arp, arp);
364 }
365
366 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
367                                        struct ctdb_vnn *vnn)
368 {
369         struct ctdb_takeover_arp *arp;
370         struct ctdb_tcp_array *tcparray;
371
372         if (!vnn->takeover_ctx) {
373                 vnn->takeover_ctx = talloc_new(vnn);
374                 if (!vnn->takeover_ctx) {
375                         return -1;
376                 }
377         }
378
379         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
380         if (!arp) {
381                 return -1;
382         }
383
384         arp->ctdb = ctdb;
385         arp->addr = vnn->public_address;
386         arp->vnn  = vnn;
387
388         tcparray = vnn->tcp_array;
389         if (tcparray) {
390                 /* add all of the known tcp connections for this IP to the
391                    list of tcp connections to send tickle acks for */
392                 arp->tcparray = talloc_steal(arp, tcparray);
393
394                 vnn->tcp_array = NULL;
395                 vnn->tcp_update_needed = true;
396         }
397
398         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
399                         timeval_zero(), ctdb_control_send_arp, arp);
400
401         return 0;
402 }
403
404 struct takeover_callback_state {
405         struct ctdb_req_control *c;
406         ctdb_sock_addr *addr;
407         struct ctdb_vnn *vnn;
408 };
409
/*
 * State handed to ctdb_do_takeip_callback() while the takeip event
 * script runs.
 */
struct ctdb_do_takeip_state {
	struct ctdb_req_control *c;	/* control to reply to when done */
	struct ctdb_vnn *vnn;		/* address being taken over */
};
414
415 /*
416   called when takeip event finishes
417  */
418 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
419                                     void *private_data)
420 {
421         struct ctdb_do_takeip_state *state =
422                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
423         int32_t ret;
424         TDB_DATA data;
425
426         if (status != 0) {
427                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
428         
429                 if (status == -ETIME) {
430                         ctdb_ban_self(ctdb);
431                 }
432                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
433                                  ctdb_addr_to_str(&state->vnn->public_address),
434                                  ctdb_vnn_iface_string(state->vnn)));
435                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
436
437                 node->flags |= NODE_FLAGS_UNHEALTHY;
438                 talloc_free(state);
439                 return;
440         }
441
442         if (ctdb->do_checkpublicip) {
443
444         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
445         if (ret != 0) {
446                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
447                 talloc_free(state);
448                 return;
449         }
450
451         }
452
453         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
454         data.dsize = strlen((char *)data.dptr) + 1;
455         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
456
457         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
458
459
460         /* the control succeeded */
461         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
462         talloc_free(state);
463         return;
464 }
465
466 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
467 {
468         state->vnn->update_in_flight = false;
469         return 0;
470 }
471
472 /*
473   take over an ip address
474  */
475 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
476                               struct ctdb_req_control *c,
477                               struct ctdb_vnn *vnn)
478 {
479         int ret;
480         struct ctdb_do_takeip_state *state;
481
482         if (vnn->update_in_flight) {
483                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
484                                     "update for this IP already in flight\n",
485                                     ctdb_addr_to_str(&vnn->public_address),
486                                     vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         ret = ctdb_vnn_assign_iface(ctdb, vnn);
491         if (ret != 0) {
492                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
493                                  "assign a usable interface\n",
494                                  ctdb_addr_to_str(&vnn->public_address),
495                                  vnn->public_netmask_bits));
496                 return -1;
497         }
498
499         state = talloc(vnn, struct ctdb_do_takeip_state);
500         CTDB_NO_MEMORY(ctdb, state);
501
502         state->c = talloc_steal(ctdb, c);
503         state->vnn   = vnn;
504
505         vnn->update_in_flight = true;
506         talloc_set_destructor(state, ctdb_takeip_destructor);
507
508         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
509                             ctdb_addr_to_str(&vnn->public_address),
510                             vnn->public_netmask_bits,
511                             ctdb_vnn_iface_string(vnn)));
512
513         ret = ctdb_event_script_callback(ctdb,
514                                          state,
515                                          ctdb_do_takeip_callback,
516                                          state,
517                                          CTDB_EVENT_TAKE_IP,
518                                          "%s %s %u",
519                                          ctdb_vnn_iface_string(vnn),
520                                          ctdb_addr_to_str(&vnn->public_address),
521                                          vnn->public_netmask_bits);
522
523         if (ret != 0) {
524                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
525                         ctdb_addr_to_str(&vnn->public_address),
526                         ctdb_vnn_iface_string(vnn)));
527                 talloc_free(state);
528                 return -1;
529         }
530
531         return 0;
532 }
533
/*
 * State handed to ctdb_do_updateip_callback() while the updateip
 * event script runs.
 */
struct ctdb_do_updateip_state {
	struct ctdb_req_control *c;	/* control to reply to when done */
	struct ctdb_iface *old;		/* interface the address is moving from */
	struct ctdb_vnn *vnn;		/* address being moved */
};
539
540 /*
541   called when updateip event finishes
542  */
543 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
544                                       void *private_data)
545 {
546         struct ctdb_do_updateip_state *state =
547                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
548         int32_t ret;
549
550         if (status != 0) {
551                 if (status == -ETIME) {
552                         ctdb_ban_self(ctdb);
553                 }
554                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
555                         ctdb_addr_to_str(&state->vnn->public_address),
556                         state->old->name,
557                         ctdb_vnn_iface_string(state->vnn)));
558
559                 /*
560                  * All we can do is reset the old interface
561                  * and let the next run fix it
562                  */
563                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
564                 state->vnn->iface = state->old;
565                 state->vnn->iface->references++;
566
567                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
568                 talloc_free(state);
569                 return;
570         }
571
572         if (ctdb->do_checkpublicip) {
573
574         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
575         if (ret != 0) {
576                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
577                 talloc_free(state);
578                 return;
579         }
580
581         }
582
583         /* the control succeeded */
584         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
585         talloc_free(state);
586         return;
587 }
588
589 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
590 {
591         state->vnn->update_in_flight = false;
592         return 0;
593 }
594
595 /*
596   update (move) an ip address
597  */
598 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
599                                 struct ctdb_req_control *c,
600                                 struct ctdb_vnn *vnn)
601 {
602         int ret;
603         struct ctdb_do_updateip_state *state;
604         struct ctdb_iface *old = vnn->iface;
605         const char *new_name;
606
607         if (vnn->update_in_flight) {
608                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
609                                     "update for this IP already in flight\n",
610                                     ctdb_addr_to_str(&vnn->public_address),
611                                     vnn->public_netmask_bits));
612                 return -1;
613         }
614
615         ctdb_vnn_unassign_iface(ctdb, vnn);
616         ret = ctdb_vnn_assign_iface(ctdb, vnn);
617         if (ret != 0) {
618                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
619                                  "assin a usable interface (old iface '%s')\n",
620                                  ctdb_addr_to_str(&vnn->public_address),
621                                  vnn->public_netmask_bits,
622                                  old->name));
623                 return -1;
624         }
625
626         new_name = ctdb_vnn_iface_string(vnn);
627         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
628                 /* A benign update from one interface onto itself.
629                  * no need to run the eventscripts in this case, just return
630                  * success.
631                  */
632                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
633                 return 0;
634         }
635
636         state = talloc(vnn, struct ctdb_do_updateip_state);
637         CTDB_NO_MEMORY(ctdb, state);
638
639         state->c = talloc_steal(ctdb, c);
640         state->old = old;
641         state->vnn = vnn;
642
643         vnn->update_in_flight = true;
644         talloc_set_destructor(state, ctdb_updateip_destructor);
645
646         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
647                             "interface %s to %s\n",
648                             ctdb_addr_to_str(&vnn->public_address),
649                             vnn->public_netmask_bits,
650                             old->name,
651                             new_name));
652
653         ret = ctdb_event_script_callback(ctdb,
654                                          state,
655                                          ctdb_do_updateip_callback,
656                                          state,
657                                          CTDB_EVENT_UPDATE_IP,
658                                          "%s %s %s %u",
659                                          state->old->name,
660                                          new_name,
661                                          ctdb_addr_to_str(&vnn->public_address),
662                                          vnn->public_netmask_bits);
663         if (ret != 0) {
664                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
665                                  ctdb_addr_to_str(&vnn->public_address),
666                                  old->name, new_name));
667                 talloc_free(state);
668                 return -1;
669         }
670
671         return 0;
672 }
673
674 /*
675   Find the vnn of the node that has a public ip address
676   returns -1 if the address is not known as a public address
677  */
678 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
679 {
680         struct ctdb_vnn *vnn;
681
682         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
683                 if (ctdb_same_ip(&vnn->public_address, addr)) {
684                         return vnn;
685                 }
686         }
687
688         return NULL;
689 }
690
691 /*
692   take over an ip address
693  */
694 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
695                                  struct ctdb_req_control *c,
696                                  TDB_DATA indata,
697                                  bool *async_reply)
698 {
699         int ret;
700         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
701         struct ctdb_vnn *vnn;
702         bool have_ip = false;
703         bool do_updateip = false;
704         bool do_takeip = false;
705         struct ctdb_iface *best_iface = NULL;
706
707         if (pip->pnn != ctdb->pnn) {
708                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
709                                  "with pnn %d, but we're node %d\n",
710                                  ctdb_addr_to_str(&pip->addr),
711                                  pip->pnn, ctdb->pnn));
712                 return -1;
713         }
714
715         /* update out vnn list */
716         vnn = find_public_ip_vnn(ctdb, &pip->addr);
717         if (vnn == NULL) {
718                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
719                         ctdb_addr_to_str(&pip->addr)));
720                 return 0;
721         }
722
723         if (ctdb->do_checkpublicip) {
724                 have_ip = ctdb_sys_have_ip(&pip->addr);
725         }
726         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
727         if (best_iface == NULL) {
728                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
729                                  "a usable interface (old %s, have_ip %d)\n",
730                                  ctdb_addr_to_str(&vnn->public_address),
731                                  vnn->public_netmask_bits,
732                                  ctdb_vnn_iface_string(vnn),
733                                  have_ip));
734                 return -1;
735         }
736
737         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
738                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
739                 have_ip = false;
740         }
741
742
743         if (vnn->iface == NULL && have_ip) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
746                                  ctdb_addr_to_str(&vnn->public_address)));
747                 return 0;
748         }
749
750         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
751                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752                                   "and we have it on iface[%s], but it was assigned to node %d"
753                                   "and we are node %d, banning ourself\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
756                 ctdb_ban_self(ctdb);
757                 return -1;
758         }
759
760         if (vnn->pnn == -1 && have_ip) {
761                 vnn->pnn = ctdb->pnn;
762                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
763                                   "and we already have it on iface[%s], update local daemon\n",
764                                  ctdb_addr_to_str(&vnn->public_address),
765                                   ctdb_vnn_iface_string(vnn)));
766                 return 0;
767         }
768
769         if (vnn->iface) {
770                 if (vnn->iface != best_iface) {
771                         if (!vnn->iface->link_up) {
772                                 do_updateip = true;
773                         } else if (vnn->iface->references > (best_iface->references + 1)) {
774                                 /* only move when the rebalance gains something */
775                                         do_updateip = true;
776                         }
777                 }
778         }
779
780         if (!have_ip) {
781                 if (do_updateip) {
782                         ctdb_vnn_unassign_iface(ctdb, vnn);
783                         do_updateip = false;
784                 }
785                 do_takeip = true;
786         }
787
788         if (do_takeip) {
789                 ret = ctdb_do_takeip(ctdb, c, vnn);
790                 if (ret != 0) {
791                         return -1;
792                 }
793         } else if (do_updateip) {
794                 ret = ctdb_do_updateip(ctdb, c, vnn);
795                 if (ret != 0) {
796                         return -1;
797                 }
798         } else {
799                 /*
800                  * The interface is up and the kernel known the ip
801                  * => do nothing
802                  */
803                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
804                         ctdb_addr_to_str(&pip->addr),
805                         vnn->public_netmask_bits,
806                         ctdb_vnn_iface_string(vnn)));
807                 return 0;
808         }
809
810         /* tell ctdb_control.c that we will be replying asynchronously */
811         *async_reply = true;
812
813         return 0;
814 }
815
816 /*
817   takeover an ip address old v4 style
818  */
819 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
820                                 struct ctdb_req_control *c,
821                                 TDB_DATA indata, 
822                                 bool *async_reply)
823 {
824         TDB_DATA data;
825         
826         data.dsize = sizeof(struct ctdb_public_ip);
827         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
828         CTDB_NO_MEMORY(ctdb, data.dptr);
829         
830         memcpy(data.dptr, indata.dptr, indata.dsize);
831         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
832 }
833
834 /*
835   kill any clients that are registered with a IP that is being released
836  */
837 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
838 {
839         struct ctdb_client_ip *ip;
840
841         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
842                 ctdb_addr_to_str(addr)));
843
844         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
845                 ctdb_sock_addr tmp_addr;
846
847                 tmp_addr = ip->addr;
848                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
849                         ip->client_id,
850                         ctdb_addr_to_str(&ip->addr)));
851
852                 if (ctdb_same_ip(&tmp_addr, addr)) {
853                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
854                                                                      ip->client_id, 
855                                                                      struct ctdb_client);
856                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
857                                 ip->client_id,
858                                 ctdb_addr_to_str(&ip->addr),
859                                 client->pid));
860
861                         if (client->pid != 0) {
862                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
863                                         (unsigned)client->pid,
864                                         ctdb_addr_to_str(addr),
865                                         ip->client_id));
866                                 kill(client->pid, SIGKILL);
867                         }
868                 }
869         }
870 }
871
872 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
873 {
874         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
875
876         DLIST_REMOVE(ctdb->vnn, vnn);
877         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
878         ctdb_vnn_unassign_iface(ctdb, vnn);
879         talloc_free(vnn);
880         talloc_free(mem_ctx);
881 }
882
883 /*
884   called when releaseip event finishes
885  */
886 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
887                                 void *private_data)
888 {
889         struct takeover_callback_state *state = 
890                 talloc_get_type(private_data, struct takeover_callback_state);
891         TDB_DATA data;
892
893         if (status == -ETIME) {
894                 ctdb_ban_self(ctdb);
895         }
896
897         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
898                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
899                                   ctdb_addr_to_str(state->addr)));
900                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
901                 talloc_free(state);
902                 return;
903         }
904
905         /* send a message to all clients of this node telling them
906            that the cluster has been reconfigured and they should
907            release any sockets on this IP */
908         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
909         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
910         data.dsize = strlen((char *)data.dptr)+1;
911
912         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
913
914         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
915
916         /* kill clients that have registered with this IP */
917         release_kill_clients(ctdb, state->addr);
918
919         ctdb_vnn_unassign_iface(ctdb, state->vnn);
920
921         /* Process the IP if it has been marked for deletion */
922         if (state->vnn->delete_pending) {
923                 do_delete_ip(ctdb, state->vnn);
924                 state->vnn = NULL;
925         }
926
927         /* the control succeeded */
928         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
929         talloc_free(state);
930 }
931
932 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
933 {
934         if (state->vnn != NULL) {
935                 state->vnn->update_in_flight = false;
936         }
937         return 0;
938 }
939
940 /*
941   release an ip address
942  */
943 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
944                                 struct ctdb_req_control *c,
945                                 TDB_DATA indata, 
946                                 bool *async_reply)
947 {
948         int ret;
949         struct takeover_callback_state *state;
950         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
951         struct ctdb_vnn *vnn;
952         char *iface;
953
954         /* update our vnn list */
955         vnn = find_public_ip_vnn(ctdb, &pip->addr);
956         if (vnn == NULL) {
957                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
958                         ctdb_addr_to_str(&pip->addr)));
959                 return 0;
960         }
961         vnn->pnn = pip->pnn;
962
963         /* stop any previous arps */
964         talloc_free(vnn->takeover_ctx);
965         vnn->takeover_ctx = NULL;
966
967         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
968          * lazy multicast to drop an IP from any node that isn't the
969          * intended new node.  The following causes makes ctdbd ignore
970          * a release for any address it doesn't host.
971          */
972         if (ctdb->do_checkpublicip) {
973                 if (!ctdb_sys_have_ip(&pip->addr)) {
974                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
975                                 ctdb_addr_to_str(&pip->addr),
976                                 vnn->public_netmask_bits,
977                                 ctdb_vnn_iface_string(vnn)));
978                         ctdb_vnn_unassign_iface(ctdb, vnn);
979                         return 0;
980                 }
981         } else {
982                 if (vnn->iface == NULL) {
983                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
984                                            ctdb_addr_to_str(&pip->addr),
985                                            vnn->public_netmask_bits));
986                         return 0;
987                 }
988         }
989
990         /* There is a potential race between take_ip and us because we
991          * update the VNN via a callback that run when the
992          * eventscripts have been run.  Avoid the race by allowing one
993          * update to be in flight at a time.
994          */
995         if (vnn->update_in_flight) {
996                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
997                                     "update for this IP already in flight\n",
998                                     ctdb_addr_to_str(&vnn->public_address),
999                                     vnn->public_netmask_bits));
1000                 return -1;
1001         }
1002
1003         if (ctdb->do_checkpublicip) {
1004                 iface = ctdb_sys_find_ifname(&pip->addr);
1005                 if (iface == NULL) {
1006                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
1007                         return 0;
1008                 }
1009                 if (vnn->iface == NULL) {
1010                         DEBUG(DEBUG_WARNING,
1011                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
1012                                ctdb_addr_to_str(&pip->addr),
1013                                iface));
1014                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
1015                         DEBUG(DEBUG_WARNING,
1016                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
1017                                ctdb_addr_to_str(&pip->addr),
1018                                iface,
1019                                ctdb_vnn_iface_string(vnn)));
1020                         /* Should we fix vnn->iface?  If we do, what
1021                          * happens to reference counts?
1022                          */
1023                 }
1024         } else {
1025                 iface = strdup(ctdb_vnn_iface_string(vnn));
1026         }
1027
1028         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1029                 ctdb_addr_to_str(&pip->addr),
1030                 vnn->public_netmask_bits,
1031                 iface,
1032                 pip->pnn));
1033
1034         state = talloc(ctdb, struct takeover_callback_state);
1035         CTDB_NO_MEMORY(ctdb, state);
1036
1037         state->c = talloc_steal(state, c);
1038         state->addr = talloc(state, ctdb_sock_addr);       
1039         CTDB_NO_MEMORY(ctdb, state->addr);
1040         *state->addr = pip->addr;
1041         state->vnn   = vnn;
1042
1043         vnn->update_in_flight = true;
1044         talloc_set_destructor(state, ctdb_releaseip_destructor);
1045
1046         ret = ctdb_event_script_callback(ctdb, 
1047                                          state, release_ip_callback, state,
1048                                          CTDB_EVENT_RELEASE_IP,
1049                                          "%s %s %u",
1050                                          iface,
1051                                          ctdb_addr_to_str(&pip->addr),
1052                                          vnn->public_netmask_bits);
1053         free(iface);
1054         if (ret != 0) {
1055                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1056                         ctdb_addr_to_str(&pip->addr),
1057                         ctdb_vnn_iface_string(vnn)));
1058                 talloc_free(state);
1059                 return -1;
1060         }
1061
1062         /* tell the control that we will be reply asynchronously */
1063         *async_reply = true;
1064         return 0;
1065 }
1066
1067 /*
1068   release an ip address old v4 style
1069  */
1070 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1071                                 struct ctdb_req_control *c,
1072                                 TDB_DATA indata, 
1073                                 bool *async_reply)
1074 {
1075         TDB_DATA data;
1076         
1077         data.dsize = sizeof(struct ctdb_public_ip);
1078         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1079         CTDB_NO_MEMORY(ctdb, data.dptr);
1080         
1081         memcpy(data.dptr, indata.dptr, indata.dsize);
1082         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1083 }
1084
1085
1086 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1087                                    ctdb_sock_addr *addr,
1088                                    unsigned mask, const char *ifaces,
1089                                    bool check_address)
1090 {
1091         struct ctdb_vnn      *vnn;
1092         uint32_t num = 0;
1093         char *tmp;
1094         const char *iface;
1095         int i;
1096         int ret;
1097
1098         tmp = strdup(ifaces);
1099         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100                 if (!ctdb_sys_check_iface_exists(iface)) {
1101                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1102                         free(tmp);
1103                         return -1;
1104                 }
1105         }
1106         free(tmp);
1107
1108         /* Verify that we dont have an entry for this ip yet */
1109         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1110                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1111                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1112                                 ctdb_addr_to_str(addr)));
1113                         return -1;
1114                 }               
1115         }
1116
1117         /* create a new vnn structure for this ip address */
1118         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1119         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1120         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1121         tmp = talloc_strdup(vnn, ifaces);
1122         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1123         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1124                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1125                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1126                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1127                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1128                 num++;
1129         }
1130         talloc_free(tmp);
1131         vnn->ifaces[num] = NULL;
1132         vnn->public_address      = *addr;
1133         vnn->public_netmask_bits = mask;
1134         vnn->pnn                 = -1;
1135         if (check_address) {
1136                 if (ctdb_sys_have_ip(addr)) {
1137                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1138                         vnn->pnn = ctdb->pnn;
1139                 }
1140         }
1141
1142         for (i=0; vnn->ifaces[i]; i++) {
1143                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1144                 if (ret != 0) {
1145                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1146                                            "for public_address[%s]\n",
1147                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1148                         talloc_free(vnn);
1149                         return -1;
1150                 }
1151         }
1152
1153         DLIST_ADD(ctdb->vnn, vnn);
1154
1155         return 0;
1156 }
1157
1158 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1159                                   struct timeval t, void *private_data)
1160 {
1161         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1162                                                         struct ctdb_context);
1163         struct ctdb_vnn *vnn;
1164
1165         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1166                 int i;
1167
1168                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1169                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1170                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1171                                         vnn->ifaces[i],
1172                                         ctdb_addr_to_str(&vnn->public_address)));
1173                         }
1174                 }
1175         }
1176
1177         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1178                 timeval_current_ofs(30, 0), 
1179                 ctdb_check_interfaces_event, ctdb);
1180 }
1181
1182
1183 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1184 {
1185         if (ctdb->check_public_ifaces_ctx != NULL) {
1186                 talloc_free(ctdb->check_public_ifaces_ctx);
1187                 ctdb->check_public_ifaces_ctx = NULL;
1188         }
1189
1190         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1191         if (ctdb->check_public_ifaces_ctx == NULL) {
1192                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1193         }
1194
1195         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1196                 timeval_current_ofs(30, 0), 
1197                 ctdb_check_interfaces_event, ctdb);
1198
1199         return 0;
1200 }
1201
1202
1203 /*
1204   setup the public address lists from a file
1205 */
1206 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1207 {
1208         char **lines;
1209         int nlines;
1210         int i;
1211
1212         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1213         if (lines == NULL) {
1214                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1215                 return -1;
1216         }
1217         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1218                 nlines--;
1219         }
1220
1221         for (i=0;i<nlines;i++) {
1222                 unsigned mask;
1223                 ctdb_sock_addr addr;
1224                 const char *addrstr;
1225                 const char *ifaces;
1226                 char *tok, *line;
1227
1228                 line = lines[i];
1229                 while ((*line == ' ') || (*line == '\t')) {
1230                         line++;
1231                 }
1232                 if (*line == '#') {
1233                         continue;
1234                 }
1235                 if (strcmp(line, "") == 0) {
1236                         continue;
1237                 }
1238                 tok = strtok(line, " \t");
1239                 addrstr = tok;
1240                 tok = strtok(NULL, " \t");
1241                 if (tok == NULL) {
1242                         if (NULL == ctdb->default_public_interface) {
1243                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1244                                          i+1));
1245                                 talloc_free(lines);
1246                                 return -1;
1247                         }
1248                         ifaces = ctdb->default_public_interface;
1249                 } else {
1250                         ifaces = tok;
1251                 }
1252
1253                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1254                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1255                         talloc_free(lines);
1256                         return -1;
1257                 }
1258                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1259                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1260                         talloc_free(lines);
1261                         return -1;
1262                 }
1263         }
1264
1265
1266         talloc_free(lines);
1267         return 0;
1268 }
1269
1270 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1271                               const char *iface,
1272                               const char *ip)
1273 {
1274         struct ctdb_vnn *svnn;
1275         struct ctdb_iface *cur = NULL;
1276         bool ok;
1277         int ret;
1278
1279         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1280         CTDB_NO_MEMORY(ctdb, svnn);
1281
1282         svnn->ifaces = talloc_array(svnn, const char *, 2);
1283         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1284         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1285         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1286         svnn->ifaces[1] = NULL;
1287
1288         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1289         if (!ok) {
1290                 talloc_free(svnn);
1291                 return -1;
1292         }
1293
1294         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1295         if (ret != 0) {
1296                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1297                                    "for single_ip[%s]\n",
1298                                    svnn->ifaces[0],
1299                                    ctdb_addr_to_str(&svnn->public_address)));
1300                 talloc_free(svnn);
1301                 return -1;
1302         }
1303
1304         /* assume the single public ip interface is initially "good" */
1305         cur = ctdb_find_iface(ctdb, iface);
1306         if (cur == NULL) {
1307                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1308                 return -1;
1309         }
1310         cur->link_up = true;
1311
1312         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1313         if (ret != 0) {
1314                 talloc_free(svnn);
1315                 return -1;
1316         }
1317
1318         ctdb->single_ip_vnn = svnn;
1319         return 0;
1320 }
1321
/* One entry of a singly linked list of public addresses as used by the
 * IP allocation code in this file. */
struct ctdb_public_ip_list {
        /* next entry in the list, NULL at the end */
        struct ctdb_public_ip_list *next;
        /* node the address is assigned to; code in this file compares
         * it against -1 to detect an unassigned address */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1327
1328 /* Given a physical node, return the number of
1329    public addresses that is currently assigned to this node.
1330 */
1331 static int node_ip_coverage(struct ctdb_context *ctdb, 
1332         int32_t pnn,
1333         struct ctdb_public_ip_list *ips)
1334 {
1335         int num=0;
1336
1337         for (;ips;ips=ips->next) {
1338                 if (ips->pnn == pnn) {
1339                         num++;
1340                 }
1341         }
1342         return num;
1343 }
1344
1345
1346 /* Can the given node host the given IP: is the public IP known to the
1347  * node and is NOIPHOST unset?
1348 */
1349 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1350                              struct ctdb_ipflags ipflags,
1351                              struct ctdb_public_ip_list *ip)
1352 {
1353         struct ctdb_all_public_ips *public_ips;
1354         int i;
1355
1356         if (ipflags.noiphost) {
1357                 return false;
1358         }
1359
1360         public_ips = ctdb->nodes[pnn]->available_public_ips;
1361
1362         if (public_ips == NULL) {
1363                 return false;
1364         }
1365
1366         for (i=0; i<public_ips->num; i++) {
1367                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1368                         /* yes, this node can serve this public ip */
1369                         return true;
1370                 }
1371         }
1372
1373         return false;
1374 }
1375
1376 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1377                                  struct ctdb_ipflags ipflags,
1378                                  struct ctdb_public_ip_list *ip)
1379 {
1380         if (ipflags.noiptakeover) {
1381                 return false;
1382         }
1383
1384         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1385 }
1386
1387 /* search the node lists list for a node to takeover this ip.
1388    pick the node that currently are serving the least number of ips
1389    so that the ips get spread out evenly.
1390 */
1391 static int find_takeover_node(struct ctdb_context *ctdb, 
1392                 struct ctdb_ipflags *ipflags,
1393                 struct ctdb_public_ip_list *ip,
1394                 struct ctdb_public_ip_list *all_ips)
1395 {
1396         int pnn, min=0, num;
1397         int i, numnodes;
1398
1399         numnodes = talloc_array_length(ipflags);
1400         pnn    = -1;
1401         for (i=0; i<numnodes; i++) {
1402                 /* verify that this node can serve this ip */
1403                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1404                         /* no it couldnt   so skip to the next node */
1405                         continue;
1406                 }
1407
1408                 num = node_ip_coverage(ctdb, i, all_ips);
1409                 /* was this the first node we checked ? */
1410                 if (pnn == -1) {
1411                         pnn = i;
1412                         min  = num;
1413                 } else {
1414                         if (num < min) {
1415                                 pnn = i;
1416                                 min  = num;
1417                         }
1418                 }
1419         }       
1420         if (pnn == -1) {
1421                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1422                         ctdb_addr_to_str(&ip->addr)));
1423
1424                 return -1;
1425         }
1426
1427         ip->pnn = pnn;
1428         return 0;
1429 }
1430
1431 #define IP_KEYLEN       4
1432 static uint32_t *ip_key(ctdb_sock_addr *ip)
1433 {
1434         static uint32_t key[IP_KEYLEN];
1435
1436         bzero(key, sizeof(key));
1437
1438         switch (ip->sa.sa_family) {
1439         case AF_INET:
1440                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1441                 break;
1442         case AF_INET6: {
1443                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1444                 key[0]  = htonl(s6_a32[0]);
1445                 key[1]  = htonl(s6_a32[1]);
1446                 key[2]  = htonl(s6_a32[2]);
1447                 key[3]  = htonl(s6_a32[3]);
1448                 break;
1449         }
1450         default:
1451                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1452                 return key;
1453         }
1454
1455         return key;
1456 }
1457
1458 static void *add_ip_callback(void *parm, void *data)
1459 {
1460         struct ctdb_public_ip_list *this_ip = parm; 
1461         struct ctdb_public_ip_list *prev_ip = data; 
1462
1463         if (prev_ip == NULL) {
1464                 return parm;
1465         }
1466         if (this_ip->pnn == -1) {
1467                 this_ip->pnn = prev_ip->pnn;
1468         }
1469
1470         return parm;
1471 }
1472
1473 static int getips_count_callback(void *param, void *data)
1474 {
1475         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1476         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1477
1478         new_ip->next = *ip_list;
1479         *ip_list     = new_ip;
1480         return 0;
1481 }
1482
1483 static struct ctdb_public_ip_list *
1484 create_merged_ip_list(struct ctdb_context *ctdb)
1485 {
1486         int i, j;
1487         struct ctdb_public_ip_list *ip_list;
1488         struct ctdb_all_public_ips *public_ips;
1489
1490         if (ctdb->ip_tree != NULL) {
1491                 talloc_free(ctdb->ip_tree);
1492                 ctdb->ip_tree = NULL;
1493         }
1494         ctdb->ip_tree = trbt_create(ctdb, 0);
1495
1496         for (i=0;i<ctdb->num_nodes;i++) {
1497                 public_ips = ctdb->nodes[i]->known_public_ips;
1498
1499                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1500                         continue;
1501                 }
1502
1503                 /* there were no public ips for this node */
1504                 if (public_ips == NULL) {
1505                         continue;
1506                 }               
1507
1508                 for (j=0;j<public_ips->num;j++) {
1509                         struct ctdb_public_ip_list *tmp_ip; 
1510
1511                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1512                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1513                         /* Do not use information about IP addresses hosted
1514                          * on other nodes, it may not be accurate */
1515                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1516                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1517                         } else {
1518                                 tmp_ip->pnn = -1;
1519                         }
1520                         tmp_ip->addr = public_ips->ips[j].addr;
1521                         tmp_ip->next = NULL;
1522
1523                         trbt_insertarray32_callback(ctdb->ip_tree,
1524                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1525                                 add_ip_callback,
1526                                 tmp_ip);
1527                 }
1528         }
1529
1530         ip_list = NULL;
1531         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1532
1533         return ip_list;
1534 }
1535
1536 /* 
1537  * This is the length of the longtest common prefix between the IPs.
1538  * It is calculated by XOR-ing the 2 IPs together and counting the
1539  * number of leading zeroes.  The implementation means that all
1540  * addresses end up being 128 bits long.
1541  *
1542  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1543  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1544  * lots of nodes and IP addresses?
1545  */
1546 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1547 {
1548         uint32_t ip1_k[IP_KEYLEN];
1549         uint32_t *t;
1550         int i;
1551         uint32_t x;
1552
1553         uint32_t distance = 0;
1554
1555         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1556         t = ip_key(ip2);
1557         for (i=0; i<IP_KEYLEN; i++) {
1558                 x = ip1_k[i] ^ t[i];
1559                 if (x == 0) {
1560                         distance += 32;
1561                 } else {
1562                         /* Count number of leading zeroes. 
1563                          * FIXME? This could be optimised...
1564                          */
1565                         while ((x & (1 << 31)) == 0) {
1566                                 x <<= 1;
1567                                 distance += 1;
1568                         }
1569                 }
1570         }
1571
1572         return distance;
1573 }
1574
1575 /* Calculate the IP distance for the given IP relative to IPs on the
1576    given node.  The ips argument is generally the all_ips variable
1577    used in the main part of the algorithm.
1578  */
1579 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1580                                   struct ctdb_public_ip_list *ips,
1581                                   int pnn)
1582 {
1583         struct ctdb_public_ip_list *t;
1584         uint32_t d;
1585
1586         uint32_t sum = 0;
1587
1588         for (t=ips; t != NULL; t=t->next) {
1589                 if (t->pnn != pnn) {
1590                         continue;
1591                 }
1592
1593                 /* Optimisation: We never calculate the distance
1594                  * between an address and itself.  This allows us to
1595                  * calculate the effect of removing an address from a
1596                  * node by simply calculating the distance between
1597                  * that address and all of the exitsing addresses.
1598                  * Moreover, we assume that we're only ever dealing
1599                  * with addresses from all_ips so we can identify an
1600                  * address via a pointer rather than doing a more
1601                  * expensive address comparison. */
1602                 if (&(t->addr) == ip) {
1603                         continue;
1604                 }
1605
1606                 d = ip_distance(ip, &(t->addr));
1607                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1608         }
1609
1610         return sum;
1611 }
1612
1613 /* Return the LCP2 imbalance metric for addresses currently assigned
1614    to the given node.
1615  */
1616 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1617 {
1618         struct ctdb_public_ip_list *t;
1619
1620         uint32_t imbalance = 0;
1621
1622         for (t=all_ips; t!=NULL; t=t->next) {
1623                 if (t->pnn != pnn) {
1624                         continue;
1625                 }
1626                 /* Pass the rest of the IPs rather than the whole
1627                    all_ips input list.
1628                 */
1629                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1630         }
1631
1632         return imbalance;
1633 }
1634
1635 /* Allocate any unassigned IPs just by looping through the IPs and
1636  * finding the best node for each.
1637  */
1638 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1639                                       struct ctdb_ipflags *ipflags,
1640                                       struct ctdb_public_ip_list *all_ips)
1641 {
1642         struct ctdb_public_ip_list *tmp_ip;
1643
1644         /* loop over all ip's and find a physical node to cover for 
1645            each unassigned ip.
1646         */
1647         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1648                 if (tmp_ip->pnn == -1) {
1649                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1650                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1651                                         ctdb_addr_to_str(&tmp_ip->addr)));
1652                         }
1653                 }
1654         }
1655 }
1656
1657 /* Basic non-deterministic rebalancing algorithm.
1658  */
1659 static void basic_failback(struct ctdb_context *ctdb,
1660                            struct ctdb_ipflags *ipflags,
1661                            struct ctdb_public_ip_list *all_ips,
1662                            int num_ips)
1663 {
1664         int i, numnodes;
1665         int maxnode, maxnum, minnode, minnum, num, retries;
1666         struct ctdb_public_ip_list *tmp_ip;
1667
1668         numnodes = talloc_array_length(ipflags);
1669         retries = 0;
1670
1671 try_again:
1672         maxnum=0;
1673         minnum=0;
1674
1675         /* for each ip address, loop over all nodes that can serve
1676            this ip and make sure that the difference between the node
1677            serving the most and the node serving the least ip's are
1678            not greater than 1.
1679         */
1680         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1681                 if (tmp_ip->pnn == -1) {
1682                         continue;
1683                 }
1684
1685                 /* Get the highest and lowest number of ips's served by any 
1686                    valid node which can serve this ip.
1687                 */
1688                 maxnode = -1;
1689                 minnode = -1;
1690                 for (i=0; i<numnodes; i++) {
1691                         /* only check nodes that can actually serve this ip */
1692                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1693                                 /* no it couldnt   so skip to the next node */
1694                                 continue;
1695                         }
1696
1697                         num = node_ip_coverage(ctdb, i, all_ips);
1698                         if (maxnode == -1) {
1699                                 maxnode = i;
1700                                 maxnum  = num;
1701                         } else {
1702                                 if (num > maxnum) {
1703                                         maxnode = i;
1704                                         maxnum  = num;
1705                                 }
1706                         }
1707                         if (minnode == -1) {
1708                                 minnode = i;
1709                                 minnum  = num;
1710                         } else {
1711                                 if (num < minnum) {
1712                                         minnode = i;
1713                                         minnum  = num;
1714                                 }
1715                         }
1716                 }
1717                 if (maxnode == -1) {
1718                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1719                                 ctdb_addr_to_str(&tmp_ip->addr)));
1720
1721                         continue;
1722                 }
1723
1724                 /* if the spread between the smallest and largest coverage by
1725                    a node is >=2 we steal one of the ips from the node with
1726                    most coverage to even things out a bit.
1727                    try to do this a limited number of times since we dont
1728                    want to spend too much time balancing the ip coverage.
1729                 */
1730                 if ( (maxnum > minnum+1)
1731                      && (retries < (num_ips + 5)) ){
1732                         struct ctdb_public_ip_list *tmp;
1733
1734                         /* Reassign one of maxnode's VNNs */
1735                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1736                                 if (tmp->pnn == maxnode) {
1737                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1738                                         retries++;
1739                                         goto try_again;;
1740                                 }
1741                         }
1742                 }
1743         }
1744 }
1745
1746 static void lcp2_init(struct ctdb_context *tmp_ctx,
1747                       struct ctdb_ipflags *ipflags,
1748                       struct ctdb_public_ip_list *all_ips,
1749                       uint32_t *force_rebalance_nodes,
1750                       uint32_t **lcp2_imbalances,
1751                       bool **rebalance_candidates)
1752 {
1753         int i, numnodes;
1754         struct ctdb_public_ip_list *tmp_ip;
1755
1756         numnodes = talloc_array_length(ipflags);
1757
1758         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1759         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1760         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1761         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1762
1763         for (i=0; i<numnodes; i++) {
1764                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1765                 /* First step: assume all nodes are candidates */
1766                 (*rebalance_candidates)[i] = true;
1767         }
1768
1769         /* 2nd step: if a node has IPs assigned then it must have been
1770          * healthy before, so we remove it from consideration.  This
1771          * is overkill but is all we have because we don't maintain
1772          * state between takeover runs.  An alternative would be to
1773          * keep state and invalidate it every time the recovery master
1774          * changes.
1775          */
1776         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1777                 if (tmp_ip->pnn != -1) {
1778                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1779                 }
1780         }
1781
1782         /* 3rd step: if a node is forced to re-balance then
1783            we allow failback onto the node */
1784         if (force_rebalance_nodes == NULL) {
1785                 return;
1786         }
1787         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1788                 uint32_t pnn = force_rebalance_nodes[i];
1789                 if (pnn >= numnodes) {
1790                         DEBUG(DEBUG_ERR,
1791                               (__location__ "unknown node %u\n", pnn));
1792                         continue;
1793                 }
1794
1795                 DEBUG(DEBUG_NOTICE,
1796                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1797                 (*rebalance_candidates)[pnn] = true;
1798         }
1799 }
1800
1801 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1802  * the IP/node combination that will cost the least.
1803  */
1804 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1805                                      struct ctdb_ipflags *ipflags,
1806                                      struct ctdb_public_ip_list *all_ips,
1807                                      uint32_t *lcp2_imbalances)
1808 {
1809         struct ctdb_public_ip_list *tmp_ip;
1810         int dstnode, numnodes;
1811
1812         int minnode;
1813         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1814         struct ctdb_public_ip_list *minip;
1815
1816         bool should_loop = true;
1817         bool have_unassigned = true;
1818
1819         numnodes = talloc_array_length(ipflags);
1820
1821         while (have_unassigned && should_loop) {
1822                 should_loop = false;
1823
1824                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1825                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1826
1827                 minnode = -1;
1828                 mindsum = 0;
1829                 minip = NULL;
1830
1831                 /* loop over each unassigned ip. */
1832                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1833                         if (tmp_ip->pnn != -1) {
1834                                 continue;
1835                         }
1836
1837                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1838                                 /* only check nodes that can actually takeover this ip */
1839                                 if (!can_node_takeover_ip(ctdb, dstnode,
1840                                                           ipflags[dstnode],
1841                                                           tmp_ip)) {
1842                                         /* no it couldnt   so skip to the next node */
1843                                         continue;
1844                                 }
1845
1846                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1847                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1848                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1849                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1850                                                    dstnode,
1851                                                    dstimbl - lcp2_imbalances[dstnode]));
1852
1853
1854                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1855                                         minnode = dstnode;
1856                                         minimbl = dstimbl;
1857                                         mindsum = dstdsum;
1858                                         minip = tmp_ip;
1859                                         should_loop = true;
1860                                 }
1861                         }
1862                 }
1863
1864                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1865
1866                 /* If we found one then assign it to the given node. */
1867                 if (minnode != -1) {
1868                         minip->pnn = minnode;
1869                         lcp2_imbalances[minnode] = minimbl;
1870                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1871                                           ctdb_addr_to_str(&(minip->addr)),
1872                                           minnode,
1873                                           mindsum));
1874                 }
1875
1876                 /* There might be a better way but at least this is clear. */
1877                 have_unassigned = false;
1878                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1879                         if (tmp_ip->pnn == -1) {
1880                                 have_unassigned = true;
1881                         }
1882                 }
1883         }
1884
1885         /* We know if we have an unassigned addresses so we might as
1886          * well optimise.
1887          */
1888         if (have_unassigned) {
1889                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1890                         if (tmp_ip->pnn == -1) {
1891                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1892                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1893                         }
1894                 }
1895         }
1896 }
1897
1898 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1899  * to move IPs from, determines the best IP/destination node
1900  * combination to move from the source node.
1901  */
1902 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1903                                     struct ctdb_ipflags *ipflags,
1904                                     struct ctdb_public_ip_list *all_ips,
1905                                     int srcnode,
1906                                     uint32_t *lcp2_imbalances,
1907                                     bool *rebalance_candidates)
1908 {
1909         int dstnode, mindstnode, numnodes;
1910         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1911         uint32_t minsrcimbl, mindstimbl;
1912         struct ctdb_public_ip_list *minip;
1913         struct ctdb_public_ip_list *tmp_ip;
1914
1915         /* Find an IP and destination node that best reduces imbalance. */
1916         srcimbl = 0;
1917         minip = NULL;
1918         minsrcimbl = 0;
1919         mindstnode = -1;
1920         mindstimbl = 0;
1921
1922         numnodes = talloc_array_length(ipflags);
1923
1924         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1925         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1926                            srcnode, lcp2_imbalances[srcnode]));
1927
1928         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1929                 /* Only consider addresses on srcnode. */
1930                 if (tmp_ip->pnn != srcnode) {
1931                         continue;
1932                 }
1933
1934                 /* What is this IP address costing the source node? */
1935                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1936                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1937
1938                 /* Consider this IP address would cost each potential
1939                  * destination node.  Destination nodes are limited to
1940                  * those that are newly healthy, since we don't want
1941                  * to do gratuitous failover of IPs just to make minor
1942                  * balance improvements.
1943                  */
1944                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1945                         if (!rebalance_candidates[dstnode]) {
1946                                 continue;
1947                         }
1948
1949                         /* only check nodes that can actually takeover this ip */
1950                         if (!can_node_takeover_ip(ctdb, dstnode,
1951                                                   ipflags[dstnode], tmp_ip)) {
1952                                 /* no it couldnt   so skip to the next node */
1953                                 continue;
1954                         }
1955
1956                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1957                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1958                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1959                                            srcnode, -srcdsum,
1960                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1961                                            dstnode, dstdsum));
1962
1963                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1964                             (dstdsum < srcdsum) &&                      \
1965                             ((mindstnode == -1) ||                              \
1966                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1967
1968                                 minip = tmp_ip;
1969                                 minsrcimbl = srcimbl;
1970                                 mindstnode = dstnode;
1971                                 mindstimbl = dstimbl;
1972                         }
1973                 }
1974         }
1975         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1976
1977         if (mindstnode != -1) {
1978                 /* We found a move that makes things better... */
1979                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1980                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1981                                   ctdb_addr_to_str(&(minip->addr)),
1982                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1983
1984
1985                 lcp2_imbalances[srcnode] = minsrcimbl;
1986                 lcp2_imbalances[mindstnode] = mindstimbl;
1987                 minip->pnn = mindstnode;
1988
1989                 return true;
1990         }
1991
1992         return false;
1993         
1994 }
1995
/* A node number paired with its LCP2 imbalance, used for sorting. */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.
 *
 * Orders by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	/* Each comparison yields 0 or 1, so the difference is
	 * exactly -1, 0 or 1.
	 */
	return (rhs->imbalance > lhs->imbalance) -
	       (lhs->imbalance > rhs->imbalance);
}
2014
2015 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2016  * node with the highest LCP2 imbalance, and then determines the best
2017  * IP/destination node combination to move from the source node.
2018  */
2019 static void lcp2_failback(struct ctdb_context *ctdb,
2020                           struct ctdb_ipflags *ipflags,
2021                           struct ctdb_public_ip_list *all_ips,
2022                           uint32_t *lcp2_imbalances,
2023                           bool *rebalance_candidates)
2024 {
2025         int i, numnodes;
2026         struct lcp2_imbalance_pnn * lips;
2027         bool again;
2028
2029         numnodes = talloc_array_length(ipflags);
2030
2031 try_again:
2032         /* Put the imbalances and nodes into an array, sort them and
2033          * iterate through candidates.  Usually the 1st one will be
2034          * used, so this doesn't cost much...
2035          */
2036         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2037         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2038         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2039         for (i=0; i<numnodes; i++) {
2040                 lips[i].imbalance = lcp2_imbalances[i];
2041                 lips[i].pnn = i;
2042                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2043         }
2044         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2045               lcp2_cmp_imbalance_pnn);
2046
2047         again = false;
2048         for (i=0; i<numnodes; i++) {
2049                 /* This means that all nodes had 0 or 1 addresses, so
2050                  * can't be imbalanced.
2051                  */
2052                 if (lips[i].imbalance == 0) {
2053                         break;
2054                 }
2055
2056                 if (lcp2_failback_candidate(ctdb,
2057                                             ipflags,
2058                                             all_ips,
2059                                             lips[i].pnn,
2060                                             lcp2_imbalances,
2061                                             rebalance_candidates)) {
2062                         again = true;
2063                         break;
2064                 }
2065         }
2066
2067         talloc_free(lips);
2068         if (again) {
2069                 goto try_again;
2070         }
2071 }
2072
2073 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2074                                     struct ctdb_ipflags *ipflags,
2075                                     struct ctdb_public_ip_list *all_ips)
2076 {
2077         struct ctdb_public_ip_list *tmp_ip;
2078
2079         /* verify that the assigned nodes can serve that public ip
2080            and set it to -1 if not
2081         */
2082         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2083                 if (tmp_ip->pnn == -1) {
2084                         continue;
2085                 }
2086                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2087                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2088                         /* this node can not serve this ip. */
2089                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2090                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2091                                            tmp_ip->pnn));
2092                         tmp_ip->pnn = -1;
2093                 }
2094         }
2095 }
2096
2097 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2098                                        struct ctdb_ipflags *ipflags,
2099                                        struct ctdb_public_ip_list *all_ips)
2100 {
2101         struct ctdb_public_ip_list *tmp_ip;
2102         int i, numnodes;
2103
2104         numnodes = talloc_array_length(ipflags);
2105
2106         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2107        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2108         *  always be allocated the same way for a specific set of
2109         *  available/unavailable nodes.
2110         */
2111
2112         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2113                 tmp_ip->pnn = i % numnodes;
2114         }
2115
2116         /* IP failback doesn't make sense with deterministic
2117          * IPs, since the modulo step above implicitly fails
2118          * back IPs to their "home" node.
2119          */
2120         if (1 == ctdb->tunable.no_ip_failback) {
2121                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2122         }
2123
2124         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2125
2126         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2127
2128         /* No failback here! */
2129 }
2130
2131 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2132                                           struct ctdb_ipflags *ipflags,
2133                                           struct ctdb_public_ip_list *all_ips)
2134 {
2135         /* This should be pushed down into basic_failback. */
2136         struct ctdb_public_ip_list *tmp_ip;
2137         int num_ips = 0;
2138         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2139                 num_ips++;
2140         }
2141
2142         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2143
2144         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2145
2146         /* If we don't want IPs to fail back then don't rebalance IPs. */
2147         if (1 == ctdb->tunable.no_ip_failback) {
2148                 return;
2149         }
2150
2151         /* Now, try to make sure the ip adresses are evenly distributed
2152            across the nodes.
2153         */
2154         basic_failback(ctdb, ipflags, all_ips, num_ips);
2155 }
2156
2157 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2158                           struct ctdb_ipflags *ipflags,
2159                           struct ctdb_public_ip_list *all_ips,
2160                           uint32_t *force_rebalance_nodes)
2161 {
2162         uint32_t *lcp2_imbalances;
2163         bool *rebalance_candidates;
2164         int numnodes, num_rebalance_candidates, i;
2165
2166         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2167
2168         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2169
2170         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2171                   &lcp2_imbalances, &rebalance_candidates);
2172
2173         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2174
2175         /* If we don't want IPs to fail back then don't rebalance IPs. */
2176         if (1 == ctdb->tunable.no_ip_failback) {
2177                 goto finished;
2178         }
2179
2180         /* It is only worth continuing if we have suitable target
2181          * nodes to transfer IPs to.  This check is much cheaper than
2182          * continuing on...
2183          */
2184         numnodes = talloc_array_length(ipflags);
2185         num_rebalance_candidates = 0;
2186         for (i=0; i<numnodes; i++) {
2187                 if (rebalance_candidates[i]) {
2188                         num_rebalance_candidates++;
2189                 }
2190         }
2191         if (num_rebalance_candidates == 0) {
2192                 goto finished;
2193         }
2194
2195         /* Now, try to make sure the ip adresses are evenly distributed
2196            across the nodes.
2197         */
2198         lcp2_failback(ctdb, ipflags, all_ips,
2199                       lcp2_imbalances, rebalance_candidates);
2200
2201 finished:
2202         talloc_free(tmp_ctx);
2203 }
2204
2205 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2206 {
2207         int i, num_healthy;
2208
2209         /* Count how many completely healthy nodes we have */
2210         num_healthy = 0;
2211         for (i=0;i<nodemap->num;i++) {
2212                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2213                         num_healthy++;
2214                 }
2215         }
2216
2217         return num_healthy == 0;
2218 }
2219
2220 /* The calculation part of the IP allocation algorithm. */
2221 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2222                                    struct ctdb_ipflags *ipflags,
2223                                    struct ctdb_public_ip_list **all_ips_p,
2224                                    uint32_t *force_rebalance_nodes)
2225 {
2226         /* since nodes only know about those public addresses that
2227            can be served by that particular node, no single node has
2228            a full list of all public addresses that exist in the cluster.
2229            Walk over all node structures and create a merged list of
2230            all public addresses that exist in the cluster.
2231
2232            keep the tree of ips around as ctdb->ip_tree
2233         */
2234         *all_ips_p = create_merged_ip_list(ctdb);
2235
2236         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2237                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2238         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2239                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2240         } else {
2241                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2242         }
2243
2244         /* at this point ->pnn is the node which will own each IP
2245            or -1 if there is no node that can cover this ip
2246         */
2247
2248         return;
2249 }
2250
2251 struct get_tunable_callback_data {
2252         const char *tunable;
2253         uint32_t *out;
2254         bool fatal;
2255 };
2256
2257 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2258                                  int32_t res, TDB_DATA outdata,
2259                                  void *callback)
2260 {
2261         struct get_tunable_callback_data *cd =
2262                 (struct get_tunable_callback_data *)callback;
2263         int size;
2264
2265         if (res != 0) {
2266                 /* Already handled in fail callback */
2267                 return;
2268         }
2269
2270         if (outdata.dsize != sizeof(uint32_t)) {
2271                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2272                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2273                                  (int)outdata.dsize));
2274                 cd->fatal = true;
2275                 return;
2276         }
2277
2278         size = talloc_array_length(cd->out);
2279         if (pnn >= size) {
2280                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2281                                  cd->tunable, pnn, size));
2282                 return;
2283         }
2284
2285                 
2286         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2287 }
2288
2289 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2290                                        int32_t res, TDB_DATA outdata,
2291                                        void *callback)
2292 {
2293         struct get_tunable_callback_data *cd =
2294                 (struct get_tunable_callback_data *)callback;
2295
2296         switch (res) {
2297         case -ETIME:
2298                 DEBUG(DEBUG_ERR,
2299                       ("Timed out getting tunable \"%s\" from node %d\n",
2300                        cd->tunable, pnn));
2301                 cd->fatal = true;
2302                 break;
2303         case -EINVAL:
2304         case -1:
2305                 DEBUG(DEBUG_WARNING,
2306                       ("Tunable \"%s\" not implemented on node %d\n",
2307                        cd->tunable, pnn));
2308                 break;
2309         default:
2310                 DEBUG(DEBUG_ERR,
2311                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2312                        cd->tunable, pnn));
2313                 cd->fatal = true;
2314         }
2315 }
2316
2317 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2318                                         TALLOC_CTX *tmp_ctx,
2319                                         struct ctdb_node_map *nodemap,
2320                                         const char *tunable,
2321                                         uint32_t default_value)
2322 {
2323         TDB_DATA data;
2324         struct ctdb_control_get_tunable *t;
2325         uint32_t *nodes;
2326         uint32_t *tvals;
2327         struct get_tunable_callback_data callback_data;
2328         int i;
2329
2330         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2331         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2332         for (i=0; i<nodemap->num; i++) {
2333                 tvals[i] = default_value;
2334         }
2335                 
2336         callback_data.out = tvals;
2337         callback_data.tunable = tunable;
2338         callback_data.fatal = false;
2339
2340         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2341         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2342         t = (struct ctdb_control_get_tunable *)data.dptr;
2343         t->length = strlen(tunable)+1;
2344         memcpy(t->name, tunable, t->length);
2345         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2346         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2347                                       nodes, 0, TAKEOVER_TIMEOUT(),
2348                                       false, data,
2349                                       get_tunable_callback,
2350                                       get_tunable_fail_callback,
2351                                       &callback_data) != 0) {
2352                 if (callback_data.fatal) {
2353                         talloc_free(tvals);
2354                         tvals = NULL;
2355                 }
2356         }
2357         talloc_free(nodes);
2358         talloc_free(data.dptr);
2359
2360         return tvals;
2361 }
2362
/* State shared between get_runstate_from_nodes() and its callbacks */
struct get_runstate_callback_data {
        /* Result array; replies are stored at index == replying PNN */
        enum ctdb_runstate *out;
        /* Set by a callback when the result as a whole must be discarded */
        bool fatal;
};
2367
2368 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2369                                   int32_t res, TDB_DATA outdata,
2370                                   void *callback_data)
2371 {
2372         struct get_runstate_callback_data *cd =
2373                 (struct get_runstate_callback_data *)callback_data;
2374         int size;
2375
2376         if (res != 0) {
2377                 /* Already handled in fail callback */
2378                 return;
2379         }
2380
2381         if (outdata.dsize != sizeof(uint32_t)) {
2382                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2383                                  pnn, (int)sizeof(uint32_t),
2384                                  (int)outdata.dsize));
2385                 cd->fatal = true;
2386                 return;
2387         }
2388
2389         size = talloc_array_length(cd->out);
2390         if (pnn >= size) {
2391                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2392                                  pnn, size));
2393                 return;
2394         }
2395
2396         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2397 }
2398
2399 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2400                                        int32_t res, TDB_DATA outdata,
2401                                        void *callback)
2402 {
2403         struct get_runstate_callback_data *cd =
2404                 (struct get_runstate_callback_data *)callback;
2405
2406         switch (res) {
2407         case -ETIME:
2408                 DEBUG(DEBUG_ERR,
2409                       ("Timed out getting runstate from node %d\n", pnn));
2410                 cd->fatal = true;
2411                 break;
2412         default:
2413                 DEBUG(DEBUG_WARNING,
2414                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2415                        pnn));
2416         }
2417 }
2418
2419 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2420                                                     TALLOC_CTX *tmp_ctx,
2421                                                     struct ctdb_node_map *nodemap,
2422                                                     enum ctdb_runstate default_value)
2423 {
2424         uint32_t *nodes;
2425         enum ctdb_runstate *rs;
2426         struct get_runstate_callback_data callback_data;
2427         int i;
2428
2429         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2430         CTDB_NO_MEMORY_NULL(ctdb, rs);
2431         for (i=0; i<nodemap->num; i++) {
2432                 rs[i] = default_value;
2433         }
2434
2435         callback_data.out = rs;
2436         callback_data.fatal = false;
2437
2438         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2439         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2440                                       nodes, 0, TAKEOVER_TIMEOUT(),
2441                                       true, tdb_null,
2442                                       get_runstate_callback,
2443                                       get_runstate_fail_callback,
2444                                       &callback_data) != 0) {
2445                 if (callback_data.fatal) {
2446                         free(rs);
2447                         rs = NULL;
2448                 }
2449         }
2450         talloc_free(nodes);
2451
2452         return rs;
2453 }
2454
2455 /* Set internal flags for IP allocation:
2456  *   Clear ip flags
2457  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2458  *   Set NOIPHOST ip flag for each INACTIVE node
2459  *   if all nodes are disabled:
2460  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2461  *   else
2462  *     Set NOIPHOST ip flags for disabled nodes
2463  */
2464 static struct ctdb_ipflags *
2465 set_ipflags_internal(struct ctdb_context *ctdb,
2466                      TALLOC_CTX *tmp_ctx,
2467                      struct ctdb_node_map *nodemap,
2468                      uint32_t *tval_noiptakeover,
2469                      uint32_t *tval_noiphostonalldisabled,
2470                      enum ctdb_runstate *runstate)
2471 {
2472         int i;
2473         struct ctdb_ipflags *ipflags;
2474
2475         /* Clear IP flags - implicit due to talloc_zero */
2476         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2477         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2478
2479         for (i=0;i<nodemap->num;i++) {
2480                 /* Can not take IPs on node with NoIPTakeover set */
2481                 if (tval_noiptakeover[i] != 0) {
2482                         ipflags[i].noiptakeover = true;
2483                 }
2484
2485                 /* Can not host IPs on node not in RUNNING state */
2486                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2487                         ipflags[i].noiphost = true;
2488                         continue;
2489                 }
2490                 /* Can not host IPs on INACTIVE node */
2491                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2492                         ipflags[i].noiphost = true;
2493                 }
2494         }
2495
2496         if (all_nodes_are_disabled(nodemap)) {
2497                 /* If all nodes are disabled, can not host IPs on node
2498                  * with NoIPHostOnAllDisabled set
2499                  */
2500                 for (i=0;i<nodemap->num;i++) {
2501                         if (tval_noiphostonalldisabled[i] != 0) {
2502                                 ipflags[i].noiphost = true;
2503                         }
2504                 }
2505         } else {
2506                 /* If some nodes are not disabled, then can not host
2507                  * IPs on DISABLED node
2508                  */
2509                 for (i=0;i<nodemap->num;i++) {
2510                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2511                                 ipflags[i].noiphost = true;
2512                         }
2513                 }
2514         }
2515
2516         return ipflags;
2517 }
2518
2519 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2520                                         TALLOC_CTX *tmp_ctx,
2521                                         struct ctdb_node_map *nodemap)
2522 {
2523         uint32_t *tval_noiptakeover;
2524         uint32_t *tval_noiphostonalldisabled;
2525         struct ctdb_ipflags *ipflags;
2526         enum ctdb_runstate *runstate;
2527
2528
2529         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2530                                                    "NoIPTakeover", 0);
2531         if (tval_noiptakeover == NULL) {
2532                 return NULL;
2533         }
2534
2535         tval_noiphostonalldisabled =
2536                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2537                                        "NoIPHostOnAllDisabled", 0);
2538         if (tval_noiphostonalldisabled == NULL) {
2539                 /* Caller frees tmp_ctx */
2540                 return NULL;
2541         }
2542
2543         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2544          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2545          * reasonable behaviour on a mixed cluster during upgrade.
2546          */
2547         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2548                                            CTDB_RUNSTATE_RUNNING);
2549         if (runstate == NULL) {
2550                 /* Caller frees tmp_ctx */
2551                 return NULL;
2552         }
2553
2554         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2555                                        tval_noiptakeover,
2556                                        tval_noiphostonalldisabled,
2557                                        runstate);
2558
2559         talloc_free(tval_noiptakeover);
2560         talloc_free(tval_noiphostonalldisabled);
2561         talloc_free(runstate);
2562
2563         return ipflags;
2564 }
2565
2566 struct iprealloc_callback_data {
2567         bool *retry_nodes;
2568         int retry_count;
2569         client_async_callback fail_callback;
2570         void *fail_callback_data;
2571         struct ctdb_node_map *nodemap;
2572 };
2573
2574 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2575                                         int32_t res, TDB_DATA outdata,
2576                                         void *callback)
2577 {
2578         int numnodes;
2579         struct iprealloc_callback_data *cd =
2580                 (struct iprealloc_callback_data *)callback;
2581
2582         numnodes = talloc_array_length(cd->retry_nodes);
2583         if (pnn > numnodes) {
2584                 DEBUG(DEBUG_ERR,
2585                       ("ipreallocated failure from node %d, "
2586                        "but only %d nodes in nodemap\n",
2587                        pnn, numnodes));
2588                 return;
2589         }
2590
2591         /* Can't run the "ipreallocated" event on a INACTIVE node */
2592         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2593                 DEBUG(DEBUG_WARNING,
2594                       ("ipreallocated failed on inactive node %d, ignoring\n",
2595                        pnn));
2596                 return;
2597         }
2598
2599         switch (res) {
2600         case -ETIME:
2601                 /* If the control timed out then that's a real error,
2602                  * so call the real fail callback
2603                  */
2604                 if (cd->fail_callback) {
2605                         cd->fail_callback(ctdb, pnn, res, outdata,
2606                                           cd->fail_callback_data);
2607                 } else {
2608                         DEBUG(DEBUG_WARNING,
2609                               ("iprealloc timed out but no callback registered\n"));
2610                 }
2611                 break;
2612         default:
2613                 /* If not a timeout then either the ipreallocated
2614                  * eventscript (or some setup) failed.  This might
2615                  * have failed because the IPREALLOCATED control isn't
2616                  * implemented - right now there is no way of knowing
2617                  * because the error codes are all folded down to -1.
2618                  * Consider retrying using EVENTSCRIPT control...
2619                  */
2620                 DEBUG(DEBUG_WARNING,
2621                       ("ipreallocated failure from node %d, flagging retry\n",
2622                        pnn));
2623                 cd->retry_nodes[pnn] = true;
2624                 cd->retry_count++;
2625         }
2626 }
2627
2628 struct takeover_callback_data {
2629         bool *node_failed;
2630         client_async_callback fail_callback;
2631         void *fail_callback_data;
2632         struct ctdb_node_map *nodemap;
2633 };
2634
2635 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2636                                        uint32_t node_pnn, int32_t res,
2637                                        TDB_DATA outdata, void *callback_data)
2638 {
2639         struct takeover_callback_data *cd =
2640                 talloc_get_type_abort(callback_data,
2641                                       struct takeover_callback_data);
2642         int i;
2643
2644         for (i = 0; i < cd->nodemap->num; i++) {
2645                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2646                         break;
2647                 }
2648         }
2649
2650         if (i == cd->nodemap->num) {
2651                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2652                 return;
2653         }
2654
2655         if (!cd->node_failed[i]) {
2656                 cd->node_failed[i] = true;
2657                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2658                                   cd->fail_callback_data);
2659         }
2660 }
2661
2662 /*
2663   make any IP alias changes for public addresses that are necessary 
2664  */
2665 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2666                       uint32_t *force_rebalance_nodes,
2667                       client_async_callback fail_callback, void *callback_data)
2668 {
2669         int i, j, ret;
2670         struct ctdb_public_ip ip;
2671         struct ctdb_public_ipv4 ipv4;
2672         uint32_t *nodes;
2673         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2674         TDB_DATA data;
2675         struct timeval timeout;
2676         struct client_async_data *async_data;
2677         struct ctdb_client_control_state *state;
2678         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2679         struct ctdb_ipflags *ipflags;
2680         struct takeover_callback_data *takeover_data;
2681         struct iprealloc_callback_data iprealloc_data;
2682         bool *retry_data;
2683
2684         /*
2685          * ip failover is completely disabled, just send out the 
2686          * ipreallocated event.
2687          */
2688         if (ctdb->tunable.disable_ip_failover != 0) {
2689                 goto ipreallocated;
2690         }
2691
2692         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2693         if (ipflags == NULL) {
2694                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2695                 talloc_free(tmp_ctx);
2696                 return -1;
2697         }
2698
2699         ZERO_STRUCT(ip);
2700
2701         /* Do the IP reassignment calculations */
2702         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2703
2704         /* Now tell all nodes to release any public IPs should not
2705          * host.  This will be a NOOP on nodes that don't currently
2706          * hold the given IP.
2707          */
2708         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2709         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2710
2711         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2712                                                        bool, nodemap->num);
2713         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2714         takeover_data->fail_callback = fail_callback;
2715         takeover_data->fail_callback_data = callback_data;
2716         takeover_data->nodemap = nodemap;
2717
2718         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2719         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2720
2721         async_data->fail_callback = takeover_run_fail_callback;
2722         async_data->callback_data = takeover_data;
2723
2724         for (i=0;i<nodemap->num;i++) {
2725                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2726                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2727                         continue;
2728                 }
2729
2730                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2731                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2732                                 /* This node should be serving this
2733                                    vnn so dont tell it to release the ip
2734                                 */
2735                                 continue;
2736                         }
2737                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2738                                 ipv4.pnn = tmp_ip->pnn;
2739                                 ipv4.sin = tmp_ip->addr.ip;
2740
2741                                 timeout = TAKEOVER_TIMEOUT();
2742                                 data.dsize = sizeof(ipv4);
2743                                 data.dptr  = (uint8_t *)&ipv4;
2744                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2745                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2746                                                 data, async_data,
2747                                                 &timeout, NULL);
2748                         } else {
2749                                 ip.pnn  = tmp_ip->pnn;
2750                                 ip.addr = tmp_ip->addr;
2751
2752                                 timeout = TAKEOVER_TIMEOUT();
2753                                 data.dsize = sizeof(ip);
2754                                 data.dptr  = (uint8_t *)&ip;
2755                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2756                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2757                                                 data, async_data,
2758                                                 &timeout, NULL);
2759                         }
2760
2761                         if (state == NULL) {
2762                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2763                                 talloc_free(tmp_ctx);
2764                                 return -1;
2765                         }
2766                 
2767                         ctdb_client_async_add(async_data, state);
2768                 }
2769         }
2770         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2771                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2772                 talloc_free(tmp_ctx);
2773                 return -1;
2774         }
2775         talloc_free(async_data);
2776
2777
2778         /* tell all nodes to get their own IPs */
2779         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2780         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2781
2782         async_data->fail_callback = fail_callback;
2783         async_data->callback_data = callback_data;
2784
2785         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2786                 if (tmp_ip->pnn == -1) {
2787                         /* this IP won't be taken over */
2788                         continue;
2789                 }
2790
2791                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2792                         ipv4.pnn = tmp_ip->pnn;
2793                         ipv4.sin = tmp_ip->addr.ip;
2794
2795                         timeout = TAKEOVER_TIMEOUT();
2796                         data.dsize = sizeof(ipv4);
2797                         data.dptr  = (uint8_t *)&ipv4;
2798                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2799                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2800                                         data, async_data,
2801                                         &timeout, NULL);
2802                 } else {
2803                         ip.pnn  = tmp_ip->pnn;
2804                         ip.addr = tmp_ip->addr;
2805
2806                         timeout = TAKEOVER_TIMEOUT();
2807                         data.dsize = sizeof(ip);
2808                         data.dptr  = (uint8_t *)&ip;
2809                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2810                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2811                                         data, async_data,
2812                                         &timeout, NULL);
2813                 }
2814                 if (state == NULL) {
2815                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2816                         talloc_free(tmp_ctx);
2817                         return -1;
2818                 }
2819                 
2820                 ctdb_client_async_add(async_data, state);
2821         }
2822         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2823                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2824                 talloc_free(tmp_ctx);
2825                 return -1;
2826         }
2827
2828 ipreallocated:
2829         /* 
2830          * Tell all nodes to run eventscripts to process the
2831          * "ipreallocated" event.  This can do a lot of things,
2832          * including restarting services to reconfigure them if public
2833          * IPs have moved.  Once upon a time this event only used to
2834          * update natwg.
2835          */
2836         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2837         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2838         iprealloc_data.retry_nodes = retry_data;
2839         iprealloc_data.retry_count = 0;
2840         iprealloc_data.fail_callback = fail_callback;
2841         iprealloc_data.fail_callback_data = callback_data;
2842         iprealloc_data.nodemap = nodemap;
2843
2844         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2845         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2846                                         nodes, 0, TAKEOVER_TIMEOUT(),
2847                                         false, tdb_null,
2848                                         NULL, iprealloc_fail_callback,
2849                                         &iprealloc_data);
2850         if (ret != 0) {
2851                 /* If the control failed then we should retry to any
2852                  * nodes flagged by iprealloc_fail_callback using the
2853                  * EVENTSCRIPT control.  This is a best-effort at
2854                  * backward compatiblity when running a mixed cluster
2855                  * where some nodes have not yet been upgraded to
2856                  * support the IPREALLOCATED control.
2857                  */
2858                 DEBUG(DEBUG_WARNING,
2859                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2860
2861                 nodes = talloc_array(tmp_ctx, uint32_t,
2862                                      iprealloc_data.retry_count);
2863                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2864
2865                 j = 0;
2866                 for (i=0; i<nodemap->num; i++) {
2867                         if (iprealloc_data.retry_nodes[i]) {
2868                                 nodes[j] = i;
2869                                 j++;
2870                         }
2871                 }
2872
2873                 data.dptr  = discard_const("ipreallocated");
2874                 data.dsize = strlen((char *)data.dptr) + 1; 
2875                 ret = ctdb_client_async_control(ctdb,
2876                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2877                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2878                                                 false, data,
2879                                                 NULL, fail_callback,
2880                                                 callback_data);
2881                 if (ret != 0) {
2882                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2883                 }
2884         }
2885
2886         talloc_free(tmp_ctx);
2887         return ret;
2888 }
2889
2890
2891 /*
2892   destroy a ctdb_client_ip structure
2893  */
2894 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2895 {
2896         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2897                 ctdb_addr_to_str(&ip->addr),
2898                 ntohs(ip->addr.ip.sin_port),
2899                 ip->client_id));
2900
2901         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2902         return 0;
2903 }
2904
2905 /*
2906   called by a client to inform us of a TCP connection that it is managing
2907   that should tickled with an ACK when IP takeover is done
2908   we handle both the old ipv4 style of packets as well as the new ipv4/6
2909   pdus.
2910  */
2911 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2912                                 TDB_DATA indata)
2913 {
2914         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2915         struct ctdb_control_tcp *old_addr = NULL;
2916         struct ctdb_control_tcp_addr new_addr;
2917         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2918         struct ctdb_tcp_list *tcp;
2919         struct ctdb_tcp_connection t;
2920         int ret;
2921         TDB_DATA data;
2922         struct ctdb_client_ip *ip;
2923         struct ctdb_vnn *vnn;
2924         ctdb_sock_addr addr;
2925
2926         /* If we don't have public IPs, tickles are useless */
2927         if (ctdb->vnn == NULL) {
2928                 return 0;
2929         }
2930
2931         switch (indata.dsize) {
2932         case sizeof(struct ctdb_control_tcp):
2933                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2934                 ZERO_STRUCT(new_addr);
2935                 tcp_sock = &new_addr;
2936                 tcp_sock->src.ip  = old_addr->src;
2937                 tcp_sock->dest.ip = old_addr->dest;
2938                 break;
2939         case sizeof(struct ctdb_control_tcp_addr):
2940                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2941                 break;
2942         default:
2943                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2944                                  "to ctdb_control_tcp_client. size was %d but "
2945                                  "only allowed sizes are %lu and %lu\n",
2946                                  (int)indata.dsize,
2947                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2948                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2949                 return -1;
2950         }
2951
2952         addr = tcp_sock->src;
2953         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2954         addr = tcp_sock->dest;
2955         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2956
2957         ZERO_STRUCT(addr);
2958         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2959         vnn = find_public_ip_vnn(ctdb, &addr);
2960         if (vnn == NULL) {
2961                 switch (addr.sa.sa_family) {
2962                 case AF_INET:
2963                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2964                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2965                                         ctdb_addr_to_str(&addr)));
2966                         }
2967                         break;
2968                 case AF_INET6:
2969                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2970                                 ctdb_addr_to_str(&addr)));
2971                         break;
2972                 default:
2973                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2974                 }
2975
2976                 return 0;
2977         }
2978
2979         if (vnn->pnn != ctdb->pnn) {
2980                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2981                         ctdb_addr_to_str(&addr),
2982                         client_id, client->pid));
2983                 /* failing this call will tell smbd to die */
2984                 return -1;
2985         }
2986
2987         ip = talloc(client, struct ctdb_client_ip);
2988         CTDB_NO_MEMORY(ctdb, ip);
2989
2990         ip->ctdb      = ctdb;
2991         ip->addr      = addr;
2992         ip->client_id = client_id;
2993         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2994         DLIST_ADD(ctdb->client_ip_list, ip);
2995
2996         tcp = talloc(client, struct ctdb_tcp_list);
2997         CTDB_NO_MEMORY(ctdb, tcp);
2998
2999         tcp->connection.src_addr = tcp_sock->src;
3000         tcp->connection.dst_addr = tcp_sock->dest;
3001
3002         DLIST_ADD(client->tcp_list, tcp);
3003
3004         t.src_addr = tcp_sock->src;
3005         t.dst_addr = tcp_sock->dest;
3006
3007         data.dptr = (uint8_t *)&t;
3008         data.dsize = sizeof(t);
3009
3010         switch (addr.sa.sa_family) {
3011         case AF_INET:
3012                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3013                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
3014                         ctdb_addr_to_str(&tcp_sock->src),
3015                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
3016                 break;
3017         case AF_INET6:
3018                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3019                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
3020                         ctdb_addr_to_str(&tcp_sock->src),
3021                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
3022                 break;
3023         default:
3024                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3025         }
3026
3027
3028         /* tell all nodes about this tcp connection */
3029         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3030                                        CTDB_CONTROL_TCP_ADD,
3031                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3032         if (ret != 0) {
3033                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3034                 return -1;
3035         }
3036
3037         return 0;
3038 }
3039
3040 /*
3041   find a tcp address on a list
3042  */
3043 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3044                                            struct ctdb_tcp_connection *tcp)
3045 {
3046         int i;
3047
3048         if (array == NULL) {
3049                 return NULL;
3050         }
3051
3052         for (i=0;i<array->num;i++) {
3053                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3054                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3055                         return &array->connections[i];
3056                 }
3057         }
3058         return NULL;
3059 }
3060
3061
3062
3063 /*
3064   called by a daemon to inform us of a TCP connection that one of its
3065   clients is managing and that should be tickled with an ACK when IP
3066   takeover is done
3067  */
3068 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3069 {
3070         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3071         struct ctdb_tcp_array *tcparray;
3072         struct ctdb_tcp_connection tcp;
3073         struct ctdb_vnn *vnn;
3074
3075         /* If we don't have public IPs, tickles are useless */
3076         if (ctdb->vnn == NULL) {
3077                 return 0;
3078         }
3079
3080         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3081         if (vnn == NULL) {
3082                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3083                         ctdb_addr_to_str(&p->dst_addr)));
3084
3085                 return -1;
3086         }
3087
3088
3089         tcparray = vnn->tcp_array;
3090
3091         /* If this is the first tickle */
3092         if (tcparray == NULL) {
3093                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3094                 CTDB_NO_MEMORY(ctdb, tcparray);
3095                 vnn->tcp_array = tcparray;
3096
3097                 tcparray->num = 0;
3098                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3099                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3100
3101                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3102                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3103                 tcparray->num++;
3104
3105                 if (tcp_update_needed) {
3106                         vnn->tcp_update_needed = true;
3107                 }
3108                 return 0;
3109         }
3110
3111
3112         /* Do we already have this tickle ?*/
3113         tcp.src_addr = p->src_addr;
3114         tcp.dst_addr = p->dst_addr;
3115         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3116                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3117                         ctdb_addr_to_str(&tcp.dst_addr),
3118                         ntohs(tcp.dst_addr.ip.sin_port),
3119                         vnn->pnn));
3120                 return 0;
3121         }
3122
3123         /* A new tickle, we must add it to the array */
3124         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3125                                         struct ctdb_tcp_connection,
3126                                         tcparray->num+1);
3127         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3128
3129         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3130         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3131         tcparray->num++;
3132
3133         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3134                 ctdb_addr_to_str(&tcp.dst_addr),
3135                 ntohs(tcp.dst_addr.ip.sin_port),
3136                 vnn->pnn));
3137
3138         if (tcp_update_needed) {
3139                 vnn->tcp_update_needed = true;
3140         }
3141
3142         return 0;
3143 }
3144
3145
3146 /*
3147   called by a daemon to inform us of a TCP connection that one of its
3148   clients managing that should tickled with an ACK when IP takeover is
3149   done
3150  */
3151 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3152 {
3153         struct ctdb_tcp_connection *tcpp;
3154         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3155
3156         if (vnn == NULL) {
3157                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3158                         ctdb_addr_to_str(&conn->dst_addr)));
3159                 return;
3160         }
3161
3162         /* if the array is empty we cant remove it
3163            and we dont need to do anything
3164          */
3165         if (vnn->tcp_array == NULL) {
3166                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3167                         ctdb_addr_to_str(&conn->dst_addr),
3168                         ntohs(conn->dst_addr.ip.sin_port)));
3169                 return;
3170         }
3171
3172
3173         /* See if we know this connection
3174            if we dont know this connection  then we dont need to do anything
3175          */
3176         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3177         if (tcpp == NULL) {
3178                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3179                         ctdb_addr_to_str(&conn->dst_addr),
3180                         ntohs(conn->dst_addr.ip.sin_port)));
3181                 return;
3182         }
3183
3184
3185         /* We need to remove this entry from the array.
3186            Instead of allocating a new array and copying data to it
3187            we cheat and just copy the last entry in the existing array
3188            to the entry that is to be removed and just shring the 
3189            ->num field
3190          */
3191         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3192         vnn->tcp_array->num--;
3193
3194         /* If we deleted the last entry we also need to remove the entire array
3195          */
3196         if (vnn->tcp_array->num == 0) {
3197                 talloc_free(vnn->tcp_array);
3198                 vnn->tcp_array = NULL;
3199         }               
3200
3201         vnn->tcp_update_needed = true;
3202
3203         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3204                 ctdb_addr_to_str(&conn->src_addr),
3205                 ntohs(conn->src_addr.ip.sin_port)));
3206 }
3207
3208
3209 /*
3210   called by a daemon to inform us of a TCP connection that one of its
3211   clients used are no longer needed in the tickle database
3212  */
3213 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3214 {
3215         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3216
3217         /* If we don't have public IPs, tickles are useless */
3218         if (ctdb->vnn == NULL) {
3219                 return 0;
3220         }
3221
3222         ctdb_remove_tcp_connection(ctdb, conn);
3223
3224         return 0;
3225 }
3226
3227
3228 /*
3229   Called when another daemon starts - caises all tickles for all
3230   public addresses we are serving to be sent to the new node on the
3231   next check.  This actually causes the next scheduled call to
3232   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3233   doesn't require careful error handling.
3234  */
3235 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3236 {
3237         struct ctdb_vnn *vnn;
3238
3239         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3240                 vnn->tcp_update_needed = true;
3241         }
3242
3243         return 0;
3244 }
3245
3246
3247 /*
3248   called when a client structure goes away - hook to remove
3249   elements from the tcp_list in all daemons
3250  */
3251 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3252 {
3253         while (client->tcp_list) {
3254                 struct ctdb_tcp_list *tcp = client->tcp_list;
3255                 DLIST_REMOVE(client->tcp_list, tcp);
3256                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3257         }
3258 }
3259
3260
3261 /*
3262   release all IPs on shutdown
3263  */
3264 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3265 {
3266         struct ctdb_vnn *vnn;
3267         int count = 0;
3268
3269         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3270                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3271                         ctdb_vnn_unassign_iface(ctdb, vnn);
3272                         continue;
3273                 }
3274                 if (!vnn->iface) {
3275                         continue;
3276                 }
3277
3278                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3279                                     ctdb_addr_to_str(&vnn->public_address),
3280                                     vnn->public_netmask_bits,
3281                                     ctdb_vnn_iface_string(vnn)));
3282
3283                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3284                                   ctdb_vnn_iface_string(vnn),
3285                                   ctdb_addr_to_str(&vnn->public_address),
3286                                   vnn->public_netmask_bits);
3287                 release_kill_clients(ctdb, &vnn->public_address);
3288                 ctdb_vnn_unassign_iface(ctdb, vnn);
3289                 count++;
3290         }
3291
3292         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3293 }
3294
3295
3296 /*
3297   get list of public IPs
3298  */
3299 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3300                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3301 {
3302         int i, num, len;
3303         struct ctdb_all_public_ips *ips;
3304         struct ctdb_vnn *vnn;
3305         bool only_available = false;
3306
3307         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3308                 only_available = true;
3309         }
3310
3311         /* count how many public ip structures we have */
3312         num = 0;
3313         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3314                 num++;
3315         }
3316
3317         len = offsetof(struct ctdb_all_public_ips, ips) + 
3318                 num*sizeof(struct ctdb_public_ip);
3319         ips = talloc_zero_size(outdata, len);
3320         CTDB_NO_MEMORY(ctdb, ips);
3321
3322         i = 0;
3323         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3324                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3325                         continue;
3326                 }
3327                 ips->ips[i].pnn  = vnn->pnn;
3328                 ips->ips[i].addr = vnn->public_address;
3329                 i++;
3330         }
3331         ips->num = i;
3332         len = offsetof(struct ctdb_all_public_ips, ips) +
3333                 i*sizeof(struct ctdb_public_ip);
3334
3335         outdata->dsize = len;
3336         outdata->dptr  = (uint8_t *)ips;
3337
3338         return 0;
3339 }
3340
3341
3342 /*
3343   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3344  */
3345 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3346                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3347 {
3348         int i, num, len;
3349         struct ctdb_all_public_ipsv4 *ips;
3350         struct ctdb_vnn *vnn;
3351
3352         /* count how many public ip structures we have */
3353         num = 0;
3354         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3355                 if (vnn->public_address.sa.sa_family != AF_INET) {
3356                         continue;
3357                 }
3358                 num++;
3359         }
3360
3361         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3362                 num*sizeof(struct ctdb_public_ipv4);
3363         ips = talloc_zero_size(outdata, len);
3364         CTDB_NO_MEMORY(ctdb, ips);
3365
3366         outdata->dsize = len;
3367         outdata->dptr  = (uint8_t *)ips;
3368
3369         ips->num = num;
3370         i = 0;
3371         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3372                 if (vnn->public_address.sa.sa_family != AF_INET) {
3373                         continue;
3374                 }
3375                 ips->ips[i].pnn = vnn->pnn;
3376                 ips->ips[i].sin = vnn->public_address.ip;
3377                 i++;
3378         }
3379
3380         return 0;
3381 }
3382
3383 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3384                                         struct ctdb_req_control *c,
3385                                         TDB_DATA indata,
3386                                         TDB_DATA *outdata)
3387 {
3388         int i, num, len;
3389         ctdb_sock_addr *addr;
3390         struct ctdb_control_public_ip_info *info;
3391         struct ctdb_vnn *vnn;
3392
3393         addr = (ctdb_sock_addr *)indata.dptr;
3394
3395         vnn = find_public_ip_vnn(ctdb, addr);
3396         if (vnn == NULL) {
3397                 /* if it is not a public ip   it could be our 'single ip' */
3398                 if (ctdb->single_ip_vnn) {
3399                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3400                                 vnn = ctdb->single_ip_vnn;
3401                         }
3402                 }
3403         }
3404         if (vnn == NULL) {
3405                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3406                                  "'%s'not a public address\n",
3407                                  ctdb_addr_to_str(addr)));
3408                 return -1;
3409         }
3410
3411         /* count how many public ip structures we have */
3412         num = 0;
3413         for (;vnn->ifaces[num];) {
3414                 num++;
3415         }
3416
3417         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3418                 num*sizeof(struct ctdb_control_iface_info);
3419         info = talloc_zero_size(outdata, len);
3420         CTDB_NO_MEMORY(ctdb, info);
3421
3422         info->ip.addr = vnn->public_address;
3423         info->ip.pnn = vnn->pnn;
3424         info->active_idx = 0xFFFFFFFF;
3425
3426         for (i=0; vnn->ifaces[i]; i++) {
3427                 struct ctdb_iface *cur;
3428
3429                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3430                 if (cur == NULL) {
3431                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3432                                            vnn->ifaces[i]));
3433                         return -1;
3434                 }
3435                 if (vnn->iface == cur) {
3436                         info->active_idx = i;
3437                 }
3438                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3439                 info->ifaces[i].link_state = cur->link_up;
3440                 info->ifaces[i].references = cur->references;
3441         }
3442         info->num = i;
3443         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3444                 i*sizeof(struct ctdb_control_iface_info);
3445
3446         outdata->dsize = len;
3447         outdata->dptr  = (uint8_t *)info;
3448
3449         return 0;
3450 }
3451
3452 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3453                                 struct ctdb_req_control *c,
3454                                 TDB_DATA *outdata)
3455 {
3456         int i, num, len;
3457         struct ctdb_control_get_ifaces *ifaces;
3458         struct ctdb_iface *cur;
3459
3460         /* count how many public ip structures we have */
3461         num = 0;
3462         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3463                 num++;
3464         }
3465
3466         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3467                 num*sizeof(struct ctdb_control_iface_info);
3468         ifaces = talloc_zero_size(outdata, len);
3469         CTDB_NO_MEMORY(ctdb, ifaces);
3470
3471         i = 0;
3472         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3473                 strcpy(ifaces->ifaces[i].name, cur->name);
3474                 ifaces->ifaces[i].link_state = cur->link_up;
3475                 ifaces->ifaces[i].references = cur->references;
3476                 i++;
3477         }
3478         ifaces->num = i;
3479         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3480                 i*sizeof(struct ctdb_control_iface_info);
3481
3482         outdata->dsize = len;
3483         outdata->dptr  = (uint8_t *)ifaces;
3484
3485         return 0;
3486 }
3487
3488 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3489                                     struct ctdb_req_control *c,
3490                                     TDB_DATA indata)
3491 {
3492         struct ctdb_control_iface_info *info;
3493         struct ctdb_iface *iface;
3494         bool link_up = false;
3495
3496         info = (struct ctdb_control_iface_info *)indata.dptr;
3497
3498         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3499                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3500                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3501                                   len, len, info->name));
3502                 return -1;
3503         }
3504
3505         switch (info->link_state) {
3506         case 0:
3507                 link_up = false;
3508                 break;
3509         case 1:
3510                 link_up = true;
3511                 break;
3512         default:
3513                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3514                                   (unsigned int)info->link_state));
3515                 return -1;
3516         }
3517
3518         if (info->references != 0) {
3519                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3520                                   (unsigned int)info->references));
3521                 return -1;
3522         }
3523
3524         iface = ctdb_find_iface(ctdb, info->name);
3525         if (iface == NULL) {
3526                 return -1;
3527         }
3528
3529         if (link_up == iface->link_up) {
3530                 return 0;
3531         }
3532
3533         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3534               ("iface[%s] has changed it's link status %s => %s\n",
3535                iface->name,
3536                iface->link_up?"up":"down",
3537                link_up?"up":"down"));
3538
3539         iface->link_up = link_up;
3540         return 0;
3541 }
3542
3543
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill

   One instance is attached to a vnn (vnn->killtcp) while there are
   connections left to reset.
*/
struct ctdb_kill_tcp {
	/* the vnn (public address) this structure is attached to */
	struct ctdb_vnn *vnn;
	/* owning daemon context */
	struct ctdb_context *ctdb;
	/* capture socket from ctdb_sys_open_capture_socket(), -1 until opened */
	int capture_fd;
	/* read event on capture_fd, handled by capture_tcp_handler() */
	struct fd_event *fde;
	/* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
	trbt_tree_t *connections;
	/* opaque state filled in by ctdb_sys_open_capture_socket() and
	   handed back to ctdb_sys_read_tcp_packet() */
	void *private_data;
};
3556
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
	/* the two endpoints, stored after ctdb_canonicalize_ip() */
	ctdb_sock_addr src_addr;
	ctdb_sock_addr dst_addr;
	/* number of periodic tickle attempts so far; the connection is
	   given up once this reaches 5 */
	int count;
	/* back pointer to the structure this connection belongs to */
	struct ctdb_kill_tcp *killtcp;
};
3566
3567 /* this function is used to create a key to represent this socketpair
3568    in the killtcp tree.
3569    this key is used to insert and lookup matching socketpairs that are
3570    to be tickled and RST
3571 */
3572 #define KILLTCP_KEYLEN  10
3573 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3574 {
3575         static uint32_t key[KILLTCP_KEYLEN];
3576
3577         bzero(key, sizeof(key));
3578
3579         if (src->sa.sa_family != dst->sa.sa_family) {
3580                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3581                 return key;
3582         }
3583         
3584         switch (src->sa.sa_family) {
3585         case AF_INET:
3586                 key[0]  = dst->ip.sin_addr.s_addr;
3587                 key[1]  = src->ip.sin_addr.s_addr;
3588                 key[2]  = dst->ip.sin_port;
3589                 key[3]  = src->ip.sin_port;
3590                 break;
3591         case AF_INET6: {
3592                 uint32_t *dst6_addr32 =
3593                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3594                 uint32_t *src6_addr32 =
3595                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3596                 key[0]  = dst6_addr32[3];
3597                 key[1]  = src6_addr32[3];
3598                 key[2]  = dst6_addr32[2];
3599                 key[3]  = src6_addr32[2];
3600                 key[4]  = dst6_addr32[1];
3601                 key[5]  = src6_addr32[1];
3602                 key[6]  = dst6_addr32[0];
3603                 key[7]  = src6_addr32[0];
3604                 key[8]  = dst->ip6.sin6_port;
3605                 key[9]  = src->ip6.sin6_port;
3606                 break;
3607         }
3608         default:
3609                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3610                 return key;
3611         }
3612
3613         return key;
3614 }
3615
3616 /*
3617   called when we get a read event on the raw socket
3618  */
3619 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3620                                 uint16_t flags, void *private_data)
3621 {
3622         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3623         struct ctdb_killtcp_con *con;
3624         ctdb_sock_addr src, dst;
3625         uint32_t ack_seq, seq;
3626
3627         if (!(flags & EVENT_FD_READ)) {
3628                 return;
3629         }
3630
3631         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3632                                 killtcp->private_data,
3633                                 &src, &dst,
3634                                 &ack_seq, &seq) != 0) {
3635                 /* probably a non-tcp ACK packet */
3636                 return;
3637         }
3638
3639         /* check if we have this guy in our list of connections
3640            to kill
3641         */
3642         con = trbt_lookuparray32(killtcp->connections, 
3643                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3644         if (con == NULL) {
3645                 /* no this was some other packet we can just ignore */
3646                 return;
3647         }
3648
3649         /* This one has been tickled !
3650            now reset him and remove him from the list.
3651          */
3652         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3653                 ntohs(con->dst_addr.ip.sin_port),
3654                 ctdb_addr_to_str(&con->src_addr),
3655                 ntohs(con->src_addr.ip.sin_port)));
3656
3657         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3658         talloc_free(con);
3659 }
3660
3661
3662 /* when traversing the list of all tcp connections to send tickle acks to
3663    (so that we can capture the ack coming back and kill the connection
3664     by a RST)
3665    this callback is called for each connection we are currently trying to kill
3666 */
3667 static int tickle_connection_traverse(void *param, void *data)
3668 {
3669         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3670
3671         /* have tried too many times, just give up */
3672         if (con->count >= 5) {
3673                 /* can't delete in traverse: reparent to delete_cons */
3674                 talloc_steal(param, con);
3675                 return 0;
3676         }
3677
3678         /* othervise, try tickling it again */
3679         con->count++;
3680         ctdb_sys_send_tcp(
3681                 (ctdb_sock_addr *)&con->dst_addr,
3682                 (ctdb_sock_addr *)&con->src_addr,
3683                 0, 0, 0);
3684         return 0;
3685 }
3686
3687
3688 /* 
3689    called every second until all sentenced connections have been reset
3690  */
3691 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3692                                               struct timeval t, void *private_data)
3693 {
3694         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3695         void *delete_cons = talloc_new(NULL);
3696
3697         /* loop over all connections sending tickle ACKs */
3698         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3699
3700         /* now we've finished traverse, it's safe to do deletion. */
3701         talloc_free(delete_cons);
3702
3703         /* If there are no more connections to kill we can remove the
3704            entire killtcp structure
3705          */
3706         if ( (killtcp->connections == NULL) || 
3707              (killtcp->connections->root == NULL) ) {
3708                 talloc_free(killtcp);
3709                 return;
3710         }
3711
3712         /* try tickling them again in a seconds time
3713          */
3714         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3715                         ctdb_tickle_sentenced_connections, killtcp);
3716 }
3717
3718 /*
3719   destroy the killtcp structure
3720  */
3721 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3722 {
3723         struct ctdb_vnn *tmpvnn;
3724
3725         /* verify that this vnn is still active */
3726         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3727                 if (tmpvnn == killtcp->vnn) {
3728                         break;
3729                 }
3730         }
3731
3732         if (tmpvnn == NULL) {
3733                 return 0;
3734         }
3735
3736         if (killtcp->vnn->killtcp != killtcp) {
3737                 return 0;
3738         }
3739
3740         killtcp->vnn->killtcp = NULL;
3741
3742         return 0;
3743 }
3744
3745
/* Insert callback for the killtcp tree: the new connection structure
   ('parm') unconditionally replaces whatever was stored before
   ('data').

   The old structure is deliberately not freed here; it is
   talloc_stolen by the same tree node and goes away when the new data
   is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
3757
3758 /*
3759   add a tcp socket to the list of connections we want to RST
3760  */
3761 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3762                                        ctdb_sock_addr *s,
3763                                        ctdb_sock_addr *d)
3764 {
3765         ctdb_sock_addr src, dst;
3766         struct ctdb_kill_tcp *killtcp;
3767         struct ctdb_killtcp_con *con;
3768         struct ctdb_vnn *vnn;
3769
3770         ctdb_canonicalize_ip(s, &src);
3771         ctdb_canonicalize_ip(d, &dst);
3772
3773         vnn = find_public_ip_vnn(ctdb, &dst);
3774         if (vnn == NULL) {
3775                 vnn = find_public_ip_vnn(ctdb, &src);
3776         }
3777         if (vnn == NULL) {
3778                 /* if it is not a public ip   it could be our 'single ip' */
3779                 if (ctdb->single_ip_vnn) {
3780                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3781                                 vnn = ctdb->single_ip_vnn;
3782                         }
3783                 }
3784         }
3785         if (vnn == NULL) {
3786                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3787                 return -1;
3788         }
3789
3790         killtcp = vnn->killtcp;
3791         
3792         /* If this is the first connection to kill we must allocate
3793            a new structure
3794          */
3795         if (killtcp == NULL) {
3796                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3797                 CTDB_NO_MEMORY(ctdb, killtcp);
3798
3799                 killtcp->vnn         = vnn;
3800                 killtcp->ctdb        = ctdb;
3801                 killtcp->capture_fd  = -1;
3802                 killtcp->connections = trbt_create(killtcp, 0);
3803
3804                 vnn->killtcp         = killtcp;
3805                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3806         }
3807
3808
3809
3810         /* create a structure that describes this connection we want to
3811            RST and store it in killtcp->connections
3812         */
3813         con = talloc(killtcp, struct ctdb_killtcp_con);
3814         CTDB_NO_MEMORY(ctdb, con);
3815         con->src_addr = src;
3816         con->dst_addr = dst;
3817         con->count    = 0;
3818         con->killtcp  = killtcp;
3819
3820
3821         trbt_insertarray32_callback(killtcp->connections,
3822                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3823                         add_killtcp_callback, con);
3824
3825         /* 
3826            If we dont have a socket to listen on yet we must create it
3827          */
3828         if (killtcp->capture_fd == -1) {
3829                 const char *iface = ctdb_vnn_iface_string(vnn);
3830                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3831                 if (killtcp->capture_fd == -1) {
3832                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3833                                           "socket on iface '%s' for killtcp (%s)\n",
3834                                           iface, strerror(errno)));
3835                         goto failed;
3836                 }
3837         }
3838
3839
3840         if (killtcp->fde == NULL) {
3841                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3842                                             EVENT_FD_READ,
3843                                             capture_tcp_handler, killtcp);
3844                 tevent_fd_set_auto_close(killtcp->fde);
3845
3846                 /* We also need to set up some events to tickle all these connections
3847                    until they are all reset
3848                 */
3849                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3850                                 ctdb_tickle_sentenced_connections, killtcp);
3851         }
3852
3853         /* tickle him once now */
3854         ctdb_sys_send_tcp(
3855                 &con->dst_addr,
3856                 &con->src_addr,
3857                 0, 0, 0);
3858
3859         return 0;
3860
3861 failed:
3862         talloc_free(vnn->killtcp);
3863         vnn->killtcp = NULL;
3864         return -1;
3865 }
3866
3867 /*
3868   kill a TCP connection.
3869  */
3870 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3871 {
3872         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3873
3874         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3875 }
3876
3877 /*
3878   called by a daemon to inform us of the entire list of TCP tickles for
3879   a particular public address.
3880   this control should only be sent by the node that is currently serving
3881   that public address.
3882  */
3883 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3884 {
3885         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3886         struct ctdb_tcp_array *tcparray;
3887         struct ctdb_vnn *vnn;
3888
3889         /* We must at least have tickles.num or else we cant verify the size
3890            of the received data blob
3891          */
3892         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3893                                         tickles.connections)) {
3894                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3895                 return -1;
3896         }
3897
3898         /* verify that the size of data matches what we expect */
3899         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3900                                 tickles.connections)
3901                          + sizeof(struct ctdb_tcp_connection)
3902                                  * list->tickles.num) {
3903                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3904                 return -1;
3905         }
3906
3907         vnn = find_public_ip_vnn(ctdb, &list->addr);
3908         if (vnn == NULL) {
3909                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3910                         ctdb_addr_to_str(&list->addr)));
3911
3912                 return 1;
3913         }
3914
3915         /* remove any old ticklelist we might have */
3916         talloc_free(vnn->tcp_array);
3917         vnn->tcp_array = NULL;
3918
3919         tcparray = talloc(vnn, struct ctdb_tcp_array);
3920         CTDB_NO_MEMORY(ctdb, tcparray);
3921
3922         tcparray->num = list->tickles.num;
3923
3924         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3925         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3926
3927         memcpy(tcparray->connections, &list->tickles.connections[0],
3928                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3929
3930         /* We now have a new fresh tickle list array for this vnn */
3931         vnn->tcp_array = tcparray;
3932
3933         return 0;
3934 }
3935
3936 /*
3937   called to return the full list of tickles for the puclic address associated 
3938   with the provided vnn
3939  */
3940 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3941 {
3942         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3943         struct ctdb_control_tcp_tickle_list *list;
3944         struct ctdb_tcp_array *tcparray;
3945         int num;
3946         struct ctdb_vnn *vnn;
3947
3948         vnn = find_public_ip_vnn(ctdb, addr);
3949         if (vnn == NULL) {
3950                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3951                         ctdb_addr_to_str(addr)));
3952
3953                 return 1;
3954         }
3955
3956         tcparray = vnn->tcp_array;
3957         if (tcparray) {
3958                 num = tcparray->num;
3959         } else {
3960                 num = 0;
3961         }
3962
3963         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3964                                 tickles.connections)
3965                         + sizeof(struct ctdb_tcp_connection) * num;
3966
3967         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3968         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3969         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3970
3971         list->addr = *addr;
3972         list->tickles.num = num;
3973         if (num) {
3974                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3975                         sizeof(struct ctdb_tcp_connection) * num);
3976         }
3977
3978         return 0;
3979 }
3980
3981
3982 /*
3983   set the list of all tcp tickles for a public address
3984  */
3985 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3986                                             ctdb_sock_addr *addr,
3987                                             struct ctdb_tcp_array *tcparray)
3988 {
3989         int ret, num;
3990         TDB_DATA data;
3991         struct ctdb_control_tcp_tickle_list *list;
3992
3993         if (tcparray) {
3994                 num = tcparray->num;
3995         } else {
3996                 num = 0;
3997         }
3998
3999         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
4000                                 tickles.connections) +
4001                         sizeof(struct ctdb_tcp_connection) * num;
4002         data.dptr = talloc_size(ctdb, data.dsize);
4003         CTDB_NO_MEMORY(ctdb, data.dptr);
4004
4005         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
4006         list->addr = *addr;
4007         list->tickles.num = num;
4008         if (tcparray) {
4009                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
4010         }
4011
4012         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
4013                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4014                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4015         if (ret != 0) {
4016                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4017                 return -1;
4018         }
4019
4020         talloc_free(data.dptr);
4021
4022         return ret;
4023 }
4024
4025
4026 /*
4027   perform tickle updates if required
4028  */
4029 static void ctdb_update_tcp_tickles(struct event_context *ev, 
4030                                 struct timed_event *te, 
4031                                 struct timeval t, void *private_data)
4032 {
4033         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4034         int ret;
4035         struct ctdb_vnn *vnn;
4036
4037         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4038                 /* we only send out updates for public addresses that 
4039                    we have taken over
4040                  */
4041                 if (ctdb->pnn != vnn->pnn) {
4042                         continue;
4043                 }
4044                 /* We only send out the updates if we need to */
4045                 if (!vnn->tcp_update_needed) {
4046                         continue;
4047                 }
4048                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
4049                                                        &vnn->public_address,
4050                                                        vnn->tcp_array);
4051                 if (ret != 0) {
4052                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4053                                 ctdb_addr_to_str(&vnn->public_address)));
4054                 } else {
4055                         vnn->tcp_update_needed = false;
4056                 }
4057         }
4058
4059         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4060                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4061                              ctdb_update_tcp_tickles, ctdb);
4062 }               
4063         
4064
4065 /*
4066   start periodic update of tcp tickles
4067  */
4068 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4069 {
4070         ctdb->tickle_update_context = talloc_new(ctdb);
4071
4072         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4073                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4074                              ctdb_update_tcp_tickles, ctdb);
4075 }
4076
4077
4078
4079
4080 struct control_gratious_arp {
4081         struct ctdb_context *ctdb;
4082         ctdb_sock_addr addr;
4083         const char *iface;
4084         int count;
4085 };
4086
4087 /*
4088   send a control_gratuitous arp
4089  */
4090 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4091                                   struct timeval t, void *private_data)
4092 {
4093         int ret;
4094         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4095                                                         struct control_gratious_arp);
4096
4097         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4098         if (ret != 0) {
4099                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4100                                  arp->iface, strerror(errno)));
4101         }
4102
4103
4104         arp->count++;
4105         if (arp->count == CTDB_ARP_REPEAT) {
4106                 talloc_free(arp);
4107                 return;
4108         }
4109
4110         event_add_timed(arp->ctdb->ev, arp, 
4111                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4112                         send_gratious_arp, arp);
4113 }
4114
4115
4116 /*
4117   send a gratious arp 
4118  */
4119 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4120 {
4121         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4122         struct control_gratious_arp *arp;
4123
4124         /* verify the size of indata */
4125         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4126                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4127                                  (unsigned)indata.dsize, 
4128                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4129                 return -1;
4130         }
4131         if (indata.dsize != 
4132                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4133                 + gratious_arp->len ) ){
4134
4135                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4136                         "but should be %u bytes\n", 
4137                          (unsigned)indata.dsize, 
4138                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4139                 return -1;
4140         }
4141
4142
4143         arp = talloc(ctdb, struct control_gratious_arp);
4144         CTDB_NO_MEMORY(ctdb, arp);
4145
4146         arp->ctdb  = ctdb;
4147         arp->addr   = gratious_arp->addr;
4148         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4149         CTDB_NO_MEMORY(ctdb, arp->iface);
4150         arp->count = 0;
4151         
4152         event_add_timed(arp->ctdb->ev, arp, 
4153                         timeval_zero(), send_gratious_arp, arp);
4154
4155         return 0;
4156 }
4157
4158 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4159 {
4160         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4161         int ret;
4162
4163         /* verify the size of indata */
4164         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4165                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4166                 return -1;
4167         }
4168         if (indata.dsize != 
4169                 ( offsetof(struct ctdb_control_ip_iface, iface)
4170                 + pub->len ) ){
4171
4172                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4173                         "but should be %u bytes\n", 
4174                          (unsigned)indata.dsize, 
4175                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4176                 return -1;
4177         }
4178
4179         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4180
4181         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4182
4183         if (ret != 0) {
4184                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4185                 return -1;
4186         }
4187
4188         return 0;
4189 }
4190
/* State handed to delete_ip_callback() */
struct delete_ip_callback_state {
	/* the delete request that is answered once the release finishes */
	struct ctdb_req_control *c;
};
4194
4195 /*
4196   called when releaseip event finishes for del_public_address
4197  */
4198 static void delete_ip_callback(struct ctdb_context *ctdb,
4199                                int32_t status, TDB_DATA data,
4200                                const char *errormsg,
4201                                void *private_data)
4202 {
4203         struct delete_ip_callback_state *state =
4204                 talloc_get_type(private_data, struct delete_ip_callback_state);
4205
4206         /* If release failed then fail. */
4207         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4208         talloc_free(private_data);
4209 }
4210
4211 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4212                                         struct ctdb_req_control *c,
4213                                         TDB_DATA indata, bool *async_reply)
4214 {
4215         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4216         struct ctdb_vnn *vnn;
4217
4218         /* verify the size of indata */
4219         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4220                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4221                 return -1;
4222         }
4223         if (indata.dsize != 
4224                 ( offsetof(struct ctdb_control_ip_iface, iface)
4225                 + pub->len ) ){
4226
4227                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4228                         "but should be %u bytes\n", 
4229                          (unsigned)indata.dsize, 
4230                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4231                 return -1;
4232         }
4233
4234         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4235
4236         /* walk over all public addresses until we find a match */
4237         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4238                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4239                         if (vnn->pnn == ctdb->pnn) {
4240                                 struct delete_ip_callback_state *state;
4241                                 struct ctdb_public_ip *ip;
4242                                 TDB_DATA data;
4243                                 int ret;
4244
4245                                 vnn->delete_pending = true;
4246
4247                                 state = talloc(ctdb,
4248                                                struct delete_ip_callback_state);
4249                                 CTDB_NO_MEMORY(ctdb, state);
4250                                 state->c = c;
4251
4252                                 ip = talloc(state, struct ctdb_public_ip);
4253                                 if (ip == NULL) {
4254                                         DEBUG(DEBUG_ERR,
4255                                               (__location__ " Out of memory\n"));
4256                                         talloc_free(state);
4257                                         return -1;
4258                                 }
4259                                 ip->pnn = -1;
4260                                 ip->addr = pub->addr;
4261
4262                                 data.dsize = sizeof(struct ctdb_public_ip);
4263                                 data.dptr = (unsigned char *)ip;
4264
4265                                 ret = ctdb_daemon_send_control(ctdb,
4266                                                                ctdb_get_pnn(ctdb),
4267                                                                0,
4268                                                                CTDB_CONTROL_RELEASE_IP,
4269                                                                0, 0,
4270                                                                data,
4271                                                                delete_ip_callback,
4272                                                                state);
4273                                 if (ret == -1) {
4274                                         DEBUG(DEBUG_ERR,
4275                                               (__location__ "Unable to send "
4276                                                "CTDB_CONTROL_RELEASE_IP\n"));
4277                                         talloc_free(state);
4278                                         return -1;
4279                                 }
4280
4281                                 state->c = talloc_steal(state, c);
4282                                 *async_reply = true;
4283                         } else {
4284                                 /* This IP is not hosted on the
4285                                  * current node so just delete it
4286                                  * now. */
4287                                 do_delete_ip(ctdb, vnn);
4288                         }
4289
4290                         return 0;
4291                 }
4292         }
4293
4294         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4295                          ctdb_addr_to_str(&pub->addr)));
4296         return -1;
4297 }
4298
4299
/* State handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* the control request answered when the event script finishes */
	struct ctdb_req_control *c;
};
4303
4304 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4305                                         int status, void *p)
4306 {
4307         struct ipreallocated_callback_state *state =
4308                 talloc_get_type(p, struct ipreallocated_callback_state);
4309
4310         if (status != 0) {
4311                 DEBUG(DEBUG_ERR,
4312                       (" \"ipreallocated\" event script failed (status %d)\n",
4313                        status));
4314                 if (status == -ETIME) {
4315                         ctdb_ban_self(ctdb);
4316                 }
4317         }
4318
4319         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4320         talloc_free(state);
4321 }
4322
4323 /* A control to run the ipreallocated event */
4324 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4325                                    struct ctdb_req_control *c,
4326                                    bool *async_reply)
4327 {
4328         int ret;
4329         struct ipreallocated_callback_state *state;
4330
4331         state = talloc(ctdb, struct ipreallocated_callback_state);
4332         CTDB_NO_MEMORY(ctdb, state);
4333
4334         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4335
4336         ret = ctdb_event_script_callback(ctdb, state,
4337                                          ctdb_ipreallocated_callback, state,
4338                                          CTDB_EVENT_IPREALLOCATED,
4339                                          "%s", "");
4340
4341         if (ret != 0) {
4342                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4343                 talloc_free(state);
4344                 return -1;
4345         }
4346
4347         /* tell the control that we will be reply asynchronously */
4348         state->c    = talloc_steal(state, c);
4349         *async_reply = true;
4350
4351         return 0;
4352 }
4353
4354
4355 /* This function is called from the recovery daemon to verify that a remote
4356    node has the expected ip allocation.
4357    This is verified against ctdb->ip_tree
4358 */
4359 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4360                                 struct ctdb_all_public_ips *ips,
4361                                 uint32_t pnn)
4362 {
4363         struct ctdb_public_ip_list *tmp_ip; 
4364         int i;
4365
4366         if (ctdb->ip_tree == NULL) {
4367                 /* dont know the expected allocation yet, assume remote node
4368                    is correct. */
4369                 return 0;
4370         }
4371
4372         if (ips == NULL) {
4373                 return 0;
4374         }
4375
4376         for (i=0; i<ips->num; i++) {
4377                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4378                 if (tmp_ip == NULL) {
4379                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4380                         return -1;
4381                 }
4382
4383                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4384                         continue;
4385                 }
4386
4387                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4388                         DEBUG(DEBUG_ERR,
4389                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4390                                pnn,
4391                                ctdb_addr_to_str(&ips->ips[i].addr),
4392                                ips->ips[i].pnn, tmp_ip->pnn));
4393                         return -1;
4394                 }
4395         }
4396
4397         return 0;
4398 }
4399
4400 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4401 {
4402         struct ctdb_public_ip_list *tmp_ip; 
4403
4404         if (ctdb->ip_tree == NULL) {
4405                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4406                 return -1;
4407         }
4408
4409         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4410         if (tmp_ip == NULL) {
4411                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4412                 return -1;
4413         }
4414
4415         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4416         tmp_ip->pnn = ip->pnn;
4417
4418         return 0;
4419 }
4420
4421
/* State of one in-progress "reload public ips" request */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* control request answered by the destructor; NULL if none */
	struct ctdb_req_control *c;
	/* status sent in that reply; starts at -1 until the child reports */
	int status;
	/* pipe to the child: the parent reads fd[0], the child writes fd[1] */
	int fd[2];
	/* pid of the forked reloadips child; killed by the destructor */
	pid_t child;
	/* fd event watching fd[0] for the child's result byte */
	struct fd_event *fde;
};
4430
4431 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4432 {
4433         if (h == h->ctdb->reload_ips) {
4434                 h->ctdb->reload_ips = NULL;
4435         }
4436         if (h->c != NULL) {
4437                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4438                 h->c = NULL;
4439         }
4440         ctdb_kill(h->ctdb, h->child, SIGKILL);
4441         return 0;
4442 }
4443
4444 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4445                                 struct timed_event *te,
4446                                 struct timeval t, void *private_data)
4447 {
4448         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4449
4450         talloc_free(h);
4451 }       
4452
4453 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4454                              uint16_t flags, void *private_data)
4455 {
4456         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4457
4458         char res;
4459         int ret;
4460
4461         ret = read(h->fd[0], &res, 1);
4462         if (ret < 1 || res != 0) {
4463                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4464                 res = 1;
4465         }
4466         h->status = res;
4467
4468         talloc_free(h);
4469 }
4470
4471 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4472 {
4473         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4474         struct ctdb_all_public_ips *ips;
4475         struct ctdb_vnn *vnn;
4476         struct client_async_data *async_data;
4477         struct timeval timeout;
4478         TDB_DATA data;
4479         struct ctdb_client_control_state *state;
4480         bool first_add;
4481         int i, ret;
4482
4483         CTDB_NO_MEMORY(ctdb, mem_ctx);
4484
4485         /* Read IPs from local node */
4486         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4487                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4488         if (ret != 0) {
4489                 DEBUG(DEBUG_ERR,
4490                       ("Unable to fetch public IPs from local node\n"));
4491                 talloc_free(mem_ctx);
4492                 return -1;
4493         }
4494
4495         /* Read IPs file - this is safe since this is a child process */
4496         ctdb->vnn = NULL;
4497         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4498                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4499                 talloc_free(mem_ctx);
4500                 return -1;
4501         }
4502
4503         async_data = talloc_zero(mem_ctx, struct client_async_data);
4504         CTDB_NO_MEMORY(ctdb, async_data);
4505
4506         /* Compare IPs between node and file for IPs to be deleted */
4507         for (i = 0; i < ips->num; i++) {
4508                 /* */
4509                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4510                         if (ctdb_same_ip(&vnn->public_address,
4511                                          &ips->ips[i].addr)) {
4512                                 /* IP is still in file */
4513                                 break;
4514                         }
4515                 }
4516
4517                 if (vnn == NULL) {
4518                         /* Delete IP ips->ips[i] */
4519                         struct ctdb_control_ip_iface *pub;
4520
4521                         DEBUG(DEBUG_NOTICE,
4522                               ("IP %s no longer configured, deleting it\n",
4523                                ctdb_addr_to_str(&ips->ips[i].addr)));
4524
4525                         pub = talloc_zero(mem_ctx,
4526                                           struct ctdb_control_ip_iface);
4527                         CTDB_NO_MEMORY(ctdb, pub);
4528
4529                         pub->addr  = ips->ips[i].addr;
4530                         pub->mask  = 0;
4531                         pub->len   = 0;
4532
4533                         timeout = TAKEOVER_TIMEOUT();
4534
4535                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4536                                               iface) + pub->len;
4537                         data.dptr = (uint8_t *)pub;
4538
4539                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4540                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4541                                                   0, data, async_data,
4542                                                   &timeout, NULL);
4543                         if (state == NULL) {
4544                                 DEBUG(DEBUG_ERR,
4545                                       (__location__
4546                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4547                                 goto failed;
4548                         }
4549
4550                         ctdb_client_async_add(async_data, state);
4551                 }
4552         }
4553
4554         /* Compare IPs between node and file for IPs to be added */
4555         first_add = true;
4556         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4557                 for (i = 0; i < ips->num; i++) {
4558                         if (ctdb_same_ip(&vnn->public_address,
4559                                          &ips->ips[i].addr)) {
4560                                 /* IP already on node */
4561                                 break;
4562                         }
4563                 }
4564                 if (i == ips->num) {
4565                         /* Add IP ips->ips[i] */
4566                         struct ctdb_control_ip_iface *pub;
4567                         const char *ifaces = NULL;
4568                         uint32_t len;
4569                         int iface = 0;
4570
4571                         DEBUG(DEBUG_NOTICE,
4572                               ("New IP %s configured, adding it\n",
4573                                ctdb_addr_to_str(&vnn->public_address)));
4574                         if (first_add) {
4575                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4576
4577                                 data.dsize = sizeof(pnn);
4578                                 data.dptr  = (uint8_t *)&pnn;
4579
4580                                 ret = ctdb_client_send_message(
4581                                         ctdb,
4582                                         CTDB_BROADCAST_CONNECTED,
4583                                         CTDB_SRVID_REBALANCE_NODE,
4584                                         data);
4585                                 if (ret != 0) {
4586                                         DEBUG(DEBUG_WARNING,
4587                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4588                                 }
4589
4590                                 first_add = false;
4591                         }
4592
4593                         ifaces = vnn->ifaces[0];
4594                         iface = 1;
4595                         while (vnn->ifaces[iface] != NULL) {
4596                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4597                                                          vnn->ifaces[iface]);
4598                                 iface++;
4599                         }
4600
4601                         len   = strlen(ifaces) + 1;
4602                         pub = talloc_zero_size(mem_ctx,
4603                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4604                         CTDB_NO_MEMORY(ctdb, pub);
4605
4606                         pub->addr  = vnn->public_address;
4607                         pub->mask  = vnn->public_netmask_bits;
4608                         pub->len   = len;
4609                         memcpy(&pub->iface[0], ifaces, pub->len);
4610
4611                         timeout = TAKEOVER_TIMEOUT();
4612
4613                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4614                                               iface) + pub->len;
4615                         data.dptr = (uint8_t *)pub;
4616
4617                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4618                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4619                                                   0, data, async_data,
4620                                                   &timeout, NULL);
4621                         if (state == NULL) {
4622                                 DEBUG(DEBUG_ERR,
4623                                       (__location__
4624                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4625                                 goto failed;
4626                         }
4627
4628                         ctdb_client_async_add(async_data, state);
4629                 }
4630         }
4631
4632         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4633                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4634                 goto failed;
4635         }
4636
4637         talloc_free(mem_ctx);
4638         return 0;
4639
4640 failed:
4641         talloc_free(mem_ctx);
4642         return -1;
4643 }
4644
4645 /* This control is sent to force the node to re-read the public addresses file
4646    and drop any addresses we should nnot longer host, and add new addresses
4647    that we are now able to host
4648 */
4649 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4650 {
4651         struct ctdb_reloadips_handle *h;
4652         pid_t parent = getpid();
4653
4654         if (ctdb->reload_ips != NULL) {
4655                 talloc_free(ctdb->reload_ips);
4656                 ctdb->reload_ips = NULL;
4657         }
4658
4659         h = talloc(ctdb, struct ctdb_reloadips_handle);
4660         CTDB_NO_MEMORY(ctdb, h);
4661         h->ctdb     = ctdb;
4662         h->c        = NULL;
4663         h->status   = -1;
4664         
4665         if (pipe(h->fd) == -1) {
4666                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4667                 talloc_free(h);
4668                 return -1;
4669         }
4670
4671         h->child = ctdb_fork(ctdb);
4672         if (h->child == (pid_t)-1) {
4673                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4674                 close(h->fd[0]);
4675                 close(h->fd[1]);
4676                 talloc_free(h);
4677                 return -1;
4678         }
4679
4680         /* child process */
4681         if (h->child == 0) {
4682                 signed char res = 0;
4683
4684                 close(h->fd[0]);
4685                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4686
4687                 ctdb_set_process_name("ctdb_reloadips");
4688                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4689                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4690                         res = -1;
4691                 } else {
4692                         res = ctdb_reloadips_child(ctdb);
4693                         if (res != 0) {
4694                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4695                         }
4696                 }
4697
4698                 write(h->fd[1], &res, 1);
4699                 /* make sure we die when our parent dies */
4700                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4701                         sleep(5);
4702                 }
4703                 _exit(0);
4704         }
4705
4706         h->c             = talloc_steal(h, c);
4707
4708         close(h->fd[1]);
4709         set_close_on_exec(h->fd[0]);
4710
4711         talloc_set_destructor(h, ctdb_reloadips_destructor);
4712
4713
4714         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4715                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4716                         (void *)h);
4717         tevent_fd_set_auto_close(h->fde);
4718
4719         event_add_timed(ctdb->ev, h,
4720                         timeval_current_ofs(120, 0),
4721                         ctdb_reloadips_timeout_event, h);
4722
4723         /* we reply later */
4724         *async_reply = true;
4725         return 0;
4726 }