recoverd: Trying to failback more IPs no longer allocates unassigned IPs
[garming/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value for takeover operations, built from the
 * takeover_timeout tunable.  Expands to an expression that uses a
 * variable named "ctdb", which must be in scope at the point of use.
 */
#define TAKEOVER_TIMEOUT() \
	(timeval_current_ofs(ctdb->tunable.takeover_timeout, 0))

/* First argument of the delay between repeated gratuitous ARP rounds
 * (presumably seconds - confirm against timeval_current_ofs()). */
#define CTDB_ARP_INTERVAL 1
/* Number of announcement rounds sent before the ARP state is freed. */
#define CTDB_ARP_REPEAT   3
35
/*
  a local network interface known to the daemon (entry of ctdb->ifaces)
 */
struct ctdb_iface {
	/* doubly linked list pointers */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, used as the lookup key */
	const char *name;
	/* whether the link is currently considered up */
	bool link_up;
	/* number of VNNs currently assigned to this interface */
	uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
278 struct ctdb_takeover_arp {
279         struct ctdb_context *ctdb;
280         uint32_t count;
281         ctdb_sock_addr addr;
282         struct ctdb_tcp_array *tcparray;
283         struct ctdb_vnn *vnn;
284 };
285
286
287 /*
288   lists of tcp endpoints
289  */
290 struct ctdb_tcp_list {
291         struct ctdb_tcp_list *prev, *next;
292         struct ctdb_tcp_connection connection;
293 };
294
295 /*
296   list of clients to kill on IP release
297  */
298 struct ctdb_client_ip {
299         struct ctdb_client_ip *prev, *next;
300         struct ctdb_context *ctdb;
301         ctdb_sock_addr addr;
302         uint32_t client_id;
303 };
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* the control request to reply to once the event has finished */
	struct ctdb_req_control *c;
	/* the VNN being taken over */
	struct ctdb_vnn *vnn;
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* the control request to reply to once the event has finished */
	struct ctdb_req_control *c;
	/* the interface the address is being moved away from */
	struct ctdb_iface *old;
	/* the VNN being moved */
	struct ctdb_vnn *vnn;
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Check if this is a public ip known to the node, i.e. can that
1303    node takeover this ip ?
1304 */
1305 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                 struct ctdb_public_ip_list *ip)
1307 {
1308         struct ctdb_all_public_ips *public_ips;
1309         int i;
1310
1311         public_ips = ctdb->nodes[pnn]->available_public_ips;
1312
1313         if (public_ips == NULL) {
1314                 return -1;
1315         }
1316
1317         for (i=0;i<public_ips->num;i++) {
1318                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1319                         /* yes, this node can serve this public ip */
1320                         return 0;
1321                 }
1322         }
1323
1324         return -1;
1325 }
1326
1327
1328 /* search the node lists list for a node to takeover this ip.
1329    pick the node that currently are serving the least number of ips
1330    so that the ips get spread out evenly.
1331 */
1332 static int find_takeover_node(struct ctdb_context *ctdb, 
1333                 struct ctdb_node_map *nodemap, uint32_t mask, 
1334                 struct ctdb_public_ip_list *ip,
1335                 struct ctdb_public_ip_list *all_ips)
1336 {
1337         int pnn, min=0, num;
1338         int i;
1339
1340         pnn    = -1;
1341         for (i=0;i<nodemap->num;i++) {
1342                 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1343                         /* This node is not allowed to takeover any addresses
1344                         */
1345                         continue;
1346                 }
1347
1348                 if (nodemap->nodes[i].flags & mask) {
1349                         /* This node is not healty and can not be used to serve
1350                            a public address 
1351                         */
1352                         continue;
1353                 }
1354
1355                 /* verify that this node can serve this ip */
1356                 if (can_node_serve_ip(ctdb, i, ip)) {
1357                         /* no it couldnt   so skip to the next node */
1358                         continue;
1359                 }
1360
1361                 num = node_ip_coverage(ctdb, i, all_ips);
1362                 /* was this the first node we checked ? */
1363                 if (pnn == -1) {
1364                         pnn = i;
1365                         min  = num;
1366                 } else {
1367                         if (num < min) {
1368                                 pnn = i;
1369                                 min  = num;
1370                         }
1371                 }
1372         }       
1373         if (pnn == -1) {
1374                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1375                         ctdb_addr_to_str(&ip->addr)));
1376
1377                 return -1;
1378         }
1379
1380         ip->pnn = pnn;
1381         return 0;
1382 }
1383
1384 #define IP_KEYLEN       4
1385 static uint32_t *ip_key(ctdb_sock_addr *ip)
1386 {
1387         static uint32_t key[IP_KEYLEN];
1388
1389         bzero(key, sizeof(key));
1390
1391         switch (ip->sa.sa_family) {
1392         case AF_INET:
1393                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1394                 break;
1395         case AF_INET6: {
1396                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1397                 key[0]  = htonl(s6_a32[0]);
1398                 key[1]  = htonl(s6_a32[1]);
1399                 key[2]  = htonl(s6_a32[2]);
1400                 key[3]  = htonl(s6_a32[3]);
1401                 break;
1402         }
1403         default:
1404                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1405                 return key;
1406         }
1407
1408         return key;
1409 }
1410
1411 static void *add_ip_callback(void *parm, void *data)
1412 {
1413         struct ctdb_public_ip_list *this_ip = parm; 
1414         struct ctdb_public_ip_list *prev_ip = data; 
1415
1416         if (prev_ip == NULL) {
1417                 return parm;
1418         }
1419         if (this_ip->pnn == -1) {
1420                 this_ip->pnn = prev_ip->pnn;
1421         }
1422
1423         return parm;
1424 }
1425
1426 static int getips_count_callback(void *param, void *data)
1427 {
1428         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1429         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1430
1431         new_ip->next = *ip_list;
1432         *ip_list     = new_ip;
1433         return 0;
1434 }
1435
1436 static struct ctdb_public_ip_list *
1437 create_merged_ip_list(struct ctdb_context *ctdb)
1438 {
1439         int i, j;
1440         struct ctdb_public_ip_list *ip_list;
1441         struct ctdb_all_public_ips *public_ips;
1442
1443         if (ctdb->ip_tree != NULL) {
1444                 talloc_free(ctdb->ip_tree);
1445                 ctdb->ip_tree = NULL;
1446         }
1447         ctdb->ip_tree = trbt_create(ctdb, 0);
1448
1449         for (i=0;i<ctdb->num_nodes;i++) {
1450                 public_ips = ctdb->nodes[i]->known_public_ips;
1451
1452                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1453                         continue;
1454                 }
1455
1456                 /* there were no public ips for this node */
1457                 if (public_ips == NULL) {
1458                         continue;
1459                 }               
1460
1461                 for (j=0;j<public_ips->num;j++) {
1462                         struct ctdb_public_ip_list *tmp_ip; 
1463
1464                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1465                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1466                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1467                         tmp_ip->addr = public_ips->ips[j].addr;
1468                         tmp_ip->next = NULL;
1469
1470                         trbt_insertarray32_callback(ctdb->ip_tree,
1471                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1472                                 add_ip_callback,
1473                                 tmp_ip);
1474                 }
1475         }
1476
1477         ip_list = NULL;
1478         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1479
1480         return ip_list;
1481 }
1482
1483 /* 
1484  * This is the length of the longtest common prefix between the IPs.
1485  * It is calculated by XOR-ing the 2 IPs together and counting the
1486  * number of leading zeroes.  The implementation means that all
1487  * addresses end up being 128 bits long.
1488  *
1489  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1490  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1491  * lots of nodes and IP addresses?
1492  */
1493 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1494 {
1495         uint32_t ip1_k[IP_KEYLEN];
1496         uint32_t *t;
1497         int i;
1498         uint32_t x;
1499
1500         uint32_t distance = 0;
1501
1502         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1503         t = ip_key(ip2);
1504         for (i=0; i<IP_KEYLEN; i++) {
1505                 x = ip1_k[i] ^ t[i];
1506                 if (x == 0) {
1507                         distance += 32;
1508                 } else {
1509                         /* Count number of leading zeroes. 
1510                          * FIXME? This could be optimised...
1511                          */
1512                         while ((x & (1 << 31)) == 0) {
1513                                 x <<= 1;
1514                                 distance += 1;
1515                         }
1516                 }
1517         }
1518
1519         return distance;
1520 }
1521
1522 /* Calculate the IP distance for the given IP relative to IPs on the
1523    given node.  The ips argument is generally the all_ips variable
1524    used in the main part of the algorithm.
1525  */
1526 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1527                                   struct ctdb_public_ip_list *ips,
1528                                   int pnn)
1529 {
1530         struct ctdb_public_ip_list *t;
1531         uint32_t d;
1532
1533         uint32_t sum = 0;
1534
1535         for (t=ips; t != NULL; t=t->next) {
1536                 if (t->pnn != pnn) {
1537                         continue;
1538                 }
1539
1540                 /* Optimisation: We never calculate the distance
1541                  * between an address and itself.  This allows us to
1542                  * calculate the effect of removing an address from a
1543                  * node by simply calculating the distance between
1544                  * that address and all of the exitsing addresses.
1545                  * Moreover, we assume that we're only ever dealing
1546                  * with addresses from all_ips so we can identify an
1547                  * address via a pointer rather than doing a more
1548                  * expensive address comparison. */
1549                 if (&(t->addr) == ip) {
1550                         continue;
1551                 }
1552
1553                 d = ip_distance(ip, &(t->addr));
1554                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1555         }
1556
1557         return sum;
1558 }
1559
1560 /* Return the LCP2 imbalance metric for addresses currently assigned
1561    to the given node.
1562  */
1563 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1564 {
1565         struct ctdb_public_ip_list *t;
1566
1567         uint32_t imbalance = 0;
1568
1569         for (t=all_ips; t!=NULL; t=t->next) {
1570                 if (t->pnn != pnn) {
1571                         continue;
1572                 }
1573                 /* Pass the rest of the IPs rather than the whole
1574                    all_ips input list.
1575                 */
1576                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1577         }
1578
1579         return imbalance;
1580 }
1581
1582 /* Allocate any unassigned IPs just by looping through the IPs and
1583  * finding the best node for each.
1584  */
1585 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1586                                       struct ctdb_node_map *nodemap,
1587                                       uint32_t mask,
1588                                       struct ctdb_public_ip_list *all_ips)
1589 {
1590         struct ctdb_public_ip_list *tmp_ip;
1591
1592         /* loop over all ip's and find a physical node to cover for 
1593            each unassigned ip.
1594         */
1595         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1596                 if (tmp_ip->pnn == -1) {
1597                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1598                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1599                                         ctdb_addr_to_str(&tmp_ip->addr)));
1600                         }
1601                 }
1602         }
1603 }
1604
1605 /* Basic non-deterministic rebalancing algorithm.
1606  */
1607 static bool basic_failback(struct ctdb_context *ctdb,
1608                            struct ctdb_node_map *nodemap,
1609                            uint32_t mask,
1610                            struct ctdb_public_ip_list *all_ips,
1611                            int num_ips,
1612                            int *retries)
1613 {
1614         int i;
1615         int maxnode, maxnum=0, minnode, minnum=0, num;
1616         struct ctdb_public_ip_list *tmp_ip;
1617
1618         /* for each ip address, loop over all nodes that can serve
1619            this ip and make sure that the difference between the node
1620            serving the most and the node serving the least ip's are
1621            not greater than 1.
1622         */
1623         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1624                 if (tmp_ip->pnn == -1) {
1625                         continue;
1626                 }
1627
1628                 /* Get the highest and lowest number of ips's served by any 
1629                    valid node which can serve this ip.
1630                 */
1631                 maxnode = -1;
1632                 minnode = -1;
1633                 for (i=0;i<nodemap->num;i++) {
1634                         if (nodemap->nodes[i].flags & mask) {
1635                                 continue;
1636                         }
1637
1638                         /* Only check nodes that are allowed to takeover an ip */
1639                         if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1640                                 continue;
1641                         }
1642
1643                         /* only check nodes that can actually serve this ip */
1644                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1645                                 /* no it couldnt   so skip to the next node */
1646                                 continue;
1647                         }
1648
1649                         num = node_ip_coverage(ctdb, i, all_ips);
1650                         if (maxnode == -1) {
1651                                 maxnode = i;
1652                                 maxnum  = num;
1653                         } else {
1654                                 if (num > maxnum) {
1655                                         maxnode = i;
1656                                         maxnum  = num;
1657                                 }
1658                         }
1659                         if (minnode == -1) {
1660                                 minnode = i;
1661                                 minnum  = num;
1662                         } else {
1663                                 if (num < minnum) {
1664                                         minnode = i;
1665                                         minnum  = num;
1666                                 }
1667                         }
1668                 }
1669                 if (maxnode == -1) {
1670                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1671                                 ctdb_addr_to_str(&tmp_ip->addr)));
1672
1673                         continue;
1674                 }
1675
1676                 /* if the spread between the smallest and largest coverage by
1677                    a node is >=2 we steal one of the ips from the node with
1678                    most coverage to even things out a bit.
1679                    try to do this a limited number of times since we dont
1680                    want to spend too much time balancing the ip coverage.
1681                 */
1682                 if ( (maxnum > minnum+1)
1683                      && (*retries < (num_ips + 5)) ){
1684                         struct ctdb_public_ip_list *tmp;
1685
1686                         /* Reassign one of maxnode's VNNs */
1687                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1688                                 if (tmp->pnn == maxnode) {
1689                                         (void)find_takeover_node(ctdb, nodemap, mask, tmp, all_ips);
1690                                         (*retries)++;
1691                                         return true;
1692                                 }
1693                         }
1694                 }
1695         }
1696
1697         return false;
1698 }
1699
/* One pending request to force rebalancing onto a node.  Requests are
 * kept in a singly linked list.
 */
struct ctdb_rebalancenodes {
        struct ctdb_rebalancenodes *next;       /* next request in the list, or NULL */
        uint32_t pnn;                           /* node number the request is for */
};
/* Head of the pending-request list.  lcp2_forcerebalance() prepends
 * entries; lcp2_init() consumes and frees them.
 */
static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1705
1706
1707 /* set this flag to force the node to be rebalanced even if it just didnt
1708    become healthy again.
1709 */
1710 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1711 {
1712         struct ctdb_rebalancenodes *rebalance;
1713
1714         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1715                 if (rebalance->pnn == pnn) {
1716                         return;
1717                 }
1718         }
1719
1720         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1721         rebalance->pnn = pnn;
1722         rebalance->next = force_rebalance_list;
1723         force_rebalance_list = rebalance;
1724 }
1725
1726 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1727  * that we can unit test it.
1728  */
1729 static void lcp2_init(struct ctdb_context * tmp_ctx,
1730                struct ctdb_node_map * nodemap,
1731                uint32_t mask,
1732                struct ctdb_public_ip_list *all_ips,
1733                uint32_t **lcp2_imbalances,
1734                bool **newly_healthy)
1735 {
1736         int i;
1737         struct ctdb_public_ip_list *tmp_ip;
1738
1739         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1740         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1741         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1742         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1743
1744         for (i=0;i<nodemap->num;i++) {
1745                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1746                 /* First step: is the node "healthy"? */
1747                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1748         }
1749
1750         /* 2nd step: if a ndoe has IPs assigned then it must have been
1751          * healthy before, so we remove it from consideration... */
1752         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1753                 if (tmp_ip->pnn != -1) {
1754                         (*newly_healthy)[tmp_ip->pnn] = false;
1755                 }
1756         }
1757
1758         /* 3rd step: if a node is forced to re-balance then
1759            we allow failback onto the node */
1760         while (force_rebalance_list != NULL) {
1761                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1762
1763                 if (force_rebalance_list->pnn <= nodemap->num) {
1764                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1765                 }
1766
1767                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1768                 talloc_free(force_rebalance_list);
1769                 force_rebalance_list = next;
1770         }
1771 }
1772
1773 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1774  * the IP/node combination that will cost the least.
1775  */
1776 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1777                               struct ctdb_node_map *nodemap,
1778                               uint32_t mask,
1779                               struct ctdb_public_ip_list *all_ips,
1780                               uint32_t *lcp2_imbalances)
1781 {
1782         struct ctdb_public_ip_list *tmp_ip;
1783         int dstnode;
1784
1785         int minnode;
1786         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1787         struct ctdb_public_ip_list *minip;
1788
1789         bool should_loop = true;
1790         bool have_unassigned = true;
1791
1792         while (have_unassigned && should_loop) {
1793                 should_loop = false;
1794
1795                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1796                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1797
1798                 minnode = -1;
1799                 mindsum = 0;
1800                 minip = NULL;
1801
1802                 /* loop over each unassigned ip. */
1803                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1804                         if (tmp_ip->pnn != -1) {
1805                                 continue;
1806                         }
1807
1808                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1809                                 /* Only check nodes that are allowed to takeover an ip */
1810                                 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1811                                         continue;
1812                                 }
1813
1814                                 /* only check nodes that can actually serve this ip */
1815                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1816                                         /* no it couldnt   so skip to the next node */
1817                                         continue;
1818                                 }
1819                                 if (nodemap->nodes[dstnode].flags & mask) {
1820                                         continue;
1821                                 }
1822
1823                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1824                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1825                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1826                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1827                                                    dstnode,
1828                                                    dstimbl - lcp2_imbalances[dstnode]));
1829
1830
1831                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1832                                         minnode = dstnode;
1833                                         minimbl = dstimbl;
1834                                         mindsum = dstdsum;
1835                                         minip = tmp_ip;
1836                                         should_loop = true;
1837                                 }
1838                         }
1839                 }
1840
1841                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1842
1843                 /* If we found one then assign it to the given node. */
1844                 if (minnode != -1) {
1845                         minip->pnn = minnode;
1846                         lcp2_imbalances[minnode] = minimbl;
1847                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1848                                           ctdb_addr_to_str(&(minip->addr)),
1849                                           minnode,
1850                                           mindsum));
1851                 }
1852
1853                 /* There might be a better way but at least this is clear. */
1854                 have_unassigned = false;
1855                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1856                         if (tmp_ip->pnn == -1) {
1857                                 have_unassigned = true;
1858                         }
1859                 }
1860         }
1861
1862         /* We know if we have an unassigned addresses so we might as
1863          * well optimise.
1864          */
1865         if (have_unassigned) {
1866                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1867                         if (tmp_ip->pnn == -1) {
1868                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1869                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1870                         }
1871                 }
1872         }
1873 }
1874
1875 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1876  * to move IPs from, determines the best IP/destination node
1877  * combination to move from the source node.
1878  */
1879 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1880                                     struct ctdb_node_map *nodemap,
1881                                     struct ctdb_public_ip_list *all_ips,
1882                                     int srcnode,
1883                                     uint32_t candimbl,
1884                                     uint32_t *lcp2_imbalances,
1885                                     bool *newly_healthy)
1886 {
1887         int dstnode, mindstnode;
1888         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1889         uint32_t minsrcimbl, mindstimbl;
1890         struct ctdb_public_ip_list *minip;
1891         struct ctdb_public_ip_list *tmp_ip;
1892
1893         /* Find an IP and destination node that best reduces imbalance. */
1894         minip = NULL;
1895         minsrcimbl = 0;
1896         mindstnode = -1;
1897         mindstimbl = 0;
1898
1899         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1900         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1901
1902         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1903                 /* Only consider addresses on srcnode. */
1904                 if (tmp_ip->pnn != srcnode) {
1905                         continue;
1906                 }
1907
1908                 /* What is this IP address costing the source node? */
1909                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1910                 srcimbl = candimbl - srcdsum;
1911
1912                 /* Consider this IP address would cost each potential
1913                  * destination node.  Destination nodes are limited to
1914                  * those that are newly healthy, since we don't want
1915                  * to do gratuitous failover of IPs just to make minor
1916                  * balance improvements.
1917                  */
1918                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1919                         if (! newly_healthy[dstnode]) {
1920                                 continue;
1921                         }
1922
1923                         /* Only check nodes that are allowed to takeover an ip */
1924                         if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1925                                 continue;
1926                         }
1927
1928                         /* only check nodes that can actually serve this ip */
1929                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1930                                 /* no it couldnt   so skip to the next node */
1931                                 continue;
1932                         }
1933
1934                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1935                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1936                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1937                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1938                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1939                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1940
1941                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1942                             ((mindstnode == -1) ||                              \
1943                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1944
1945                                 minip = tmp_ip;
1946                                 minsrcimbl = srcimbl;
1947                                 mindstnode = dstnode;
1948                                 mindstimbl = dstimbl;
1949                         }
1950                 }
1951         }
1952         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1953
1954         if (mindstnode != -1) {
1955                 /* We found a move that makes things better... */
1956                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1957                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1958                                   ctdb_addr_to_str(&(minip->addr)),
1959                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1960
1961
1962                 lcp2_imbalances[srcnode] = srcimbl;
1963                 lcp2_imbalances[mindstnode] = mindstimbl;
1964                 minip->pnn = mindstnode;
1965
1966                 return true;
1967         }
1968
1969         return false;
1970         
1971 }
1972
/* Pairs a node number with its imbalance so that nodes can be sorted
 * by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders by
 * imbalance, largest first: returns -1 when a's imbalance is greater
 * than b's, 1 when it is smaller and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        /* Compare rather than subtract: the values are unsigned and a
         * difference would not fit the int result reliably. */
        return (lhs->imbalance < rhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
1991
1992 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1993  * node with the highest LCP2 imbalance, and then determines the best
1994  * IP/destination node combination to move from the source node.
1995  */
1996 static bool lcp2_failback(struct ctdb_context *ctdb,
1997                           struct ctdb_node_map *nodemap,
1998                           uint32_t mask,
1999                           struct ctdb_public_ip_list *all_ips,
2000                           uint32_t *lcp2_imbalances,
2001                           bool *newly_healthy)
2002 {
2003         int i, num_newly_healthy;
2004         struct lcp2_imbalance_pnn * lips;
2005         bool ret;
2006
2007         /* It is only worth continuing if we have suitable target
2008          * nodes to transfer IPs to.  This check is much cheaper than
2009          * continuing on...
2010          */
2011         num_newly_healthy = 0;
2012         for (i = 0; i < nodemap->num; i++) {
2013                 if (newly_healthy[i]) {
2014                         num_newly_healthy++;
2015                 }
2016         }
2017         if (num_newly_healthy == 0) {
2018                 return false;
2019         }
2020
2021         /* Put the imbalances and nodes into an array, sort them and
2022          * iterate through candidates.  Usually the 1st one will be
2023          * used, so this doesn't cost much...
2024          */
2025         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2026         for (i = 0; i < nodemap->num; i++) {
2027                 lips[i].imbalance = lcp2_imbalances[i];
2028                 lips[i].pnn = i;
2029         }
2030         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2031               lcp2_cmp_imbalance_pnn);
2032
2033         ret = false;
2034         for (i = 0; i < nodemap->num; i++) {
2035                 /* This means that all nodes had 0 or 1 addresses, so
2036                  * can't be imbalanced.
2037                  */
2038                 if (lips[i].imbalance == 0) {
2039                         break;
2040                 }
2041
2042                 if (lcp2_failback_candidate(ctdb,
2043                                             nodemap,
2044                                             all_ips,
2045                                             lips[i].pnn,
2046                                             lips[i].imbalance,
2047                                             lcp2_imbalances,
2048                                             newly_healthy)) {
2049                         ret = true;
2050                         break;
2051                 }
2052         }
2053
2054         talloc_free(lips);
2055         return ret;
2056 }
2057
2058 /* The calculation part of the IP allocation algorithm. */
2059 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2060                                    struct ctdb_node_map *nodemap,
2061                                    struct ctdb_public_ip_list **all_ips_p)
2062 {
2063         int i, num_healthy, retries, num_ips;
2064         uint32_t mask;
2065         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2066         uint32_t *lcp2_imbalances;
2067         bool *newly_healthy;
2068
2069         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2070
2071         /* Count how many completely healthy nodes we have */
2072         num_healthy = 0;
2073         for (i=0;i<nodemap->num;i++) {
2074                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2075                         num_healthy++;
2076                 }
2077         }
2078
2079         /* If we have healthy nodes then we will only consider them
2080            for serving public addresses
2081         */
2082         mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
2083         if ((num_healthy == 0) &&
2084             (ctdb->tunable.no_ip_takeover_on_disabled == 0)) {
2085                 /* We didnt have any completely healthy nodes so
2086                    use "disabled" nodes as a fallback
2087                 */
2088                 mask = NODE_FLAGS_INACTIVE;
2089         }
2090
2091         /* since nodes only know about those public addresses that
2092            can be served by that particular node, no single node has
2093            a full list of all public addresses that exist in the cluster.
2094            Walk over all node structures and create a merged list of
2095            all public addresses that exist in the cluster.
2096
2097            keep the tree of ips around as ctdb->ip_tree
2098         */
2099         all_ips = create_merged_ip_list(ctdb);
2100         *all_ips_p = all_ips; /* minimal code changes */
2101
2102         /* Count how many ips we have */
2103         num_ips = 0;
2104         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2105                 num_ips++;
2106         }
2107
2108         /* If we want deterministic ip allocations, i.e. that the ip addresses
2109            will always be allocated the same way for a specific set of
2110            available/unavailable nodes.
2111         */
2112         if (1 == ctdb->tunable.deterministic_public_ips) {              
2113                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2114                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2115                         tmp_ip->pnn = i%nodemap->num;
2116                 }
2117
2118                 /* IP failback doesn't make sense with deterministic
2119                  * IPs, since the modulo step above implicitly fails
2120                  * back IPs to their "home" node.
2121                  */
2122                 if (1 == ctdb->tunable.no_ip_failback) {
2123                         DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2124                 }
2125         }
2126
2127
2128         /* mark all public addresses with a masked node as being served by
2129            node -1
2130         */
2131         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2132                 if (tmp_ip->pnn == -1) {
2133                         continue;
2134                 }
2135                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
2136                         tmp_ip->pnn = -1;
2137                 }
2138         }
2139
2140         /* verify that the assigned nodes can serve that public ip
2141            and set it to -1 if not
2142         */
2143         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2144                 if (tmp_ip->pnn == -1) {
2145                         continue;
2146                 }
2147                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2148                         /* this node can not serve this ip. */
2149                         tmp_ip->pnn = -1;
2150                 }
2151         }
2152
2153         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2154                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2155         }
2156
2157         /* now we must redistribute all public addresses with takeover node
2158            -1 among the nodes available
2159         */
2160         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2161                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2162         } else {
2163                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2164         }
2165
2166         /* If we don't want IPs to fail back or if deterministic IPs
2167          * are being used, then don't rebalance IPs.
2168          */
2169         if ((1 == ctdb->tunable.no_ip_failback) ||
2170             (1 == ctdb->tunable.deterministic_public_ips)) {
2171                 goto finished;
2172         }
2173
2174
2175         /* now, try to make sure the ip adresses are evenly distributed
2176            across the node.
2177         */
2178         retries = 0;
2179 try_again:
2180         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2181                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
2182                         goto try_again;
2183                 }
2184         } else {
2185                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
2186                         goto try_again;
2187                 }
2188         }
2189
2190         /* finished distributing the public addresses, now just send the 
2191            info out to the nodes */
2192 finished:
2193         /* at this point ->pnn is the node which will own each IP
2194            or -1 if there is no node that can cover this ip
2195         */
2196
2197         talloc_free(tmp_ctx);
2198
2199         return;
2200 }
2201
2202 static void noiptakeover_cb(struct ctdb_context *ctdb, uint32_t pnn, int32_t res, TDB_DATA outdata, void *callback)
2203 {
2204         struct ctdb_node_map *nodemap = (struct ctdb_node_map *)callback;
2205
2206         if (res != 0) {
2207                 DEBUG(DEBUG_ERR,("Failure to read NoIPTakeover tunable from remote node %d\n", pnn));
2208                 return;
2209         }
2210
2211         if (outdata.dsize != sizeof(uint32_t)) {
2212                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading NoIPTakeover tunable from node %d. Expected %d bytes but received %d bytes\n", pnn, (int)sizeof(uint32_t), (int)outdata.dsize));
2213                 return;
2214         }
2215
2216         if (pnn >= nodemap->num) {
2217                 DEBUG(DEBUG_ERR,("Got NoIPTakeover reply from node %d but nodemap only has %d entries\n", pnn, nodemap->num));
2218                 return;
2219         }
2220
2221         if (*(uint32_t *)outdata.dptr != 0) {
2222                 nodemap->nodes[pnn].flags |= NODE_FLAGS_NOIPTAKEOVER;
2223         }
2224 }
2225
2226 /*
2227   make any IP alias changes for public addresses that are necessary 
2228  */
2229 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2230                       client_async_callback fail_callback, void *callback_data)
2231 {
2232         int i;
2233         struct ctdb_public_ip ip;
2234         struct ctdb_public_ipv4 ipv4;
2235         struct ctdb_control_get_tunable *t;
2236         uint32_t *nodes;
2237         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2238         TDB_DATA data;
2239         struct timeval timeout;
2240         struct client_async_data *async_data;
2241         struct ctdb_client_control_state *state;
2242         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2243         uint32_t disable_timeout;
2244
2245         /*
2246          * ip failover is completely disabled, just send out the 
2247          * ipreallocated event.
2248          */
2249         if (ctdb->tunable.disable_ip_failover != 0) {
2250                 goto ipreallocated;
2251         }
2252
2253
2254         /* assume all nodes do support failback */
2255         for (i=0;i<nodemap->num;i++) {
2256                 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
2257         }
2258         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen("NoIPTakeover") + 1;
2259         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2260         t = (struct ctdb_control_get_tunable *)data.dptr;
2261         t->length = strlen("NoIPTakeover")+1;
2262         memcpy(t->name, "NoIPTakeover", t->length);
2263         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2264         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2265                                       nodes, 0, TAKEOVER_TIMEOUT(),
2266                                       false, data,
2267                                       noiptakeover_cb, NULL,
2268                                       nodemap) != 0) {
2269                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get noiptakeover tunable failed\n"));
2270         }
2271         talloc_free(nodes);
2272         talloc_free(data.dptr);
2273
2274
2275         ZERO_STRUCT(ip);
2276
2277         /* Do the IP reassignment calculations */
2278         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2279
2280         /* The recovery daemon does regular sanity checks of the IPs.
2281          * However, sometimes it is overzealous and thinks changes are
2282          * required when they're already underway.  This stops the
2283          * checks for a while before we start moving IPs.
2284          */
2285         disable_timeout = ctdb->tunable.takeover_timeout;
2286         data.dptr  = (uint8_t*)&disable_timeout;
2287         data.dsize = sizeof(disable_timeout);
2288         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2289                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2290                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2291         }
2292
2293         /* now tell all nodes to delete any alias that they should not
2294            have.  This will be a NOOP on nodes that don't currently
2295            hold the given alias */
2296         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2297         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2298
2299         async_data->fail_callback = fail_callback;
2300         async_data->callback_data = callback_data;
2301
2302         for (i=0;i<nodemap->num;i++) {
2303                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2304                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2305                         continue;
2306                 }
2307
2308                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2309                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2310                                 /* This node should be serving this
2311                                    vnn so dont tell it to release the ip
2312                                 */
2313                                 continue;
2314                         }
2315                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2316                                 ipv4.pnn = tmp_ip->pnn;
2317                                 ipv4.sin = tmp_ip->addr.ip;
2318
2319                                 timeout = TAKEOVER_TIMEOUT();
2320                                 data.dsize = sizeof(ipv4);
2321                                 data.dptr  = (uint8_t *)&ipv4;
2322                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2323                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2324                                                 data, async_data,
2325                                                 &timeout, NULL);
2326                         } else {
2327                                 ip.pnn  = tmp_ip->pnn;
2328                                 ip.addr = tmp_ip->addr;
2329
2330                                 timeout = TAKEOVER_TIMEOUT();
2331                                 data.dsize = sizeof(ip);
2332                                 data.dptr  = (uint8_t *)&ip;
2333                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2334                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2335                                                 data, async_data,
2336                                                 &timeout, NULL);
2337                         }
2338
2339                         if (state == NULL) {
2340                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2341                                 talloc_free(tmp_ctx);
2342                                 return -1;
2343                         }
2344                 
2345                         ctdb_client_async_add(async_data, state);
2346                 }
2347         }
2348         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2349                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2350                 talloc_free(tmp_ctx);
2351                 return -1;
2352         }
2353         talloc_free(async_data);
2354
2355
2356         /* tell all nodes to get their own IPs */
2357         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2358         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2359
2360         async_data->fail_callback = fail_callback;
2361         async_data->callback_data = callback_data;
2362
2363         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2364                 if (tmp_ip->pnn == -1) {
2365                         /* this IP won't be taken over */
2366                         continue;
2367                 }
2368
2369                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2370                         ipv4.pnn = tmp_ip->pnn;
2371                         ipv4.sin = tmp_ip->addr.ip;
2372
2373                         timeout = TAKEOVER_TIMEOUT();
2374                         data.dsize = sizeof(ipv4);
2375                         data.dptr  = (uint8_t *)&ipv4;
2376                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2377                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2378                                         data, async_data,
2379                                         &timeout, NULL);
2380                 } else {
2381                         ip.pnn  = tmp_ip->pnn;
2382                         ip.addr = tmp_ip->addr;
2383
2384                         timeout = TAKEOVER_TIMEOUT();
2385                         data.dsize = sizeof(ip);
2386                         data.dptr  = (uint8_t *)&ip;
2387                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2388                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2389                                         data, async_data,
2390                                         &timeout, NULL);
2391                 }
2392                 if (state == NULL) {
2393                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2394                         talloc_free(tmp_ctx);
2395                         return -1;
2396                 }
2397                 
2398                 ctdb_client_async_add(async_data, state);
2399         }
2400         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2401                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2402                 talloc_free(tmp_ctx);
2403                 return -1;
2404         }
2405
2406 ipreallocated:
2407         /* 
2408          * Tell all nodes to run eventscripts to process the
2409          * "ipreallocated" event.  This can do a lot of things,
2410          * including restarting services to reconfigure them if public
2411          * IPs have moved.  Once upon a time this event only used to
2412          * update natwg.
2413          */
2414         data.dptr  = discard_const("ipreallocated");
2415         data.dsize = strlen((char *)data.dptr) + 1; 
2416         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2417         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2418                                       nodes, 0, TAKEOVER_TIMEOUT(),
2419                                       false, data,
2420                                       NULL, fail_callback,
2421                                       callback_data) != 0) {
2422                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2423         }
2424
2425         talloc_free(tmp_ctx);
2426         return 0;
2427 }
2428
2429
2430 /*
2431   destroy a ctdb_client_ip structure
2432  */
2433 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2434 {
2435         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2436                 ctdb_addr_to_str(&ip->addr),
2437                 ntohs(ip->addr.ip.sin_port),
2438                 ip->client_id));
2439
2440         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2441         return 0;
2442 }
2443
2444 /*
2445   called by a client to inform us of a TCP connection that it is managing
2446   that should tickled with an ACK when IP takeover is done
2447   we handle both the old ipv4 style of packets as well as the new ipv4/6
2448   pdus.
2449  */
2450 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2451                                 TDB_DATA indata)
2452 {
2453         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2454         struct ctdb_control_tcp *old_addr = NULL;
2455         struct ctdb_control_tcp_addr new_addr;
2456         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2457         struct ctdb_tcp_list *tcp;
2458         struct ctdb_tcp_connection t;
2459         int ret;
2460         TDB_DATA data;
2461         struct ctdb_client_ip *ip;
2462         struct ctdb_vnn *vnn;
2463         ctdb_sock_addr addr;
2464
2465         switch (indata.dsize) {
2466         case sizeof(struct ctdb_control_tcp):
2467                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2468                 ZERO_STRUCT(new_addr);
2469                 tcp_sock = &new_addr;
2470                 tcp_sock->src.ip  = old_addr->src;
2471                 tcp_sock->dest.ip = old_addr->dest;
2472                 break;
2473         case sizeof(struct ctdb_control_tcp_addr):
2474                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2475                 break;
2476         default:
2477                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2478                                  "to ctdb_control_tcp_client. size was %d but "
2479                                  "only allowed sizes are %lu and %lu\n",
2480                                  (int)indata.dsize,
2481                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2482                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2483                 return -1;
2484         }
2485
2486         addr = tcp_sock->src;
2487         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2488         addr = tcp_sock->dest;
2489         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2490
2491         ZERO_STRUCT(addr);
2492         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2493         vnn = find_public_ip_vnn(ctdb, &addr);
2494         if (vnn == NULL) {
2495                 switch (addr.sa.sa_family) {
2496                 case AF_INET:
2497                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2498                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2499                                         ctdb_addr_to_str(&addr)));
2500                         }
2501                         break;
2502                 case AF_INET6:
2503                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2504                                 ctdb_addr_to_str(&addr)));
2505                         break;
2506                 default:
2507                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2508                 }
2509
2510                 return 0;
2511         }
2512
2513         if (vnn->pnn != ctdb->pnn) {
2514                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2515                         ctdb_addr_to_str(&addr),
2516                         client_id, client->pid));
2517                 /* failing this call will tell smbd to die */
2518                 return -1;
2519         }
2520
2521         ip = talloc(client, struct ctdb_client_ip);
2522         CTDB_NO_MEMORY(ctdb, ip);
2523
2524         ip->ctdb      = ctdb;
2525         ip->addr      = addr;
2526         ip->client_id = client_id;
2527         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2528         DLIST_ADD(ctdb->client_ip_list, ip);
2529
2530         tcp = talloc(client, struct ctdb_tcp_list);
2531         CTDB_NO_MEMORY(ctdb, tcp);
2532
2533         tcp->connection.src_addr = tcp_sock->src;
2534         tcp->connection.dst_addr = tcp_sock->dest;
2535
2536         DLIST_ADD(client->tcp_list, tcp);
2537
2538         t.src_addr = tcp_sock->src;
2539         t.dst_addr = tcp_sock->dest;
2540
2541         data.dptr = (uint8_t *)&t;
2542         data.dsize = sizeof(t);
2543
2544         switch (addr.sa.sa_family) {
2545         case AF_INET:
2546                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2547                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2548                         ctdb_addr_to_str(&tcp_sock->src),
2549                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2550                 break;
2551         case AF_INET6:
2552                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2553                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2554                         ctdb_addr_to_str(&tcp_sock->src),
2555                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2556                 break;
2557         default:
2558                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2559         }
2560
2561
2562         /* tell all nodes about this tcp connection */
2563         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2564                                        CTDB_CONTROL_TCP_ADD,
2565                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2566         if (ret != 0) {
2567                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2568                 return -1;
2569         }
2570
2571         return 0;
2572 }
2573
2574 /*
2575   find a tcp address on a list
2576  */
2577 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2578                                            struct ctdb_tcp_connection *tcp)
2579 {
2580         int i;
2581
2582         if (array == NULL) {
2583                 return NULL;
2584         }
2585
2586         for (i=0;i<array->num;i++) {
2587                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2588                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2589                         return &array->connections[i];
2590                 }
2591         }
2592         return NULL;
2593 }
2594
2595
2596
2597 /*
2598   called by a daemon to inform us of a TCP connection that one of its
2599   clients managing that should tickled with an ACK when IP takeover is
2600   done
2601  */
2602 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2603 {
2604         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2605         struct ctdb_tcp_array *tcparray;
2606         struct ctdb_tcp_connection tcp;
2607         struct ctdb_vnn *vnn;
2608
2609         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2610         if (vnn == NULL) {
2611                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2612                         ctdb_addr_to_str(&p->dst_addr)));
2613
2614                 return -1;
2615         }
2616
2617
2618         tcparray = vnn->tcp_array;
2619
2620         /* If this is the first tickle */
2621         if (tcparray == NULL) {
2622                 tcparray = talloc_size(ctdb->nodes, 
2623                         offsetof(struct ctdb_tcp_array, connections) +
2624                         sizeof(struct ctdb_tcp_connection) * 1);
2625                 CTDB_NO_MEMORY(ctdb, tcparray);
2626                 vnn->tcp_array = tcparray;
2627
2628                 tcparray->num = 0;
2629                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2630                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2631
2632                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2633                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2634                 tcparray->num++;
2635
2636                 if (tcp_update_needed) {
2637                         vnn->tcp_update_needed = true;
2638                 }
2639                 return 0;
2640         }
2641
2642
2643         /* Do we already have this tickle ?*/
2644         tcp.src_addr = p->src_addr;
2645         tcp.dst_addr = p->dst_addr;
2646         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2647                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2648                         ctdb_addr_to_str(&tcp.dst_addr),
2649                         ntohs(tcp.dst_addr.ip.sin_port),
2650                         vnn->pnn));
2651                 return 0;
2652         }
2653
2654         /* A new tickle, we must add it to the array */
2655         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2656                                         struct ctdb_tcp_connection,
2657                                         tcparray->num+1);
2658         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2659
2660         vnn->tcp_array = tcparray;
2661         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2662         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2663         tcparray->num++;
2664                                 
2665         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2666                 ctdb_addr_to_str(&tcp.dst_addr),
2667                 ntohs(tcp.dst_addr.ip.sin_port),
2668                 vnn->pnn));
2669
2670         if (tcp_update_needed) {
2671                 vnn->tcp_update_needed = true;
2672         }
2673
2674         return 0;
2675 }
2676
2677
2678 /*
2679   called by a daemon to inform us of a TCP connection that one of its
2680   clients managing that should tickled with an ACK when IP takeover is
2681   done
2682  */
2683 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2684 {
2685         struct ctdb_tcp_connection *tcpp;
2686         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2687
2688         if (vnn == NULL) {
2689                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2690                         ctdb_addr_to_str(&conn->dst_addr)));
2691                 return;
2692         }
2693
2694         /* if the array is empty we cant remove it
2695            and we dont need to do anything
2696          */
2697         if (vnn->tcp_array == NULL) {
2698                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2699                         ctdb_addr_to_str(&conn->dst_addr),
2700                         ntohs(conn->dst_addr.ip.sin_port)));
2701                 return;
2702         }
2703
2704
2705         /* See if we know this connection
2706            if we dont know this connection  then we dont need to do anything
2707          */
2708         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2709         if (tcpp == NULL) {
2710                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2711                         ctdb_addr_to_str(&conn->dst_addr),
2712                         ntohs(conn->dst_addr.ip.sin_port)));
2713                 return;
2714         }
2715
2716
2717         /* We need to remove this entry from the array.
2718            Instead of allocating a new array and copying data to it
2719            we cheat and just copy the last entry in the existing array
2720            to the entry that is to be removed and just shring the 
2721            ->num field
2722          */
2723         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2724         vnn->tcp_array->num--;
2725
2726         /* If we deleted the last entry we also need to remove the entire array
2727          */
2728         if (vnn->tcp_array->num == 0) {
2729                 talloc_free(vnn->tcp_array);
2730                 vnn->tcp_array = NULL;
2731         }               
2732
2733         vnn->tcp_update_needed = true;
2734
2735         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2736                 ctdb_addr_to_str(&conn->src_addr),
2737                 ntohs(conn->src_addr.ip.sin_port)));
2738 }
2739
2740
2741 /*
2742   called by a daemon to inform us of a TCP connection that one of its
2743   clients used are no longer needed in the tickle database
2744  */
2745 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2746 {
2747         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2748
2749         ctdb_remove_tcp_connection(ctdb, conn);
2750
2751         return 0;
2752 }
2753
2754
/*
  called when a daemon restarts.  the intent is to send all tickles for
  all public addresses we are serving to the new node immediately, but
  that is not implemented: this is a stub that ignores its arguments and
  returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */

        return 0;
}
2764
2765
2766 /*
2767   called when a client structure goes away - hook to remove
2768   elements from the tcp_list in all daemons
2769  */
2770 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2771 {
2772         while (client->tcp_list) {
2773                 struct ctdb_tcp_list *tcp = client->tcp_list;
2774                 DLIST_REMOVE(client->tcp_list, tcp);
2775                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2776         }
2777 }
2778
2779
2780 /*
2781   release all IPs on shutdown
2782  */
2783 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2784 {
2785         struct ctdb_vnn *vnn;
2786
2787         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2788                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2789                         ctdb_vnn_unassign_iface(ctdb, vnn);
2790                         continue;
2791                 }
2792                 if (!vnn->iface) {
2793                         continue;
2794                 }
2795                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2796                                   ctdb_vnn_iface_string(vnn),
2797                                   ctdb_addr_to_str(&vnn->public_address),
2798                                   vnn->public_netmask_bits);
2799                 release_kill_clients(ctdb, &vnn->public_address);
2800                 ctdb_vnn_unassign_iface(ctdb, vnn);
2801         }
2802 }
2803
2804
2805 /*
2806   get list of public IPs
2807  */
2808 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2809                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2810 {
2811         int i, num, len;
2812         struct ctdb_all_public_ips *ips;
2813         struct ctdb_vnn *vnn;
2814         bool only_available = false;
2815
2816         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2817                 only_available = true;
2818         }
2819
2820         /* count how many public ip structures we have */
2821         num = 0;
2822         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2823                 num++;
2824         }
2825
2826         len = offsetof(struct ctdb_all_public_ips, ips) + 
2827                 num*sizeof(struct ctdb_public_ip);
2828         ips = talloc_zero_size(outdata, len);
2829         CTDB_NO_MEMORY(ctdb, ips);
2830
2831         i = 0;
2832         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2833                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2834                         continue;
2835                 }
2836                 ips->ips[i].pnn  = vnn->pnn;
2837                 ips->ips[i].addr = vnn->public_address;
2838                 i++;
2839         }
2840         ips->num = i;
2841         len = offsetof(struct ctdb_all_public_ips, ips) +
2842                 i*sizeof(struct ctdb_public_ip);
2843
2844         outdata->dsize = len;
2845         outdata->dptr  = (uint8_t *)ips;
2846
2847         return 0;
2848 }
2849
2850
2851 /*
2852   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2853  */
2854 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2855                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2856 {
2857         int i, num, len;
2858         struct ctdb_all_public_ipsv4 *ips;
2859         struct ctdb_vnn *vnn;
2860
2861         /* count how many public ip structures we have */
2862         num = 0;
2863         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2864                 if (vnn->public_address.sa.sa_family != AF_INET) {
2865                         continue;
2866                 }
2867                 num++;
2868         }
2869
2870         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2871                 num*sizeof(struct ctdb_public_ipv4);
2872         ips = talloc_zero_size(outdata, len);
2873         CTDB_NO_MEMORY(ctdb, ips);
2874
2875         outdata->dsize = len;
2876         outdata->dptr  = (uint8_t *)ips;
2877
2878         ips->num = num;
2879         i = 0;
2880         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2881                 if (vnn->public_address.sa.sa_family != AF_INET) {
2882                         continue;
2883                 }
2884                 ips->ips[i].pnn = vnn->pnn;
2885                 ips->ips[i].sin = vnn->public_address.ip;
2886                 i++;
2887         }
2888
2889         return 0;
2890 }
2891
2892 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2893                                         struct ctdb_req_control *c,
2894                                         TDB_DATA indata,
2895                                         TDB_DATA *outdata)
2896 {
2897         int i, num, len;
2898         ctdb_sock_addr *addr;
2899         struct ctdb_control_public_ip_info *info;
2900         struct ctdb_vnn *vnn;
2901
2902         addr = (ctdb_sock_addr *)indata.dptr;
2903
2904         vnn = find_public_ip_vnn(ctdb, addr);
2905         if (vnn == NULL) {
2906                 /* if it is not a public ip   it could be our 'single ip' */
2907                 if (ctdb->single_ip_vnn) {
2908                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2909                                 vnn = ctdb->single_ip_vnn;
2910                         }
2911                 }
2912         }
2913         if (vnn == NULL) {
2914                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2915                                  "'%s'not a public address\n",
2916                                  ctdb_addr_to_str(addr)));
2917                 return -1;
2918         }
2919
2920         /* count how many public ip structures we have */
2921         num = 0;
2922         for (;vnn->ifaces[num];) {
2923                 num++;
2924         }
2925
2926         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2927                 num*sizeof(struct ctdb_control_iface_info);
2928         info = talloc_zero_size(outdata, len);
2929         CTDB_NO_MEMORY(ctdb, info);
2930
2931         info->ip.addr = vnn->public_address;
2932         info->ip.pnn = vnn->pnn;
2933         info->active_idx = 0xFFFFFFFF;
2934
2935         for (i=0; vnn->ifaces[i]; i++) {
2936                 struct ctdb_iface *cur;
2937
2938                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2939                 if (cur == NULL) {
2940                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2941                                            vnn->ifaces[i]));
2942                         return -1;
2943                 }
2944                 if (vnn->iface == cur) {
2945                         info->active_idx = i;
2946                 }
2947                 strcpy(info->ifaces[i].name, cur->name);
2948                 info->ifaces[i].link_state = cur->link_up;
2949                 info->ifaces[i].references = cur->references;
2950         }
2951         info->num = i;
2952         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2953                 i*sizeof(struct ctdb_control_iface_info);
2954
2955         outdata->dsize = len;
2956         outdata->dptr  = (uint8_t *)info;
2957
2958         return 0;
2959 }
2960
2961 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2962                                 struct ctdb_req_control *c,
2963                                 TDB_DATA *outdata)
2964 {
2965         int i, num, len;
2966         struct ctdb_control_get_ifaces *ifaces;
2967         struct ctdb_iface *cur;
2968
2969         /* count how many public ip structures we have */
2970         num = 0;
2971         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2972                 num++;
2973         }
2974
2975         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2976                 num*sizeof(struct ctdb_control_iface_info);
2977         ifaces = talloc_zero_size(outdata, len);
2978         CTDB_NO_MEMORY(ctdb, ifaces);
2979
2980         i = 0;
2981         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2982                 strcpy(ifaces->ifaces[i].name, cur->name);
2983                 ifaces->ifaces[i].link_state = cur->link_up;
2984                 ifaces->ifaces[i].references = cur->references;
2985                 i++;
2986         }
2987         ifaces->num = i;
2988         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2989                 i*sizeof(struct ctdb_control_iface_info);
2990
2991         outdata->dsize = len;
2992         outdata->dptr  = (uint8_t *)ifaces;
2993
2994         return 0;
2995 }
2996
2997 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2998                                     struct ctdb_req_control *c,
2999                                     TDB_DATA indata)
3000 {
3001         struct ctdb_control_iface_info *info;
3002         struct ctdb_iface *iface;
3003         bool link_up = false;
3004
3005         info = (struct ctdb_control_iface_info *)indata.dptr;
3006
3007         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3008                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3009                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3010                                   len, len, info->name));
3011                 return -1;
3012         }
3013
3014         switch (info->link_state) {
3015         case 0:
3016                 link_up = false;
3017                 break;
3018         case 1:
3019                 link_up = true;
3020                 break;
3021         default:
3022                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3023                                   (unsigned int)info->link_state));
3024                 return -1;
3025         }
3026
3027         if (info->references != 0) {
3028                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3029                                   (unsigned int)info->references));
3030                 return -1;
3031         }
3032
3033         iface = ctdb_find_iface(ctdb, info->name);
3034         if (iface == NULL) {
3035                 return -1;
3036         }
3037
3038         if (link_up == iface->link_up) {
3039                 return 0;
3040         }
3041
3042         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3043               ("iface[%s] has changed it's link status %s => %s\n",
3044                iface->name,
3045                iface->link_up?"up":"down",
3046                link_up?"up":"down"));
3047
3048         iface->link_up = link_up;
3049         return 0;
3050 }
3051
3052
3053 /* 
3054    structure containing the listening socket and the list of tcp connections
3055    that the ctdb daemon is to kill
3056 */
3057 struct ctdb_kill_tcp {
3058         struct ctdb_vnn *vnn;
3059         struct ctdb_context *ctdb;
3060         int capture_fd;
3061         struct fd_event *fde;
3062         trbt_tree_t *connections;
3063         void *private_data;
3064 };
3065
3066 /*
3067   a tcp connection that is to be killed
3068  */
3069 struct ctdb_killtcp_con {
3070         ctdb_sock_addr src_addr;
3071         ctdb_sock_addr dst_addr;
3072         int count;
3073         struct ctdb_kill_tcp *killtcp;
3074 };
3075
3076 /* this function is used to create a key to represent this socketpair
3077    in the killtcp tree.
3078    this key is used to insert and lookup matching socketpairs that are
3079    to be tickled and RST
3080 */
3081 #define KILLTCP_KEYLEN  10
3082 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3083 {
3084         static uint32_t key[KILLTCP_KEYLEN];
3085
3086         bzero(key, sizeof(key));
3087
3088         if (src->sa.sa_family != dst->sa.sa_family) {
3089                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3090                 return key;
3091         }
3092         
3093         switch (src->sa.sa_family) {
3094         case AF_INET:
3095                 key[0]  = dst->ip.sin_addr.s_addr;
3096                 key[1]  = src->ip.sin_addr.s_addr;
3097                 key[2]  = dst->ip.sin_port;
3098                 key[3]  = src->ip.sin_port;
3099                 break;
3100         case AF_INET6: {
3101                 uint32_t *dst6_addr32 =
3102                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3103                 uint32_t *src6_addr32 =
3104                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3105                 key[0]  = dst6_addr32[3];
3106                 key[1]  = src6_addr32[3];
3107                 key[2]  = dst6_addr32[2];
3108                 key[3]  = src6_addr32[2];
3109                 key[4]  = dst6_addr32[1];
3110                 key[5]  = src6_addr32[1];
3111                 key[6]  = dst6_addr32[0];
3112                 key[7]  = src6_addr32[0];
3113                 key[8]  = dst->ip6.sin6_port;
3114                 key[9]  = src->ip6.sin6_port;
3115                 break;
3116         }
3117         default:
3118                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3119                 return key;
3120         }
3121
3122         return key;
3123 }
3124
3125 /*
3126   called when we get a read event on the raw socket
3127  */
3128 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3129                                 uint16_t flags, void *private_data)
3130 {
3131         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3132         struct ctdb_killtcp_con *con;
3133         ctdb_sock_addr src, dst;
3134         uint32_t ack_seq, seq;
3135
3136         if (!(flags & EVENT_FD_READ)) {
3137                 return;
3138         }
3139
3140         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3141                                 killtcp->private_data,
3142                                 &src, &dst,
3143                                 &ack_seq, &seq) != 0) {
3144                 /* probably a non-tcp ACK packet */
3145                 return;
3146         }
3147
3148         /* check if we have this guy in our list of connections
3149            to kill
3150         */
3151         con = trbt_lookuparray32(killtcp->connections, 
3152                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3153         if (con == NULL) {
3154                 /* no this was some other packet we can just ignore */
3155                 return;
3156         }
3157
3158         /* This one has been tickled !
3159            now reset him and remove him from the list.
3160          */
3161         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3162                 ntohs(con->dst_addr.ip.sin_port),
3163                 ctdb_addr_to_str(&con->src_addr),
3164                 ntohs(con->src_addr.ip.sin_port)));
3165
3166         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3167         talloc_free(con);
3168 }
3169
3170
3171 /* when traversing the list of all tcp connections to send tickle acks to
3172    (so that we can capture the ack coming back and kill the connection
3173     by a RST)
3174    this callback is called for each connection we are currently trying to kill
3175 */
3176 static int tickle_connection_traverse(void *param, void *data)
3177 {
3178         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3179
3180         /* have tried too many times, just give up */
3181         if (con->count >= 5) {
3182                 /* can't delete in traverse: reparent to delete_cons */
3183                 talloc_steal(param, con);
3184                 return 0;
3185         }
3186
3187         /* othervise, try tickling it again */
3188         con->count++;
3189         ctdb_sys_send_tcp(
3190                 (ctdb_sock_addr *)&con->dst_addr,
3191                 (ctdb_sock_addr *)&con->src_addr,
3192                 0, 0, 0);
3193         return 0;
3194 }
3195
3196
3197 /* 
3198    called every second until all sentenced connections have been reset
3199  */
3200 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3201                                               struct timeval t, void *private_data)
3202 {
3203         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3204         void *delete_cons = talloc_new(NULL);
3205
3206         /* loop over all connections sending tickle ACKs */
3207         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3208
3209         /* now we've finished traverse, it's safe to do deletion. */
3210         talloc_free(delete_cons);
3211
3212         /* If there are no more connections to kill we can remove the
3213            entire killtcp structure
3214          */
3215         if ( (killtcp->connections == NULL) || 
3216              (killtcp->connections->root == NULL) ) {
3217                 talloc_free(killtcp);
3218                 return;
3219         }
3220
3221         /* try tickling them again in a seconds time
3222          */
3223         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3224                         ctdb_tickle_sentenced_connections, killtcp);
3225 }
3226
3227 /*
3228   destroy the killtcp structure
3229  */
3230 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3231 {
3232         struct ctdb_vnn *tmpvnn;
3233
3234         /* verify that this vnn is still active */
3235         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3236                 if (tmpvnn == killtcp->vnn) {
3237                         break;
3238                 }
3239         }
3240
3241         if (tmpvnn == NULL) {
3242                 return 0;
3243         }
3244
3245         if (killtcp->vnn->killtcp != killtcp) {
3246                 return 0;
3247         }
3248
3249         killtcp->vnn->killtcp = NULL;
3250
3251         return 0;
3252 }
3253
3254
/* Insert callback for the connection tree: the new entry (parm) always
   wins over whatever was stored under the same key (data).

   The previous entry is deliberately not freed here; it is talloc_stolen
   by the same tree node and goes away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;	/* the old entry is intentionally ignored */

	return parm;
}
3266
3267 /*
3268   add a tcp socket to the list of connections we want to RST
3269  */
3270 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3271                                        ctdb_sock_addr *s,
3272                                        ctdb_sock_addr *d)
3273 {
3274         ctdb_sock_addr src, dst;
3275         struct ctdb_kill_tcp *killtcp;
3276         struct ctdb_killtcp_con *con;
3277         struct ctdb_vnn *vnn;
3278
3279         ctdb_canonicalize_ip(s, &src);
3280         ctdb_canonicalize_ip(d, &dst);
3281
3282         vnn = find_public_ip_vnn(ctdb, &dst);
3283         if (vnn == NULL) {
3284                 vnn = find_public_ip_vnn(ctdb, &src);
3285         }
3286         if (vnn == NULL) {
3287                 /* if it is not a public ip   it could be our 'single ip' */
3288                 if (ctdb->single_ip_vnn) {
3289                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3290                                 vnn = ctdb->single_ip_vnn;
3291                         }
3292                 }
3293         }
3294         if (vnn == NULL) {
3295                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3296                 return -1;
3297         }
3298
3299         killtcp = vnn->killtcp;
3300         
3301         /* If this is the first connection to kill we must allocate
3302            a new structure
3303          */
3304         if (killtcp == NULL) {
3305                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3306                 CTDB_NO_MEMORY(ctdb, killtcp);
3307
3308                 killtcp->vnn         = vnn;
3309                 killtcp->ctdb        = ctdb;
3310                 killtcp->capture_fd  = -1;
3311                 killtcp->connections = trbt_create(killtcp, 0);
3312
3313                 vnn->killtcp         = killtcp;
3314                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3315         }
3316
3317
3318
3319         /* create a structure that describes this connection we want to
3320            RST and store it in killtcp->connections
3321         */
3322         con = talloc(killtcp, struct ctdb_killtcp_con);
3323         CTDB_NO_MEMORY(ctdb, con);
3324         con->src_addr = src;
3325         con->dst_addr = dst;
3326         con->count    = 0;
3327         con->killtcp  = killtcp;
3328
3329
3330         trbt_insertarray32_callback(killtcp->connections,
3331                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3332                         add_killtcp_callback, con);
3333
3334         /* 
3335            If we dont have a socket to listen on yet we must create it
3336          */
3337         if (killtcp->capture_fd == -1) {
3338                 const char *iface = ctdb_vnn_iface_string(vnn);
3339                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3340                 if (killtcp->capture_fd == -1) {
3341                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3342                                           "socket on iface '%s' for killtcp (%s)\n",
3343                                           iface, strerror(errno)));
3344                         goto failed;
3345                 }
3346         }
3347
3348
3349         if (killtcp->fde == NULL) {
3350                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3351                                             EVENT_FD_READ,
3352                                             capture_tcp_handler, killtcp);
3353                 tevent_fd_set_auto_close(killtcp->fde);
3354
3355                 /* We also need to set up some events to tickle all these connections
3356                    until they are all reset
3357                 */
3358                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3359                                 ctdb_tickle_sentenced_connections, killtcp);
3360         }
3361
3362         /* tickle him once now */
3363         ctdb_sys_send_tcp(
3364                 &con->dst_addr,
3365                 &con->src_addr,
3366                 0, 0, 0);
3367
3368         return 0;
3369
3370 failed:
3371         talloc_free(vnn->killtcp);
3372         vnn->killtcp = NULL;
3373         return -1;
3374 }
3375
3376 /*
3377   kill a TCP connection.
3378  */
3379 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3380 {
3381         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3382
3383         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3384 }
3385
3386 /*
3387   called by a daemon to inform us of the entire list of TCP tickles for
3388   a particular public address.
3389   this control should only be sent by the node that is currently serving
3390   that public address.
3391  */
3392 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3393 {
3394         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3395         struct ctdb_tcp_array *tcparray;
3396         struct ctdb_vnn *vnn;
3397
3398         /* We must at least have tickles.num or else we cant verify the size
3399            of the received data blob
3400          */
3401         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3402                                         tickles.connections)) {
3403                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3404                 return -1;
3405         }
3406
3407         /* verify that the size of data matches what we expect */
3408         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3409                                 tickles.connections)
3410                          + sizeof(struct ctdb_tcp_connection)
3411                                  * list->tickles.num) {
3412                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3413                 return -1;
3414         }       
3415
3416         vnn = find_public_ip_vnn(ctdb, &list->addr);
3417         if (vnn == NULL) {
3418                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3419                         ctdb_addr_to_str(&list->addr)));
3420
3421                 return 1;
3422         }
3423
3424         /* remove any old ticklelist we might have */
3425         talloc_free(vnn->tcp_array);
3426         vnn->tcp_array = NULL;
3427
3428         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3429         CTDB_NO_MEMORY(ctdb, tcparray);
3430
3431         tcparray->num = list->tickles.num;
3432
3433         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3434         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3435
3436         memcpy(tcparray->connections, &list->tickles.connections[0], 
3437                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3438
3439         /* We now have a new fresh tickle list array for this vnn */
3440         vnn->tcp_array = talloc_steal(vnn, tcparray);
3441         
3442         return 0;
3443 }
3444
3445 /*
3446   called to return the full list of tickles for the puclic address associated 
3447   with the provided vnn
3448  */
3449 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3450 {
3451         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3452         struct ctdb_control_tcp_tickle_list *list;
3453         struct ctdb_tcp_array *tcparray;
3454         int num;
3455         struct ctdb_vnn *vnn;
3456
3457         vnn = find_public_ip_vnn(ctdb, addr);
3458         if (vnn == NULL) {
3459                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3460                         ctdb_addr_to_str(addr)));
3461
3462                 return 1;
3463         }
3464
3465         tcparray = vnn->tcp_array;
3466         if (tcparray) {
3467                 num = tcparray->num;
3468         } else {
3469                 num = 0;
3470         }
3471
3472         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3473                                 tickles.connections)
3474                         + sizeof(struct ctdb_tcp_connection) * num;
3475
3476         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3477         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3478         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3479
3480         list->addr = *addr;
3481         list->tickles.num = num;
3482         if (num) {
3483                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3484                         sizeof(struct ctdb_tcp_connection) * num);
3485         }
3486
3487         return 0;
3488 }
3489
3490
3491 /*
3492   set the list of all tcp tickles for a public address
3493  */
3494 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3495                               struct timeval timeout, uint32_t destnode, 
3496                               ctdb_sock_addr *addr,
3497                               struct ctdb_tcp_array *tcparray)
3498 {
3499         int ret, num;
3500         TDB_DATA data;
3501         struct ctdb_control_tcp_tickle_list *list;
3502
3503         if (tcparray) {
3504                 num = tcparray->num;
3505         } else {
3506                 num = 0;
3507         }
3508
3509         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3510                                 tickles.connections) +
3511                         sizeof(struct ctdb_tcp_connection) * num;
3512         data.dptr = talloc_size(ctdb, data.dsize);
3513         CTDB_NO_MEMORY(ctdb, data.dptr);
3514
3515         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3516         list->addr = *addr;
3517         list->tickles.num = num;
3518         if (tcparray) {
3519                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3520         }
3521
3522         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3523                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3524                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3525         if (ret != 0) {
3526                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3527                 return -1;
3528         }
3529
3530         talloc_free(data.dptr);
3531
3532         return ret;
3533 }
3534
3535
3536 /*
3537   perform tickle updates if required
3538  */
3539 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3540                                 struct timed_event *te, 
3541                                 struct timeval t, void *private_data)
3542 {
3543         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3544         int ret;
3545         struct ctdb_vnn *vnn;
3546
3547         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3548                 /* we only send out updates for public addresses that 
3549                    we have taken over
3550                  */
3551                 if (ctdb->pnn != vnn->pnn) {
3552                         continue;
3553                 }
3554                 /* We only send out the updates if we need to */
3555                 if (!vnn->tcp_update_needed) {
3556                         continue;
3557                 }
3558                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3559                                 TAKEOVER_TIMEOUT(),
3560                                 CTDB_BROADCAST_CONNECTED,
3561                                 &vnn->public_address,
3562                                 vnn->tcp_array);
3563                 if (ret != 0) {
3564                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3565                                 ctdb_addr_to_str(&vnn->public_address)));
3566                 }
3567         }
3568
3569         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3570                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3571                              ctdb_update_tcp_tickles, ctdb);
3572 }               
3573         
3574
3575 /*
3576   start periodic update of tcp tickles
3577  */
3578 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3579 {
3580         ctdb->tickle_update_context = talloc_new(ctdb);
3581
3582         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3583                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3584                              ctdb_update_tcp_tickles, ctdb);
3585 }
3586
3587
3588
3589
3590 struct control_gratious_arp {
3591         struct ctdb_context *ctdb;
3592         ctdb_sock_addr addr;
3593         const char *iface;
3594         int count;
3595 };
3596
3597 /*
3598   send a control_gratuitous arp
3599  */
3600 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3601                                   struct timeval t, void *private_data)
3602 {
3603         int ret;
3604         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3605                                                         struct control_gratious_arp);
3606
3607         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3608         if (ret != 0) {
3609                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3610                                  arp->iface, strerror(errno)));
3611         }
3612
3613
3614         arp->count++;
3615         if (arp->count == CTDB_ARP_REPEAT) {
3616                 talloc_free(arp);
3617                 return;
3618         }
3619
3620         event_add_timed(arp->ctdb->ev, arp, 
3621                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3622                         send_gratious_arp, arp);
3623 }
3624
3625
3626 /*
3627   send a gratious arp 
3628  */
3629 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3630 {
3631         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3632         struct control_gratious_arp *arp;
3633
3634         /* verify the size of indata */
3635         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3636                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3637                                  (unsigned)indata.dsize, 
3638                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3639                 return -1;
3640         }
3641         if (indata.dsize != 
3642                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3643                 + gratious_arp->len ) ){
3644
3645                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3646                         "but should be %u bytes\n", 
3647                          (unsigned)indata.dsize, 
3648                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3649                 return -1;
3650         }
3651
3652
3653         arp = talloc(ctdb, struct control_gratious_arp);
3654         CTDB_NO_MEMORY(ctdb, arp);
3655
3656         arp->ctdb  = ctdb;
3657         arp->addr   = gratious_arp->addr;
3658         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3659         CTDB_NO_MEMORY(ctdb, arp->iface);
3660         arp->count = 0;
3661         
3662         event_add_timed(arp->ctdb->ev, arp, 
3663                         timeval_zero(), send_gratious_arp, arp);
3664
3665         return 0;
3666 }
3667
3668 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3669 {
3670         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3671         int ret;
3672
3673         /* verify the size of indata */
3674         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3675                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3676                 return -1;
3677         }
3678         if (indata.dsize != 
3679                 ( offsetof(struct ctdb_control_ip_iface, iface)
3680                 + pub->len ) ){
3681
3682                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3683                         "but should be %u bytes\n", 
3684                          (unsigned)indata.dsize, 
3685                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3686                 return -1;
3687         }
3688
3689         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3690
3691         if (ret != 0) {
3692                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3693                 return -1;
3694         }
3695
3696         return 0;
3697 }
3698
/*
  Completion callback for the releaseip event started by
  del_public_address.  private_data is the talloc context set up for the
  deletion; it is released here regardless of the event status.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *deletion_ctx = private_data;

	talloc_free(deletion_ctx);
}
3707
3708 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3709 {
3710         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3711         struct ctdb_vnn *vnn;
3712         int ret;
3713
3714         /* verify the size of indata */
3715         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3716                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3717                 return -1;
3718         }
3719         if (indata.dsize != 
3720                 ( offsetof(struct ctdb_control_ip_iface, iface)
3721                 + pub->len ) ){
3722
3723                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3724                         "but should be %u bytes\n", 
3725                          (unsigned)indata.dsize, 
3726                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3727                 return -1;
3728         }
3729
3730         /* walk over all public addresses until we find a match */
3731         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3732                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3733                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3734
3735                         DLIST_REMOVE(ctdb->vnn, vnn);
3736                         talloc_steal(mem_ctx, vnn);
3737                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3738                         if (vnn->pnn != ctdb->pnn) {
3739                                 if (vnn->iface != NULL) {
3740                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3741                                 }
3742                                 talloc_free(mem_ctx);
3743                                 return 0;
3744                         }
3745                         vnn->pnn = -1;
3746
3747                         ret = ctdb_event_script_callback(ctdb, 
3748                                          mem_ctx, delete_ip_callback, mem_ctx,
3749                                          false,
3750                                          CTDB_EVENT_RELEASE_IP,
3751                                          "%s %s %u",
3752                                          ctdb_vnn_iface_string(vnn),
3753                                          ctdb_addr_to_str(&vnn->public_address),
3754                                          vnn->public_netmask_bits);
3755                         if (vnn->iface != NULL) {
3756                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3757                         }
3758                         if (ret != 0) {
3759                                 return -1;
3760                         }
3761                         return 0;
3762                 }
3763         }
3764
3765         return -1;
3766 }
3767
3768 /* This function is called from the recovery daemon to verify that a remote
3769    node has the expected ip allocation.
3770    This is verified against ctdb->ip_tree
3771 */
3772 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3773 {
3774         struct ctdb_public_ip_list *tmp_ip; 
3775         int i;
3776
3777         if (ctdb->ip_tree == NULL) {
3778                 /* dont know the expected allocation yet, assume remote node
3779                    is correct. */
3780                 return 0;
3781         }
3782
3783         if (ips == NULL) {
3784                 return 0;
3785         }
3786
3787         for (i=0; i<ips->num; i++) {
3788                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3789                 if (tmp_ip == NULL) {
3790                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3791                         return -1;
3792                 }
3793
3794                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3795                         continue;
3796                 }
3797
3798                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3799                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3800                         return -1;
3801                 }
3802         }
3803
3804         return 0;
3805 }
3806
3807 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3808 {
3809         struct ctdb_public_ip_list *tmp_ip; 
3810
3811         if (ctdb->ip_tree == NULL) {
3812                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3813                 return -1;
3814         }
3815
3816         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3817         if (tmp_ip == NULL) {
3818                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3819                 return -1;
3820         }
3821
3822         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3823         tmp_ip->pnn = ip->pnn;
3824
3825         return 0;
3826 }
3827
3828
/* State of one reloadips operation that is carried out by a child process */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;	/* daemon context */
	struct ctdb_req_control *c;	/* control to reply to when done, or NULL */
	int status;			/* status sent back in that reply */
	int fd[2];			/* fd[0] is read for the child's one byte result */
	pid_t child;			/* child process, killed when the handle is freed */
	struct fd_event *fde;		/* presumably the fd event watching fd[0] -- confirm at setup site */
};
3837
3838 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3839 {
3840         if (h == h->ctdb->reload_ips) {
3841                 h->ctdb->reload_ips = NULL;
3842         }
3843         if (h->c != NULL) {
3844                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3845                 h->c = NULL;
3846         }
3847         ctdb_kill(h->ctdb, h->child, SIGKILL);
3848         return 0;
3849 }
3850
3851 static void ctdb_reloadips_timeout_event(struct event_context *ev,
3852                                 struct timed_event *te,
3853                                 struct timeval t, void *private_data)
3854 {
3855         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3856
3857         talloc_free(h);
3858 }       
3859
3860 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
3861                              uint16_t flags, void *private_data)
3862 {
3863         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3864
3865         char res;
3866         int ret;
3867
3868         ret = read(h->fd[0], &res, 1);
3869         if (ret < 1 || res != 0) {
3870                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3871                 res = 1;
3872         }
3873         h->status = res;
3874
3875         talloc_free(h);
3876 }
3877
3878 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3879 {
3880         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3881         struct ctdb_all_public_ips *ips;
3882         struct ctdb_vnn *vnn;
3883         int i, ret;
3884
3885         /* read the ip allocation from the local node */
3886         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
3887         if (ret != 0) {
3888                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
3889                 talloc_free(mem_ctx);
3890                 return -1;
3891         }
3892
3893         /* re-read the public ips file */
3894         ctdb->vnn = NULL;
3895         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3896                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3897                 talloc_free(mem_ctx);
3898                 return -1;
3899         }               
3900
3901
3902         /* check the previous list of ips and scan for ips that have been
3903            dropped.
3904          */
3905         for (i = 0; i < ips->num; i++) {
3906                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3907                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3908                                 break;
3909                         }
3910                 }
3911
3912                 /* we need to delete this ip, no longer available on this node */
3913                 if (vnn == NULL) {
3914                         struct ctdb_control_ip_iface pub;
3915
3916                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3917                         pub.addr  = ips->ips[i].addr;
3918                         pub.mask  = 0;
3919                         pub.len   = 0;
3920
3921                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3922                         if (ret != 0) {
3923                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3924                                 return -1;
3925                         }
3926                 }
3927         }
3928
3929
3930         /* loop over all new ones and check the ones we need to add */
3931         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3932                 for (i = 0; i < ips->num; i++) {
3933                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3934                                 break;
3935                         }
3936                 }
3937                 if (i == ips->num) {
3938                         struct ctdb_control_ip_iface pub;
3939                         const char *ifaces = NULL;
3940                         int iface = 0;
3941
3942                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
3943
3944                         pub.addr  = vnn->public_address;
3945                         pub.mask  = vnn->public_netmask_bits;
3946
3947
3948                         ifaces = vnn->ifaces[0];
3949                         iface = 1;
3950                         while (vnn->ifaces[iface] != NULL) {
3951                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
3952                                 iface++;
3953                         }
3954                         pub.len   = strlen(ifaces)+1;
3955                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
3956
3957                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3958                         if (ret != 0) {
3959                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
3960                                 return -1;
3961                         }
3962                 }
3963         }
3964
3965         return 0;
3966 }
3967
3968 /* This control is sent to force the node to re-read the public addresses file
3969    and drop any addresses we should nnot longer host, and add new addresses
3970    that we are now able to host
3971 */
3972 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
3973 {
3974         struct ctdb_reloadips_handle *h;
3975         pid_t parent = getpid();
3976
3977         if (ctdb->reload_ips != NULL) {
3978                 talloc_free(ctdb->reload_ips);
3979                 ctdb->reload_ips = NULL;
3980         }
3981
3982         h = talloc(ctdb, struct ctdb_reloadips_handle);
3983         CTDB_NO_MEMORY(ctdb, h);
3984         h->ctdb     = ctdb;
3985         h->c        = NULL;
3986         h->status   = -1;
3987         
3988         if (pipe(h->fd) == -1) {
3989                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3990                 talloc_free(h);
3991                 return -1;
3992         }
3993
3994         h->child = ctdb_fork(ctdb);
3995         if (h->child == (pid_t)-1) {
3996                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
3997                 close(h->fd[0]);
3998                 close(h->fd[1]);
3999                 talloc_free(h);
4000                 return -1;
4001         }
4002
4003         /* child process */
4004         if (h->child == 0) {
4005                 signed char res = 0;
4006
4007                 close(h->fd[0]);
4008                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4009
4010                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4011                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4012                         res = -1;
4013                 } else {
4014                         res = ctdb_reloadips_child(ctdb);
4015                         if (res != 0) {
4016                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4017                         }
4018                 }
4019
4020                 write(h->fd[1], &res, 1);
4021                 /* make sure we die when our parent dies */
4022                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4023                         sleep(5);
4024                 }
4025                 _exit(0);
4026         }
4027
4028         h->c             = talloc_steal(h, c);
4029
4030         close(h->fd[1]);
4031         set_close_on_exec(h->fd[0]);
4032
4033         talloc_set_destructor(h, ctdb_reloadips_destructor);
4034
4035
4036         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4037                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4038                         (void *)h);
4039         tevent_fd_set_auto_close(h->fde);
4040
4041         event_add_timed(ctdb->ev, h,
4042                         timeval_current_ofs(120, 0),
4043                         ctdb_reloadips_timeout_event, h);
4044
4045         /* we reply later */
4046         *async_reply = true;
4047         return 0;
4048 }