recoverd: Fix a memory leak in IP allocation
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Deadline helper: timeval_current_ofs() applied to the takeover_timeout
 * tunable.  The expansion reads a variable named `ctdb`, so one must be in
 * scope wherever this macro is used.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/*
 * Gratuitous ARP schedule: CTDB_ARP_INTERVAL is the first argument given
 * to timeval_current_ofs() when re-arming the ARP timer, CTDB_ARP_REPEAT
 * is the number of rounds sent before the ARP state is freed.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
 * A local network interface that public addresses can be placed on.
 * Entries are kept on the doubly-linked list rooted at ctdb->ifaces.
 */
struct ctdb_iface {
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	const char *name;	/* interface name, matched with strcmp() */
	bool link_up;		/* only interfaces with link_up set are picked */
	uint32_t references;	/* count of VNNs currently assigned here */
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
278 struct ctdb_takeover_arp {
279         struct ctdb_context *ctdb;
280         uint32_t count;
281         ctdb_sock_addr addr;
282         struct ctdb_tcp_array *tcparray;
283         struct ctdb_vnn *vnn;
284 };
285
286
287 /*
288   lists of tcp endpoints
289  */
290 struct ctdb_tcp_list {
291         struct ctdb_tcp_list *prev, *next;
292         struct ctdb_tcp_connection connection;
293 };
294
295 /*
296   list of clients to kill on IP release
297  */
298 struct ctdb_client_ip {
299         struct ctdb_client_ip *prev, *next;
300         struct ctdb_context *ctdb;
301         ctdb_sock_addr addr;
302         uint32_t client_id;
303 };
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/*
 * Per-request state carried from ctdb_do_takeip() to its completion
 * callback.
 */
struct ctdb_do_takeip_state {
	struct ctdb_req_control *c;	/* control to reply to */
	struct ctdb_vnn *vnn;		/* VNN being taken over */
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/*
 * Per-request state carried from ctdb_do_updateip() to its completion
 * callback.
 */
struct ctdb_do_updateip_state {
	struct ctdb_req_control *c;	/* control to reply to */
	struct ctdb_iface *old;		/* interface the address is moving from */
	struct ctdb_vnn *vnn;		/* VNN being moved */
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Check if this is a public ip known to the node, i.e. can that
1303    node takeover this ip ?
1304 */
1305 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                 struct ctdb_public_ip_list *ip)
1307 {
1308         struct ctdb_all_public_ips *public_ips;
1309         int i;
1310
1311         public_ips = ctdb->nodes[pnn]->available_public_ips;
1312
1313         if (public_ips == NULL) {
1314                 return -1;
1315         }
1316
1317         for (i=0;i<public_ips->num;i++) {
1318                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1319                         /* yes, this node can serve this public ip */
1320                         return 0;
1321                 }
1322         }
1323
1324         return -1;
1325 }
1326
1327
1328 /* search the node lists list for a node to takeover this ip.
1329    pick the node that currently are serving the least number of ips
1330    so that the ips get spread out evenly.
1331 */
1332 static int find_takeover_node(struct ctdb_context *ctdb, 
1333                 struct ctdb_node_map *nodemap, uint32_t mask, 
1334                 struct ctdb_public_ip_list *ip,
1335                 struct ctdb_public_ip_list *all_ips)
1336 {
1337         int pnn, min=0, num;
1338         int i;
1339
1340         pnn    = -1;
1341         for (i=0;i<nodemap->num;i++) {
1342                 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1343                         /* This node is not allowed to takeover any addresses
1344                         */
1345                         continue;
1346                 }
1347
1348                 if (nodemap->nodes[i].flags & mask) {
1349                         /* This node is not healty and can not be used to serve
1350                            a public address 
1351                         */
1352                         continue;
1353                 }
1354
1355                 /* verify that this node can serve this ip */
1356                 if (can_node_serve_ip(ctdb, i, ip)) {
1357                         /* no it couldnt   so skip to the next node */
1358                         continue;
1359                 }
1360
1361                 num = node_ip_coverage(ctdb, i, all_ips);
1362                 /* was this the first node we checked ? */
1363                 if (pnn == -1) {
1364                         pnn = i;
1365                         min  = num;
1366                 } else {
1367                         if (num < min) {
1368                                 pnn = i;
1369                                 min  = num;
1370                         }
1371                 }
1372         }       
1373         if (pnn == -1) {
1374                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1375                         ctdb_addr_to_str(&ip->addr)));
1376
1377                 return -1;
1378         }
1379
1380         ip->pnn = pnn;
1381         return 0;
1382 }
1383
1384 #define IP_KEYLEN       4
1385 static uint32_t *ip_key(ctdb_sock_addr *ip)
1386 {
1387         static uint32_t key[IP_KEYLEN];
1388
1389         bzero(key, sizeof(key));
1390
1391         switch (ip->sa.sa_family) {
1392         case AF_INET:
1393                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1394                 break;
1395         case AF_INET6: {
1396                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1397                 key[0]  = htonl(s6_a32[0]);
1398                 key[1]  = htonl(s6_a32[1]);
1399                 key[2]  = htonl(s6_a32[2]);
1400                 key[3]  = htonl(s6_a32[3]);
1401                 break;
1402         }
1403         default:
1404                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1405                 return key;
1406         }
1407
1408         return key;
1409 }
1410
1411 static void *add_ip_callback(void *parm, void *data)
1412 {
1413         struct ctdb_public_ip_list *this_ip = parm; 
1414         struct ctdb_public_ip_list *prev_ip = data; 
1415
1416         if (prev_ip == NULL) {
1417                 return parm;
1418         }
1419         if (this_ip->pnn == -1) {
1420                 this_ip->pnn = prev_ip->pnn;
1421         }
1422
1423         return parm;
1424 }
1425
1426 static int getips_count_callback(void *param, void *data)
1427 {
1428         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1429         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1430
1431         new_ip->next = *ip_list;
1432         *ip_list     = new_ip;
1433         return 0;
1434 }
1435
1436 static struct ctdb_public_ip_list *
1437 create_merged_ip_list(struct ctdb_context *ctdb)
1438 {
1439         int i, j;
1440         struct ctdb_public_ip_list *ip_list;
1441         struct ctdb_all_public_ips *public_ips;
1442
1443         if (ctdb->ip_tree != NULL) {
1444                 talloc_free(ctdb->ip_tree);
1445                 ctdb->ip_tree = NULL;
1446         }
1447         ctdb->ip_tree = trbt_create(ctdb, 0);
1448
1449         for (i=0;i<ctdb->num_nodes;i++) {
1450                 public_ips = ctdb->nodes[i]->known_public_ips;
1451
1452                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1453                         continue;
1454                 }
1455
1456                 /* there were no public ips for this node */
1457                 if (public_ips == NULL) {
1458                         continue;
1459                 }               
1460
1461                 for (j=0;j<public_ips->num;j++) {
1462                         struct ctdb_public_ip_list *tmp_ip; 
1463
1464                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1465                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1466                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1467                         tmp_ip->addr = public_ips->ips[j].addr;
1468                         tmp_ip->next = NULL;
1469
1470                         trbt_insertarray32_callback(ctdb->ip_tree,
1471                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1472                                 add_ip_callback,
1473                                 tmp_ip);
1474                 }
1475         }
1476
1477         ip_list = NULL;
1478         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1479
1480         return ip_list;
1481 }
1482
1483 /* 
1484  * This is the length of the longtest common prefix between the IPs.
1485  * It is calculated by XOR-ing the 2 IPs together and counting the
1486  * number of leading zeroes.  The implementation means that all
1487  * addresses end up being 128 bits long.
1488  *
1489  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1490  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1491  * lots of nodes and IP addresses?
1492  */
1493 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1494 {
1495         uint32_t ip1_k[IP_KEYLEN];
1496         uint32_t *t;
1497         int i;
1498         uint32_t x;
1499
1500         uint32_t distance = 0;
1501
1502         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1503         t = ip_key(ip2);
1504         for (i=0; i<IP_KEYLEN; i++) {
1505                 x = ip1_k[i] ^ t[i];
1506                 if (x == 0) {
1507                         distance += 32;
1508                 } else {
1509                         /* Count number of leading zeroes. 
1510                          * FIXME? This could be optimised...
1511                          */
1512                         while ((x & (1 << 31)) == 0) {
1513                                 x <<= 1;
1514                                 distance += 1;
1515                         }
1516                 }
1517         }
1518
1519         return distance;
1520 }
1521
1522 /* Calculate the IP distance for the given IP relative to IPs on the
1523    given node.  The ips argument is generally the all_ips variable
1524    used in the main part of the algorithm.
1525  */
1526 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1527                                   struct ctdb_public_ip_list *ips,
1528                                   int pnn)
1529 {
1530         struct ctdb_public_ip_list *t;
1531         uint32_t d;
1532
1533         uint32_t sum = 0;
1534
1535         for (t=ips; t != NULL; t=t->next) {
1536                 if (t->pnn != pnn) {
1537                         continue;
1538                 }
1539
1540                 /* Optimisation: We never calculate the distance
1541                  * between an address and itself.  This allows us to
1542                  * calculate the effect of removing an address from a
1543                  * node by simply calculating the distance between
1544                  * that address and all of the exitsing addresses.
1545                  * Moreover, we assume that we're only ever dealing
1546                  * with addresses from all_ips so we can identify an
1547                  * address via a pointer rather than doing a more
1548                  * expensive address comparison. */
1549                 if (&(t->addr) == ip) {
1550                         continue;
1551                 }
1552
1553                 d = ip_distance(ip, &(t->addr));
1554                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1555         }
1556
1557         return sum;
1558 }
1559
1560 /* Return the LCP2 imbalance metric for addresses currently assigned
1561    to the given node.
1562  */
1563 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1564 {
1565         struct ctdb_public_ip_list *t;
1566
1567         uint32_t imbalance = 0;
1568
1569         for (t=all_ips; t!=NULL; t=t->next) {
1570                 if (t->pnn != pnn) {
1571                         continue;
1572                 }
1573                 /* Pass the rest of the IPs rather than the whole
1574                    all_ips input list.
1575                 */
1576                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1577         }
1578
1579         return imbalance;
1580 }
1581
1582 /* Allocate any unassigned IPs just by looping through the IPs and
1583  * finding the best node for each.
1584  */
1585 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1586                                       struct ctdb_node_map *nodemap,
1587                                       uint32_t mask,
1588                                       struct ctdb_public_ip_list *all_ips)
1589 {
1590         struct ctdb_public_ip_list *tmp_ip;
1591
1592         /* loop over all ip's and find a physical node to cover for 
1593            each unassigned ip.
1594         */
1595         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1596                 if (tmp_ip->pnn == -1) {
1597                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1598                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1599                                         ctdb_addr_to_str(&tmp_ip->addr)));
1600                         }
1601                 }
1602         }
1603 }
1604
1605 /* Basic non-deterministic rebalancing algorithm.
1606  */
1607 static bool basic_failback(struct ctdb_context *ctdb,
1608                            struct ctdb_node_map *nodemap,
1609                            uint32_t mask,
1610                            struct ctdb_public_ip_list *all_ips,
1611                            int num_ips,
1612                            int *retries)
1613 {
1614         int i;
1615         int maxnode, maxnum=0, minnode, minnum=0, num;
1616         struct ctdb_public_ip_list *tmp_ip;
1617
1618         /* for each ip address, loop over all nodes that can serve
1619            this ip and make sure that the difference between the node
1620            serving the most and the node serving the least ip's are
1621            not greater than 1.
1622         */
1623         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1624                 if (tmp_ip->pnn == -1) {
1625                         continue;
1626                 }
1627
1628                 /* Get the highest and lowest number of ips's served by any 
1629                    valid node which can serve this ip.
1630                 */
1631                 maxnode = -1;
1632                 minnode = -1;
1633                 for (i=0;i<nodemap->num;i++) {
1634                         if (nodemap->nodes[i].flags & mask) {
1635                                 continue;
1636                         }
1637
1638                         /* Only check nodes that are allowed to takeover an ip */
1639                         if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1640                                 continue;
1641                         }
1642
1643                         /* only check nodes that can actually serve this ip */
1644                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1645                                 /* no it couldnt   so skip to the next node */
1646                                 continue;
1647                         }
1648
1649                         num = node_ip_coverage(ctdb, i, all_ips);
1650                         if (maxnode == -1) {
1651                                 maxnode = i;
1652                                 maxnum  = num;
1653                         } else {
1654                                 if (num > maxnum) {
1655                                         maxnode = i;
1656                                         maxnum  = num;
1657                                 }
1658                         }
1659                         if (minnode == -1) {
1660                                 minnode = i;
1661                                 minnum  = num;
1662                         } else {
1663                                 if (num < minnum) {
1664                                         minnode = i;
1665                                         minnum  = num;
1666                                 }
1667                         }
1668                 }
1669                 if (maxnode == -1) {
1670                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1671                                 ctdb_addr_to_str(&tmp_ip->addr)));
1672
1673                         continue;
1674                 }
1675
1676                 /* If we want deterministic IPs then dont try to reallocate 
1677                    them to spread out the load.
1678                 */
1679                 if (1 == ctdb->tunable.deterministic_public_ips) {
1680                         continue;
1681                 }
1682
1683                 /* if the spread between the smallest and largest coverage by
1684                    a node is >=2 we steal one of the ips from the node with
1685                    most coverage to even things out a bit.
1686                    try to do this a limited number of times since we dont
1687                    want to spend too much time balancing the ip coverage.
1688                 */
1689                 if ( (maxnum > minnum+1)
1690                      && (*retries < (num_ips + 5)) ){
1691                         struct ctdb_public_ip_list *tmp;
1692
1693                         /* mark one of maxnode's vnn's as unassigned and try
1694                            again
1695                         */
1696                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1697                                 if (tmp->pnn == maxnode) {
1698                                         tmp->pnn = -1;
1699                                         (*retries)++;
1700                                         return true;
1701                                 }
1702                         }
1703                 }
1704         }
1705
1706         return false;
1707 }
1708
/* One entry in a singly-linked list of nodes (identified by pnn) for
 * which a rebalance has been explicitly requested.
 */
struct ctdb_rebalancenodes {
        struct ctdb_rebalancenodes *next;
        uint32_t pnn;
};
/* Head of the pending forced-rebalance list.  Entries are pushed by
 * lcp2_forcerebalance() and consumed (and freed) by lcp2_init().
 */
static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1714
1715
1716 /* set this flag to force the node to be rebalanced even if it just didnt
1717    become healthy again.
1718 */
1719 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1720 {
1721         struct ctdb_rebalancenodes *rebalance;
1722
1723         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1724                 if (rebalance->pnn == pnn) {
1725                         return;
1726                 }
1727         }
1728
1729         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1730         rebalance->pnn = pnn;
1731         rebalance->next = force_rebalance_list;
1732         force_rebalance_list = rebalance;
1733 }
1734
1735 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1736  * that we can unit test it.
1737  */
1738 static void lcp2_init(struct ctdb_context * tmp_ctx,
1739                struct ctdb_node_map * nodemap,
1740                uint32_t mask,
1741                struct ctdb_public_ip_list *all_ips,
1742                uint32_t **lcp2_imbalances,
1743                bool **newly_healthy)
1744 {
1745         int i;
1746         struct ctdb_public_ip_list *tmp_ip;
1747
1748         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1750         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1751         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752
1753         for (i=0;i<nodemap->num;i++) {
1754                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1755                 /* First step: is the node "healthy"? */
1756                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1757         }
1758
1759         /* 2nd step: if a ndoe has IPs assigned then it must have been
1760          * healthy before, so we remove it from consideration... */
1761         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1762                 if (tmp_ip->pnn != -1) {
1763                         (*newly_healthy)[tmp_ip->pnn] = false;
1764                 }
1765         }
1766
1767         /* 3rd step: if a node is forced to re-balance then
1768            we allow failback onto the node */
1769         while (force_rebalance_list != NULL) {
1770                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1771
1772                 if (force_rebalance_list->pnn <= nodemap->num) {
1773                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1774                 }
1775
1776                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1777                 talloc_free(force_rebalance_list);
1778                 force_rebalance_list = next;
1779         }
1780 }
1781
1782 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1783  * the IP/node combination that will cost the least.
1784  */
1785 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1786                               struct ctdb_node_map *nodemap,
1787                               uint32_t mask,
1788                               struct ctdb_public_ip_list *all_ips,
1789                               uint32_t *lcp2_imbalances)
1790 {
1791         struct ctdb_public_ip_list *tmp_ip;
1792         int dstnode;
1793
1794         int minnode;
1795         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1796         struct ctdb_public_ip_list *minip;
1797
1798         bool should_loop = true;
1799         bool have_unassigned = true;
1800
1801         while (have_unassigned && should_loop) {
1802                 should_loop = false;
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1806
1807                 minnode = -1;
1808                 mindsum = 0;
1809                 minip = NULL;
1810
1811                 /* loop over each unassigned ip. */
1812                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1813                         if (tmp_ip->pnn != -1) {
1814                                 continue;
1815                         }
1816
1817                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1818                                 /* Only check nodes that are allowed to takeover an ip */
1819                                 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1820                                         continue;
1821                                 }
1822
1823                                 /* only check nodes that can actually serve this ip */
1824                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1825                                         /* no it couldnt   so skip to the next node */
1826                                         continue;
1827                                 }
1828                                 if (nodemap->nodes[dstnode].flags & mask) {
1829                                         continue;
1830                                 }
1831
1832                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1833                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1834                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1835                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1836                                                    dstnode,
1837                                                    dstimbl - lcp2_imbalances[dstnode]));
1838
1839
1840                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1841                                         minnode = dstnode;
1842                                         minimbl = dstimbl;
1843                                         mindsum = dstdsum;
1844                                         minip = tmp_ip;
1845                                         should_loop = true;
1846                                 }
1847                         }
1848                 }
1849
1850                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1851
1852                 /* If we found one then assign it to the given node. */
1853                 if (minnode != -1) {
1854                         minip->pnn = minnode;
1855                         lcp2_imbalances[minnode] = minimbl;
1856                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1857                                           ctdb_addr_to_str(&(minip->addr)),
1858                                           minnode,
1859                                           mindsum));
1860                 }
1861
1862                 /* There might be a better way but at least this is clear. */
1863                 have_unassigned = false;
1864                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1865                         if (tmp_ip->pnn == -1) {
1866                                 have_unassigned = true;
1867                         }
1868                 }
1869         }
1870
1871         /* We know if we have an unassigned addresses so we might as
1872          * well optimise.
1873          */
1874         if (have_unassigned) {
1875                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876                         if (tmp_ip->pnn == -1) {
1877                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1878                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1879                         }
1880                 }
1881         }
1882 }
1883
1884 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1885  * to move IPs from, determines the best IP/destination node
1886  * combination to move from the source node.
1887  */
1888 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1889                                     struct ctdb_node_map *nodemap,
1890                                     struct ctdb_public_ip_list *all_ips,
1891                                     int srcnode,
1892                                     uint32_t candimbl,
1893                                     uint32_t *lcp2_imbalances,
1894                                     bool *newly_healthy)
1895 {
1896         int dstnode, mindstnode;
1897         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1898         uint32_t minsrcimbl, mindstimbl;
1899         struct ctdb_public_ip_list *minip;
1900         struct ctdb_public_ip_list *tmp_ip;
1901
1902         /* Find an IP and destination node that best reduces imbalance. */
1903         minip = NULL;
1904         minsrcimbl = 0;
1905         mindstnode = -1;
1906         mindstimbl = 0;
1907
1908         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1909         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1910
1911         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1912                 /* Only consider addresses on srcnode. */
1913                 if (tmp_ip->pnn != srcnode) {
1914                         continue;
1915                 }
1916
1917                 /* What is this IP address costing the source node? */
1918                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1919                 srcimbl = candimbl - srcdsum;
1920
1921                 /* Consider this IP address would cost each potential
1922                  * destination node.  Destination nodes are limited to
1923                  * those that are newly healthy, since we don't want
1924                  * to do gratuitous failover of IPs just to make minor
1925                  * balance improvements.
1926                  */
1927                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1928                         if (! newly_healthy[dstnode]) {
1929                                 continue;
1930                         }
1931
1932                         /* Only check nodes that are allowed to takeover an ip */
1933                         if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1934                                 continue;
1935                         }
1936
1937                         /* only check nodes that can actually serve this ip */
1938                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1939                                 /* no it couldnt   so skip to the next node */
1940                                 continue;
1941                         }
1942
1943                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1944                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1945                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1946                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1947                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1948                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1949
1950                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1951                             ((mindstnode == -1) ||                              \
1952                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1953
1954                                 minip = tmp_ip;
1955                                 minsrcimbl = srcimbl;
1956                                 mindstnode = dstnode;
1957                                 mindstimbl = dstimbl;
1958                         }
1959                 }
1960         }
1961         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1962
1963         if (mindstnode != -1) {
1964                 /* We found a move that makes things better... */
1965                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1966                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1967                                   ctdb_addr_to_str(&(minip->addr)),
1968                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1969
1970
1971                 lcp2_imbalances[srcnode] = srcimbl;
1972                 lcp2_imbalances[mindstnode] = mindstimbl;
1973                 minip->pnn = mindstnode;
1974
1975                 return true;
1976         }
1977
1978         return false;
1979         
1980 }
1981
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders
 * entries by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const uint32_t lhs = ((const struct lcp2_imbalance_pnn *) a)->imbalance;
        const uint32_t rhs = ((const struct lcp2_imbalance_pnn *) b)->imbalance;

        /* Each comparison yields 0 or 1, so this is -1, 0 or 1. */
        return (lhs < rhs) - (lhs > rhs);
}
2000
2001 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2002  * node with the highest LCP2 imbalance, and then determines the best
2003  * IP/destination node combination to move from the source node.
2004  */
2005 static bool lcp2_failback(struct ctdb_context *ctdb,
2006                           struct ctdb_node_map *nodemap,
2007                           uint32_t mask,
2008                           struct ctdb_public_ip_list *all_ips,
2009                           uint32_t *lcp2_imbalances,
2010                           bool *newly_healthy)
2011 {
2012         int i, num_newly_healthy;
2013         struct lcp2_imbalance_pnn * lips;
2014         bool ret;
2015
2016         /* It is only worth continuing if we have suitable target
2017          * nodes to transfer IPs to.  This check is much cheaper than
2018          * continuing on...
2019          */
2020         num_newly_healthy = 0;
2021         for (i = 0; i < nodemap->num; i++) {
2022                 if (newly_healthy[i]) {
2023                         num_newly_healthy++;
2024                 }
2025         }
2026         if (num_newly_healthy == 0) {
2027                 return false;
2028         }
2029
2030         /* Put the imbalances and nodes into an array, sort them and
2031          * iterate through candidates.  Usually the 1st one will be
2032          * used, so this doesn't cost much...
2033          */
2034         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2035         for (i = 0; i < nodemap->num; i++) {
2036                 lips[i].imbalance = lcp2_imbalances[i];
2037                 lips[i].pnn = i;
2038         }
2039         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2040               lcp2_cmp_imbalance_pnn);
2041
2042         ret = false;
2043         for (i = 0; i < nodemap->num; i++) {
2044                 /* This means that all nodes had 0 or 1 addresses, so
2045                  * can't be imbalanced.
2046                  */
2047                 if (lips[i].imbalance == 0) {
2048                         break;
2049                 }
2050
2051                 if (lcp2_failback_candidate(ctdb,
2052                                             nodemap,
2053                                             all_ips,
2054                                             lips[i].pnn,
2055                                             lips[i].imbalance,
2056                                             lcp2_imbalances,
2057                                             newly_healthy)) {
2058                         ret = true;
2059                         break;
2060                 }
2061         }
2062
2063         talloc_free(lips);
2064         return ret;
2065 }
2066
2067 /* The calculation part of the IP allocation algorithm. */
2068 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2069                                    struct ctdb_node_map *nodemap,
2070                                    struct ctdb_public_ip_list **all_ips_p)
2071 {
2072         int i, num_healthy, retries, num_ips;
2073         uint32_t mask;
2074         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2075         uint32_t *lcp2_imbalances;
2076         bool *newly_healthy;
2077
2078         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2079
2080         /* Count how many completely healthy nodes we have */
2081         num_healthy = 0;
2082         for (i=0;i<nodemap->num;i++) {
2083                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2084                         num_healthy++;
2085                 }
2086         }
2087
2088         /* If we have healthy nodes then we will only consider them
2089            for serving public addresses
2090         */
2091         mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
2092         if ((num_healthy == 0) &&
2093             (ctdb->tunable.no_ip_takeover_on_disabled == 0)) {
2094                 /* We didnt have any completely healthy nodes so
2095                    use "disabled" nodes as a fallback
2096                 */
2097                 mask = NODE_FLAGS_INACTIVE;
2098         }
2099
2100         /* since nodes only know about those public addresses that
2101            can be served by that particular node, no single node has
2102            a full list of all public addresses that exist in the cluster.
2103            Walk over all node structures and create a merged list of
2104            all public addresses that exist in the cluster.
2105
2106            keep the tree of ips around as ctdb->ip_tree
2107         */
2108         all_ips = create_merged_ip_list(ctdb);
2109         *all_ips_p = all_ips; /* minimal code changes */
2110
2111         /* Count how many ips we have */
2112         num_ips = 0;
2113         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2114                 num_ips++;
2115         }
2116
2117         /* If we want deterministic ip allocations, i.e. that the ip addresses
2118            will always be allocated the same way for a specific set of
2119            available/unavailable nodes.
2120         */
2121         if (1 == ctdb->tunable.deterministic_public_ips) {              
2122                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2123                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2124                         tmp_ip->pnn = i%nodemap->num;
2125                 }
2126         }
2127
2128
2129         /* mark all public addresses with a masked node as being served by
2130            node -1
2131         */
2132         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2133                 if (tmp_ip->pnn == -1) {
2134                         continue;
2135                 }
2136                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
2137                         tmp_ip->pnn = -1;
2138                 }
2139         }
2140
2141         /* verify that the assigned nodes can serve that public ip
2142            and set it to -1 if not
2143         */
2144         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2145                 if (tmp_ip->pnn == -1) {
2146                         continue;
2147                 }
2148                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2149                         /* this node can not serve this ip. */
2150                         tmp_ip->pnn = -1;
2151                 }
2152         }
2153
2154         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2155                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2156         }
2157
2158         /* now we must redistribute all public addresses with takeover node
2159            -1 among the nodes available
2160         */
2161         retries = 0;
2162 try_again:
2163         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2164                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2165         } else {
2166                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2167         }
2168
2169         /* If we dont want ips to fail back after a node becomes healthy
2170            again, we wont even try to reallocat the ip addresses so that
2171            they are evenly spread out.
2172            This can NOT be used at the same time as DeterministicIPs !
2173         */
2174         if (1 == ctdb->tunable.no_ip_failback) {
2175                 if (1 == ctdb->tunable.deterministic_public_ips) {
2176                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
2177                 }
2178                 goto finished;
2179         }
2180
2181
2182         /* now, try to make sure the ip adresses are evenly distributed
2183            across the node.
2184         */
2185         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2186                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
2187                         goto try_again;
2188                 }
2189         } else {
2190                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
2191                         goto try_again;
2192                 }
2193         }
2194
2195         /* finished distributing the public addresses, now just send the 
2196            info out to the nodes */
2197 finished:
2198         /* at this point ->pnn is the node which will own each IP
2199            or -1 if there is no node that can cover this ip
2200         */
2201
2202         talloc_free(tmp_ctx);
2203
2204         return;
2205 }
2206
2207 static void noiptakeover_cb(struct ctdb_context *ctdb, uint32_t pnn, int32_t res, TDB_DATA outdata, void *callback)
2208 {
2209         struct ctdb_node_map *nodemap = (struct ctdb_node_map *)callback;
2210
2211         if (res != 0) {
2212                 DEBUG(DEBUG_ERR,("Failure to read NoIPTakeover tunable from remote node %d\n", pnn));
2213                 return;
2214         }
2215
2216         if (outdata.dsize != sizeof(uint32_t)) {
2217                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading NoIPTakeover tunable from node %d. Expected %d bytes but received %d bytes\n", pnn, (int)sizeof(uint32_t), (int)outdata.dsize));
2218                 return;
2219         }
2220
2221         if (pnn >= nodemap->num) {
2222                 DEBUG(DEBUG_ERR,("Got NoIPTakeover reply from node %d but nodemap only has %d entries\n", pnn, nodemap->num));
2223                 return;
2224         }
2225
2226         if (*(uint32_t *)outdata.dptr != 0) {
2227                 nodemap->nodes[pnn].flags |= NODE_FLAGS_NOIPTAKEOVER;
2228         }
2229 }
2230
2231 /*
2232   make any IP alias changes for public addresses that are necessary 
2233  */
2234 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2235                       client_async_callback fail_callback, void *callback_data)
2236 {
2237         int i;
2238         struct ctdb_public_ip ip;
2239         struct ctdb_public_ipv4 ipv4;
2240         struct ctdb_control_get_tunable *t;
2241         uint32_t *nodes;
2242         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2243         TDB_DATA data;
2244         struct timeval timeout;
2245         struct client_async_data *async_data;
2246         struct ctdb_client_control_state *state;
2247         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2248         uint32_t disable_timeout;
2249
2250         /*
2251          * ip failover is completely disabled, just send out the 
2252          * ipreallocated event.
2253          */
2254         if (ctdb->tunable.disable_ip_failover != 0) {
2255                 goto ipreallocated;
2256         }
2257
2258
2259         /* assume all nodes do support failback */
2260         for (i=0;i<nodemap->num;i++) {
2261                 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
2262         }
2263         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen("NoIPTakeover") + 1;
2264         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2265         t = (struct ctdb_control_get_tunable *)data.dptr;
2266         t->length = strlen("NoIPTakeover")+1;
2267         memcpy(t->name, "NoIPTakeover", t->length);
2268         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2269         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2270                                       nodes, 0, TAKEOVER_TIMEOUT(),
2271                                       false, data,
2272                                       noiptakeover_cb, NULL,
2273                                       nodemap) != 0) {
2274                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get noiptakeover tunable failed\n"));
2275         }
2276         talloc_free(nodes);
2277         talloc_free(data.dptr);
2278
2279
2280         ZERO_STRUCT(ip);
2281
2282         /* Do the IP reassignment calculations */
2283         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2284
2285         /* The recovery daemon does regular sanity checks of the IPs.
2286          * However, sometimes it is overzealous and thinks changes are
2287          * required when they're already underway.  This stops the
2288          * checks for a while before we start moving IPs.
2289          */
2290         disable_timeout = ctdb->tunable.takeover_timeout;
2291         data.dptr  = (uint8_t*)&disable_timeout;
2292         data.dsize = sizeof(disable_timeout);
2293         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2294                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2295                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2296         }
2297
2298         /* now tell all nodes to delete any alias that they should not
2299            have.  This will be a NOOP on nodes that don't currently
2300            hold the given alias */
2301         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2302         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2303
2304         async_data->fail_callback = fail_callback;
2305         async_data->callback_data = callback_data;
2306
2307         for (i=0;i<nodemap->num;i++) {
2308                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2309                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2310                         continue;
2311                 }
2312
2313                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2314                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2315                                 /* This node should be serving this
2316                                    vnn so dont tell it to release the ip
2317                                 */
2318                                 continue;
2319                         }
2320                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2321                                 ipv4.pnn = tmp_ip->pnn;
2322                                 ipv4.sin = tmp_ip->addr.ip;
2323
2324                                 timeout = TAKEOVER_TIMEOUT();
2325                                 data.dsize = sizeof(ipv4);
2326                                 data.dptr  = (uint8_t *)&ipv4;
2327                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2328                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2329                                                 data, async_data,
2330                                                 &timeout, NULL);
2331                         } else {
2332                                 ip.pnn  = tmp_ip->pnn;
2333                                 ip.addr = tmp_ip->addr;
2334
2335                                 timeout = TAKEOVER_TIMEOUT();
2336                                 data.dsize = sizeof(ip);
2337                                 data.dptr  = (uint8_t *)&ip;
2338                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2339                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2340                                                 data, async_data,
2341                                                 &timeout, NULL);
2342                         }
2343
2344                         if (state == NULL) {
2345                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2346                                 talloc_free(tmp_ctx);
2347                                 return -1;
2348                         }
2349                 
2350                         ctdb_client_async_add(async_data, state);
2351                 }
2352         }
2353         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2354                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2355                 talloc_free(tmp_ctx);
2356                 return -1;
2357         }
2358         talloc_free(async_data);
2359
2360
2361         /* tell all nodes to get their own IPs */
2362         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2363         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2364
2365         async_data->fail_callback = fail_callback;
2366         async_data->callback_data = callback_data;
2367
2368         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2369                 if (tmp_ip->pnn == -1) {
2370                         /* this IP won't be taken over */
2371                         continue;
2372                 }
2373
2374                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2375                         ipv4.pnn = tmp_ip->pnn;
2376                         ipv4.sin = tmp_ip->addr.ip;
2377
2378                         timeout = TAKEOVER_TIMEOUT();
2379                         data.dsize = sizeof(ipv4);
2380                         data.dptr  = (uint8_t *)&ipv4;
2381                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2382                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2383                                         data, async_data,
2384                                         &timeout, NULL);
2385                 } else {
2386                         ip.pnn  = tmp_ip->pnn;
2387                         ip.addr = tmp_ip->addr;
2388
2389                         timeout = TAKEOVER_TIMEOUT();
2390                         data.dsize = sizeof(ip);
2391                         data.dptr  = (uint8_t *)&ip;
2392                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2393                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2394                                         data, async_data,
2395                                         &timeout, NULL);
2396                 }
2397                 if (state == NULL) {
2398                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2399                         talloc_free(tmp_ctx);
2400                         return -1;
2401                 }
2402                 
2403                 ctdb_client_async_add(async_data, state);
2404         }
2405         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2406                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2407                 talloc_free(tmp_ctx);
2408                 return -1;
2409         }
2410
2411 ipreallocated:
2412         /* 
2413          * Tell all nodes to run eventscripts to process the
2414          * "ipreallocated" event.  This can do a lot of things,
2415          * including restarting services to reconfigure them if public
2416          * IPs have moved.  Once upon a time this event only used to
2417          * update natwg.
2418          */
2419         data.dptr  = discard_const("ipreallocated");
2420         data.dsize = strlen((char *)data.dptr) + 1; 
2421         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2422         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2423                                       nodes, 0, TAKEOVER_TIMEOUT(),
2424                                       false, data,
2425                                       NULL, fail_callback,
2426                                       callback_data) != 0) {
2427                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2428         }
2429
2430         talloc_free(tmp_ctx);
2431         return 0;
2432 }
2433
2434
2435 /*
2436   destroy a ctdb_client_ip structure
2437  */
2438 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2439 {
2440         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2441                 ctdb_addr_to_str(&ip->addr),
2442                 ntohs(ip->addr.ip.sin_port),
2443                 ip->client_id));
2444
2445         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2446         return 0;
2447 }
2448
2449 /*
2450   called by a client to inform us of a TCP connection that it is managing
2451   that should tickled with an ACK when IP takeover is done
2452   we handle both the old ipv4 style of packets as well as the new ipv4/6
2453   pdus.
2454  */
2455 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2456                                 TDB_DATA indata)
2457 {
2458         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2459         struct ctdb_control_tcp *old_addr = NULL;
2460         struct ctdb_control_tcp_addr new_addr;
2461         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2462         struct ctdb_tcp_list *tcp;
2463         struct ctdb_tcp_connection t;
2464         int ret;
2465         TDB_DATA data;
2466         struct ctdb_client_ip *ip;
2467         struct ctdb_vnn *vnn;
2468         ctdb_sock_addr addr;
2469
2470         switch (indata.dsize) {
2471         case sizeof(struct ctdb_control_tcp):
2472                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2473                 ZERO_STRUCT(new_addr);
2474                 tcp_sock = &new_addr;
2475                 tcp_sock->src.ip  = old_addr->src;
2476                 tcp_sock->dest.ip = old_addr->dest;
2477                 break;
2478         case sizeof(struct ctdb_control_tcp_addr):
2479                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2480                 break;
2481         default:
2482                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2483                                  "to ctdb_control_tcp_client. size was %d but "
2484                                  "only allowed sizes are %lu and %lu\n",
2485                                  (int)indata.dsize,
2486                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2487                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2488                 return -1;
2489         }
2490
2491         addr = tcp_sock->src;
2492         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2493         addr = tcp_sock->dest;
2494         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2495
2496         ZERO_STRUCT(addr);
2497         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2498         vnn = find_public_ip_vnn(ctdb, &addr);
2499         if (vnn == NULL) {
2500                 switch (addr.sa.sa_family) {
2501                 case AF_INET:
2502                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2503                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2504                                         ctdb_addr_to_str(&addr)));
2505                         }
2506                         break;
2507                 case AF_INET6:
2508                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2509                                 ctdb_addr_to_str(&addr)));
2510                         break;
2511                 default:
2512                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2513                 }
2514
2515                 return 0;
2516         }
2517
2518         if (vnn->pnn != ctdb->pnn) {
2519                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2520                         ctdb_addr_to_str(&addr),
2521                         client_id, client->pid));
2522                 /* failing this call will tell smbd to die */
2523                 return -1;
2524         }
2525
2526         ip = talloc(client, struct ctdb_client_ip);
2527         CTDB_NO_MEMORY(ctdb, ip);
2528
2529         ip->ctdb      = ctdb;
2530         ip->addr      = addr;
2531         ip->client_id = client_id;
2532         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2533         DLIST_ADD(ctdb->client_ip_list, ip);
2534
2535         tcp = talloc(client, struct ctdb_tcp_list);
2536         CTDB_NO_MEMORY(ctdb, tcp);
2537
2538         tcp->connection.src_addr = tcp_sock->src;
2539         tcp->connection.dst_addr = tcp_sock->dest;
2540
2541         DLIST_ADD(client->tcp_list, tcp);
2542
2543         t.src_addr = tcp_sock->src;
2544         t.dst_addr = tcp_sock->dest;
2545
2546         data.dptr = (uint8_t *)&t;
2547         data.dsize = sizeof(t);
2548
2549         switch (addr.sa.sa_family) {
2550         case AF_INET:
2551                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2552                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2553                         ctdb_addr_to_str(&tcp_sock->src),
2554                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2555                 break;
2556         case AF_INET6:
2557                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2558                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2559                         ctdb_addr_to_str(&tcp_sock->src),
2560                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2561                 break;
2562         default:
2563                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2564         }
2565
2566
2567         /* tell all nodes about this tcp connection */
2568         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2569                                        CTDB_CONTROL_TCP_ADD,
2570                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2571         if (ret != 0) {
2572                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2573                 return -1;
2574         }
2575
2576         return 0;
2577 }
2578
2579 /*
2580   find a tcp address on a list
2581  */
2582 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2583                                            struct ctdb_tcp_connection *tcp)
2584 {
2585         int i;
2586
2587         if (array == NULL) {
2588                 return NULL;
2589         }
2590
2591         for (i=0;i<array->num;i++) {
2592                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2593                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2594                         return &array->connections[i];
2595                 }
2596         }
2597         return NULL;
2598 }
2599
2600
2601
2602 /*
2603   called by a daemon to inform us of a TCP connection that one of its
2604   clients managing that should tickled with an ACK when IP takeover is
2605   done
2606  */
2607 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2608 {
2609         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2610         struct ctdb_tcp_array *tcparray;
2611         struct ctdb_tcp_connection tcp;
2612         struct ctdb_vnn *vnn;
2613
2614         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2615         if (vnn == NULL) {
2616                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2617                         ctdb_addr_to_str(&p->dst_addr)));
2618
2619                 return -1;
2620         }
2621
2622
2623         tcparray = vnn->tcp_array;
2624
2625         /* If this is the first tickle */
2626         if (tcparray == NULL) {
2627                 tcparray = talloc_size(ctdb->nodes, 
2628                         offsetof(struct ctdb_tcp_array, connections) +
2629                         sizeof(struct ctdb_tcp_connection) * 1);
2630                 CTDB_NO_MEMORY(ctdb, tcparray);
2631                 vnn->tcp_array = tcparray;
2632
2633                 tcparray->num = 0;
2634                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2635                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2636
2637                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2638                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2639                 tcparray->num++;
2640
2641                 if (tcp_update_needed) {
2642                         vnn->tcp_update_needed = true;
2643                 }
2644                 return 0;
2645         }
2646
2647
2648         /* Do we already have this tickle ?*/
2649         tcp.src_addr = p->src_addr;
2650         tcp.dst_addr = p->dst_addr;
2651         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2652                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2653                         ctdb_addr_to_str(&tcp.dst_addr),
2654                         ntohs(tcp.dst_addr.ip.sin_port),
2655                         vnn->pnn));
2656                 return 0;
2657         }
2658
2659         /* A new tickle, we must add it to the array */
2660         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2661                                         struct ctdb_tcp_connection,
2662                                         tcparray->num+1);
2663         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2664
2665         vnn->tcp_array = tcparray;
2666         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2667         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2668         tcparray->num++;
2669                                 
2670         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2671                 ctdb_addr_to_str(&tcp.dst_addr),
2672                 ntohs(tcp.dst_addr.ip.sin_port),
2673                 vnn->pnn));
2674
2675         if (tcp_update_needed) {
2676                 vnn->tcp_update_needed = true;
2677         }
2678
2679         return 0;
2680 }
2681
2682
2683 /*
2684   called by a daemon to inform us of a TCP connection that one of its
2685   clients managing that should tickled with an ACK when IP takeover is
2686   done
2687  */
2688 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2689 {
2690         struct ctdb_tcp_connection *tcpp;
2691         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2692
2693         if (vnn == NULL) {
2694                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2695                         ctdb_addr_to_str(&conn->dst_addr)));
2696                 return;
2697         }
2698
2699         /* if the array is empty we cant remove it
2700            and we dont need to do anything
2701          */
2702         if (vnn->tcp_array == NULL) {
2703                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2704                         ctdb_addr_to_str(&conn->dst_addr),
2705                         ntohs(conn->dst_addr.ip.sin_port)));
2706                 return;
2707         }
2708
2709
2710         /* See if we know this connection
2711            if we dont know this connection  then we dont need to do anything
2712          */
2713         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2714         if (tcpp == NULL) {
2715                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2716                         ctdb_addr_to_str(&conn->dst_addr),
2717                         ntohs(conn->dst_addr.ip.sin_port)));
2718                 return;
2719         }
2720
2721
2722         /* We need to remove this entry from the array.
2723            Instead of allocating a new array and copying data to it
2724            we cheat and just copy the last entry in the existing array
2725            to the entry that is to be removed and just shring the 
2726            ->num field
2727          */
2728         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2729         vnn->tcp_array->num--;
2730
2731         /* If we deleted the last entry we also need to remove the entire array
2732          */
2733         if (vnn->tcp_array->num == 0) {
2734                 talloc_free(vnn->tcp_array);
2735                 vnn->tcp_array = NULL;
2736         }               
2737
2738         vnn->tcp_update_needed = true;
2739
2740         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2741                 ctdb_addr_to_str(&conn->src_addr),
2742                 ntohs(conn->src_addr.ip.sin_port)));
2743 }
2744
2745
2746 /*
2747   called by a daemon to inform us of a TCP connection that one of its
2748   clients used are no longer needed in the tickle database
2749  */
2750 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2751 {
2752         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2753
2754         ctdb_remove_tcp_connection(ctdb, conn);
2755
2756         return 0;
2757 }
2758
2759
/*
  called when a daemon restarts.  The intent is to send all tickles for
  all public addresses we are serving to the new node immediately; that
  is not implemented yet, so this is a stub that ignores its arguments
  and returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2769
2770
2771 /*
2772   called when a client structure goes away - hook to remove
2773   elements from the tcp_list in all daemons
2774  */
2775 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2776 {
2777         while (client->tcp_list) {
2778                 struct ctdb_tcp_list *tcp = client->tcp_list;
2779                 DLIST_REMOVE(client->tcp_list, tcp);
2780                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2781         }
2782 }
2783
2784
2785 /*
2786   release all IPs on shutdown
2787  */
2788 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2789 {
2790         struct ctdb_vnn *vnn;
2791
2792         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2793                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2794                         ctdb_vnn_unassign_iface(ctdb, vnn);
2795                         continue;
2796                 }
2797                 if (!vnn->iface) {
2798                         continue;
2799                 }
2800                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2801                                   ctdb_vnn_iface_string(vnn),
2802                                   ctdb_addr_to_str(&vnn->public_address),
2803                                   vnn->public_netmask_bits);
2804                 release_kill_clients(ctdb, &vnn->public_address);
2805                 ctdb_vnn_unassign_iface(ctdb, vnn);
2806         }
2807 }
2808
2809
2810 /*
2811   get list of public IPs
2812  */
2813 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2814                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2815 {
2816         int i, num, len;
2817         struct ctdb_all_public_ips *ips;
2818         struct ctdb_vnn *vnn;
2819         bool only_available = false;
2820
2821         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2822                 only_available = true;
2823         }
2824
2825         /* count how many public ip structures we have */
2826         num = 0;
2827         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2828                 num++;
2829         }
2830
2831         len = offsetof(struct ctdb_all_public_ips, ips) + 
2832                 num*sizeof(struct ctdb_public_ip);
2833         ips = talloc_zero_size(outdata, len);
2834         CTDB_NO_MEMORY(ctdb, ips);
2835
2836         i = 0;
2837         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2838                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2839                         continue;
2840                 }
2841                 ips->ips[i].pnn  = vnn->pnn;
2842                 ips->ips[i].addr = vnn->public_address;
2843                 i++;
2844         }
2845         ips->num = i;
2846         len = offsetof(struct ctdb_all_public_ips, ips) +
2847                 i*sizeof(struct ctdb_public_ip);
2848
2849         outdata->dsize = len;
2850         outdata->dptr  = (uint8_t *)ips;
2851
2852         return 0;
2853 }
2854
2855
2856 /*
2857   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2858  */
2859 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2860                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2861 {
2862         int i, num, len;
2863         struct ctdb_all_public_ipsv4 *ips;
2864         struct ctdb_vnn *vnn;
2865
2866         /* count how many public ip structures we have */
2867         num = 0;
2868         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2869                 if (vnn->public_address.sa.sa_family != AF_INET) {
2870                         continue;
2871                 }
2872                 num++;
2873         }
2874
2875         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2876                 num*sizeof(struct ctdb_public_ipv4);
2877         ips = talloc_zero_size(outdata, len);
2878         CTDB_NO_MEMORY(ctdb, ips);
2879
2880         outdata->dsize = len;
2881         outdata->dptr  = (uint8_t *)ips;
2882
2883         ips->num = num;
2884         i = 0;
2885         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2886                 if (vnn->public_address.sa.sa_family != AF_INET) {
2887                         continue;
2888                 }
2889                 ips->ips[i].pnn = vnn->pnn;
2890                 ips->ips[i].sin = vnn->public_address.ip;
2891                 i++;
2892         }
2893
2894         return 0;
2895 }
2896
2897 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2898                                         struct ctdb_req_control *c,
2899                                         TDB_DATA indata,
2900                                         TDB_DATA *outdata)
2901 {
2902         int i, num, len;
2903         ctdb_sock_addr *addr;
2904         struct ctdb_control_public_ip_info *info;
2905         struct ctdb_vnn *vnn;
2906
2907         addr = (ctdb_sock_addr *)indata.dptr;
2908
2909         vnn = find_public_ip_vnn(ctdb, addr);
2910         if (vnn == NULL) {
2911                 /* if it is not a public ip   it could be our 'single ip' */
2912                 if (ctdb->single_ip_vnn) {
2913                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2914                                 vnn = ctdb->single_ip_vnn;
2915                         }
2916                 }
2917         }
2918         if (vnn == NULL) {
2919                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2920                                  "'%s'not a public address\n",
2921                                  ctdb_addr_to_str(addr)));
2922                 return -1;
2923         }
2924
2925         /* count how many public ip structures we have */
2926         num = 0;
2927         for (;vnn->ifaces[num];) {
2928                 num++;
2929         }
2930
2931         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2932                 num*sizeof(struct ctdb_control_iface_info);
2933         info = talloc_zero_size(outdata, len);
2934         CTDB_NO_MEMORY(ctdb, info);
2935
2936         info->ip.addr = vnn->public_address;
2937         info->ip.pnn = vnn->pnn;
2938         info->active_idx = 0xFFFFFFFF;
2939
2940         for (i=0; vnn->ifaces[i]; i++) {
2941                 struct ctdb_iface *cur;
2942
2943                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2944                 if (cur == NULL) {
2945                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2946                                            vnn->ifaces[i]));
2947                         return -1;
2948                 }
2949                 if (vnn->iface == cur) {
2950                         info->active_idx = i;
2951                 }
2952                 strcpy(info->ifaces[i].name, cur->name);
2953                 info->ifaces[i].link_state = cur->link_up;
2954                 info->ifaces[i].references = cur->references;
2955         }
2956         info->num = i;
2957         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2958                 i*sizeof(struct ctdb_control_iface_info);
2959
2960         outdata->dsize = len;
2961         outdata->dptr  = (uint8_t *)info;
2962
2963         return 0;
2964 }
2965
2966 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2967                                 struct ctdb_req_control *c,
2968                                 TDB_DATA *outdata)
2969 {
2970         int i, num, len;
2971         struct ctdb_control_get_ifaces *ifaces;
2972         struct ctdb_iface *cur;
2973
2974         /* count how many public ip structures we have */
2975         num = 0;
2976         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2977                 num++;
2978         }
2979
2980         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2981                 num*sizeof(struct ctdb_control_iface_info);
2982         ifaces = talloc_zero_size(outdata, len);
2983         CTDB_NO_MEMORY(ctdb, ifaces);
2984
2985         i = 0;
2986         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2987                 strcpy(ifaces->ifaces[i].name, cur->name);
2988                 ifaces->ifaces[i].link_state = cur->link_up;
2989                 ifaces->ifaces[i].references = cur->references;
2990                 i++;
2991         }
2992         ifaces->num = i;
2993         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2994                 i*sizeof(struct ctdb_control_iface_info);
2995
2996         outdata->dsize = len;
2997         outdata->dptr  = (uint8_t *)ifaces;
2998
2999         return 0;
3000 }
3001
3002 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3003                                     struct ctdb_req_control *c,
3004                                     TDB_DATA indata)
3005 {
3006         struct ctdb_control_iface_info *info;
3007         struct ctdb_iface *iface;
3008         bool link_up = false;
3009
3010         info = (struct ctdb_control_iface_info *)indata.dptr;
3011
3012         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3013                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3014                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3015                                   len, len, info->name));
3016                 return -1;
3017         }
3018
3019         switch (info->link_state) {
3020         case 0:
3021                 link_up = false;
3022                 break;
3023         case 1:
3024                 link_up = true;
3025                 break;
3026         default:
3027                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3028                                   (unsigned int)info->link_state));
3029                 return -1;
3030         }
3031
3032         if (info->references != 0) {
3033                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3034                                   (unsigned int)info->references));
3035                 return -1;
3036         }
3037
3038         iface = ctdb_find_iface(ctdb, info->name);
3039         if (iface == NULL) {
3040                 return -1;
3041         }
3042
3043         if (link_up == iface->link_up) {
3044                 return 0;
3045         }
3046
3047         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3048               ("iface[%s] has changed it's link status %s => %s\n",
3049                iface->name,
3050                iface->link_up?"up":"down",
3051                link_up?"up":"down"));
3052
3053         iface->link_up = link_up;
3054         return 0;
3055 }
3056
3057
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill
*/
struct ctdb_kill_tcp {
        /* NOTE(review): presumably the public address whose connections
           are being killed - confirm against the code that fills this in */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* socket that capture_tcp_handler() reads tcp packets from */
        int capture_fd;
        /* NOTE(review): presumably the fd event watching capture_fd - confirm */
        struct fd_event *fde;
        /* tree of struct ctdb_killtcp_con, looked up with killtcp_key() */
        trbt_tree_t *connections;
        /* opaque pointer handed to ctdb_sys_read_tcp_packet() together
           with capture_fd */
        void *private_data;
};
3070
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        /* the two endpoints of the connection; the reset is sent from
           dst_addr to src_addr (see capture_tcp_handler) */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of attempts so far; tickle_connection_traverse() gives
           up once this reaches 5 */
        int count;
        /* NOTE(review): presumably the kill state this entry belongs to - confirm */
        struct ctdb_kill_tcp *killtcp;
};
3080
3081 /* this function is used to create a key to represent this socketpair
3082    in the killtcp tree.
3083    this key is used to insert and lookup matching socketpairs that are
3084    to be tickled and RST
3085 */
3086 #define KILLTCP_KEYLEN  10
3087 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3088 {
3089         static uint32_t key[KILLTCP_KEYLEN];
3090
3091         bzero(key, sizeof(key));
3092
3093         if (src->sa.sa_family != dst->sa.sa_family) {
3094                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3095                 return key;
3096         }
3097         
3098         switch (src->sa.sa_family) {
3099         case AF_INET:
3100                 key[0]  = dst->ip.sin_addr.s_addr;
3101                 key[1]  = src->ip.sin_addr.s_addr;
3102                 key[2]  = dst->ip.sin_port;
3103                 key[3]  = src->ip.sin_port;
3104                 break;
3105         case AF_INET6: {
3106                 uint32_t *dst6_addr32 =
3107                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3108                 uint32_t *src6_addr32 =
3109                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3110                 key[0]  = dst6_addr32[3];
3111                 key[1]  = src6_addr32[3];
3112                 key[2]  = dst6_addr32[2];
3113                 key[3]  = src6_addr32[2];
3114                 key[4]  = dst6_addr32[1];
3115                 key[5]  = src6_addr32[1];
3116                 key[6]  = dst6_addr32[0];
3117                 key[7]  = src6_addr32[0];
3118                 key[8]  = dst->ip6.sin6_port;
3119                 key[9]  = src->ip6.sin6_port;
3120                 break;
3121         }
3122         default:
3123                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3124                 return key;
3125         }
3126
3127         return key;
3128 }
3129
3130 /*
3131   called when we get a read event on the raw socket
3132  */
3133 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3134                                 uint16_t flags, void *private_data)
3135 {
3136         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3137         struct ctdb_killtcp_con *con;
3138         ctdb_sock_addr src, dst;
3139         uint32_t ack_seq, seq;
3140
3141         if (!(flags & EVENT_FD_READ)) {
3142                 return;
3143         }
3144
3145         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3146                                 killtcp->private_data,
3147                                 &src, &dst,
3148                                 &ack_seq, &seq) != 0) {
3149                 /* probably a non-tcp ACK packet */
3150                 return;
3151         }
3152
3153         /* check if we have this guy in our list of connections
3154            to kill
3155         */
3156         con = trbt_lookuparray32(killtcp->connections, 
3157                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3158         if (con == NULL) {
3159                 /* no this was some other packet we can just ignore */
3160                 return;
3161         }
3162
3163         /* This one has been tickled !
3164            now reset him and remove him from the list.
3165          */
3166         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3167                 ntohs(con->dst_addr.ip.sin_port),
3168                 ctdb_addr_to_str(&con->src_addr),
3169                 ntohs(con->src_addr.ip.sin_port)));
3170
3171         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3172         talloc_free(con);
3173 }
3174
3175
3176 /* when traversing the list of all tcp connections to send tickle acks to
3177    (so that we can capture the ack coming back and kill the connection
3178     by a RST)
3179    this callback is called for each connection we are currently trying to kill
3180 */
3181 static int tickle_connection_traverse(void *param, void *data)
3182 {
3183         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3184
3185         /* have tried too many times, just give up */
3186         if (con->count >= 5) {
3187                 /* can't delete in traverse: reparent to delete_cons */
3188                 talloc_steal(param, con);
3189                 return 0;
3190         }
3191
3192         /* othervise, try tickling it again */
3193         con->count++;
3194         ctdb_sys_send_tcp(
3195                 (ctdb_sock_addr *)&con->dst_addr,
3196                 (ctdb_sock_addr *)&con->src_addr,
3197                 0, 0, 0);
3198         return 0;
3199 }
3200
3201
3202 /* 
3203    called every second until all sentenced connections have been reset
3204  */
3205 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3206                                               struct timeval t, void *private_data)
3207 {
3208         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3209         void *delete_cons = talloc_new(NULL);
3210
3211         /* loop over all connections sending tickle ACKs */
3212         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3213
3214         /* now we've finished traverse, it's safe to do deletion. */
3215         talloc_free(delete_cons);
3216
3217         /* If there are no more connections to kill we can remove the
3218            entire killtcp structure
3219          */
3220         if ( (killtcp->connections == NULL) || 
3221              (killtcp->connections->root == NULL) ) {
3222                 talloc_free(killtcp);
3223                 return;
3224         }
3225
3226         /* try tickling them again in a seconds time
3227          */
3228         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3229                         ctdb_tickle_sentenced_connections, killtcp);
3230 }
3231
3232 /*
3233   destroy the killtcp structure
3234  */
3235 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3236 {
3237         struct ctdb_vnn *tmpvnn;
3238
3239         /* verify that this vnn is still active */
3240         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3241                 if (tmpvnn == killtcp->vnn) {
3242                         break;
3243                 }
3244         }
3245
3246         if (tmpvnn == NULL) {
3247                 return 0;
3248         }
3249
3250         if (killtcp->vnn->killtcp != killtcp) {
3251                 return 0;
3252         }
3253
3254         killtcp->vnn->killtcp = NULL;
3255
3256         return 0;
3257 }
3258
3259
/* insert callback used when adding a connection to the tree of
   connections to kill.

   parm is the new connection structure, data is whatever is already
   stored under the same key (if anything). The new structure always
   wins: it is returned unconditionally and the old one is ignored.

   The old one is deliberately not freed here; per the original author
   it is talloc_stolen by the same tree node and goes away when the new
   data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;

        return replacement;
}
3271
3272 /*
3273   add a tcp socket to the list of connections we want to RST
3274  */
3275 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3276                                        ctdb_sock_addr *s,
3277                                        ctdb_sock_addr *d)
3278 {
3279         ctdb_sock_addr src, dst;
3280         struct ctdb_kill_tcp *killtcp;
3281         struct ctdb_killtcp_con *con;
3282         struct ctdb_vnn *vnn;
3283
3284         ctdb_canonicalize_ip(s, &src);
3285         ctdb_canonicalize_ip(d, &dst);
3286
3287         vnn = find_public_ip_vnn(ctdb, &dst);
3288         if (vnn == NULL) {
3289                 vnn = find_public_ip_vnn(ctdb, &src);
3290         }
3291         if (vnn == NULL) {
3292                 /* if it is not a public ip   it could be our 'single ip' */
3293                 if (ctdb->single_ip_vnn) {
3294                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3295                                 vnn = ctdb->single_ip_vnn;
3296                         }
3297                 }
3298         }
3299         if (vnn == NULL) {
3300                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3301                 return -1;
3302         }
3303
3304         killtcp = vnn->killtcp;
3305         
3306         /* If this is the first connection to kill we must allocate
3307            a new structure
3308          */
3309         if (killtcp == NULL) {
3310                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3311                 CTDB_NO_MEMORY(ctdb, killtcp);
3312
3313                 killtcp->vnn         = vnn;
3314                 killtcp->ctdb        = ctdb;
3315                 killtcp->capture_fd  = -1;
3316                 killtcp->connections = trbt_create(killtcp, 0);
3317
3318                 vnn->killtcp         = killtcp;
3319                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3320         }
3321
3322
3323
3324         /* create a structure that describes this connection we want to
3325            RST and store it in killtcp->connections
3326         */
3327         con = talloc(killtcp, struct ctdb_killtcp_con);
3328         CTDB_NO_MEMORY(ctdb, con);
3329         con->src_addr = src;
3330         con->dst_addr = dst;
3331         con->count    = 0;
3332         con->killtcp  = killtcp;
3333
3334
3335         trbt_insertarray32_callback(killtcp->connections,
3336                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3337                         add_killtcp_callback, con);
3338
3339         /* 
3340            If we dont have a socket to listen on yet we must create it
3341          */
3342         if (killtcp->capture_fd == -1) {
3343                 const char *iface = ctdb_vnn_iface_string(vnn);
3344                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3345                 if (killtcp->capture_fd == -1) {
3346                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3347                                           "socket on iface '%s' for killtcp (%s)\n",
3348                                           iface, strerror(errno)));
3349                         goto failed;
3350                 }
3351         }
3352
3353
3354         if (killtcp->fde == NULL) {
3355                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3356                                             EVENT_FD_READ,
3357                                             capture_tcp_handler, killtcp);
3358                 tevent_fd_set_auto_close(killtcp->fde);
3359
3360                 /* We also need to set up some events to tickle all these connections
3361                    until they are all reset
3362                 */
3363                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3364                                 ctdb_tickle_sentenced_connections, killtcp);
3365         }
3366
3367         /* tickle him once now */
3368         ctdb_sys_send_tcp(
3369                 &con->dst_addr,
3370                 &con->src_addr,
3371                 0, 0, 0);
3372
3373         return 0;
3374
3375 failed:
3376         talloc_free(vnn->killtcp);
3377         vnn->killtcp = NULL;
3378         return -1;
3379 }
3380
3381 /*
3382   kill a TCP connection.
3383  */
3384 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3385 {
3386         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3387
3388         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3389 }
3390
3391 /*
3392   called by a daemon to inform us of the entire list of TCP tickles for
3393   a particular public address.
3394   this control should only be sent by the node that is currently serving
3395   that public address.
3396  */
3397 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3398 {
3399         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3400         struct ctdb_tcp_array *tcparray;
3401         struct ctdb_vnn *vnn;
3402
3403         /* We must at least have tickles.num or else we cant verify the size
3404            of the received data blob
3405          */
3406         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3407                                         tickles.connections)) {
3408                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3409                 return -1;
3410         }
3411
3412         /* verify that the size of data matches what we expect */
3413         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3414                                 tickles.connections)
3415                          + sizeof(struct ctdb_tcp_connection)
3416                                  * list->tickles.num) {
3417                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3418                 return -1;
3419         }       
3420
3421         vnn = find_public_ip_vnn(ctdb, &list->addr);
3422         if (vnn == NULL) {
3423                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3424                         ctdb_addr_to_str(&list->addr)));
3425
3426                 return 1;
3427         }
3428
3429         /* remove any old ticklelist we might have */
3430         talloc_free(vnn->tcp_array);
3431         vnn->tcp_array = NULL;
3432
3433         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3434         CTDB_NO_MEMORY(ctdb, tcparray);
3435
3436         tcparray->num = list->tickles.num;
3437
3438         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3439         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3440
3441         memcpy(tcparray->connections, &list->tickles.connections[0], 
3442                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3443
3444         /* We now have a new fresh tickle list array for this vnn */
3445         vnn->tcp_array = talloc_steal(vnn, tcparray);
3446         
3447         return 0;
3448 }
3449
3450 /*
3451   called to return the full list of tickles for the puclic address associated 
3452   with the provided vnn
3453  */
3454 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3455 {
3456         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3457         struct ctdb_control_tcp_tickle_list *list;
3458         struct ctdb_tcp_array *tcparray;
3459         int num;
3460         struct ctdb_vnn *vnn;
3461
3462         vnn = find_public_ip_vnn(ctdb, addr);
3463         if (vnn == NULL) {
3464                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3465                         ctdb_addr_to_str(addr)));
3466
3467                 return 1;
3468         }
3469
3470         tcparray = vnn->tcp_array;
3471         if (tcparray) {
3472                 num = tcparray->num;
3473         } else {
3474                 num = 0;
3475         }
3476
3477         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3478                                 tickles.connections)
3479                         + sizeof(struct ctdb_tcp_connection) * num;
3480
3481         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3482         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3483         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3484
3485         list->addr = *addr;
3486         list->tickles.num = num;
3487         if (num) {
3488                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3489                         sizeof(struct ctdb_tcp_connection) * num);
3490         }
3491
3492         return 0;
3493 }
3494
3495
3496 /*
3497   set the list of all tcp tickles for a public address
3498  */
3499 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3500                               struct timeval timeout, uint32_t destnode, 
3501                               ctdb_sock_addr *addr,
3502                               struct ctdb_tcp_array *tcparray)
3503 {
3504         int ret, num;
3505         TDB_DATA data;
3506         struct ctdb_control_tcp_tickle_list *list;
3507
3508         if (tcparray) {
3509                 num = tcparray->num;
3510         } else {
3511                 num = 0;
3512         }
3513
3514         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3515                                 tickles.connections) +
3516                         sizeof(struct ctdb_tcp_connection) * num;
3517         data.dptr = talloc_size(ctdb, data.dsize);
3518         CTDB_NO_MEMORY(ctdb, data.dptr);
3519
3520         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3521         list->addr = *addr;
3522         list->tickles.num = num;
3523         if (tcparray) {
3524                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3525         }
3526
3527         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3528                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3529                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3530         if (ret != 0) {
3531                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3532                 return -1;
3533         }
3534
3535         talloc_free(data.dptr);
3536
3537         return ret;
3538 }
3539
3540
3541 /*
3542   perform tickle updates if required
3543  */
3544 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3545                                 struct timed_event *te, 
3546                                 struct timeval t, void *private_data)
3547 {
3548         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3549         int ret;
3550         struct ctdb_vnn *vnn;
3551
3552         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3553                 /* we only send out updates for public addresses that 
3554                    we have taken over
3555                  */
3556                 if (ctdb->pnn != vnn->pnn) {
3557                         continue;
3558                 }
3559                 /* We only send out the updates if we need to */
3560                 if (!vnn->tcp_update_needed) {
3561                         continue;
3562                 }
3563                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3564                                 TAKEOVER_TIMEOUT(),
3565                                 CTDB_BROADCAST_CONNECTED,
3566                                 &vnn->public_address,
3567                                 vnn->tcp_array);
3568                 if (ret != 0) {
3569                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3570                                 ctdb_addr_to_str(&vnn->public_address)));
3571                 }
3572         }
3573
3574         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3575                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3576                              ctdb_update_tcp_tickles, ctdb);
3577 }               
3578         
3579
3580 /*
3581   start periodic update of tcp tickles
3582  */
3583 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3584 {
3585         ctdb->tickle_update_context = talloc_new(ctdb);
3586
3587         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3588                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3589                              ctdb_update_tcp_tickles, ctdb);
3590 }
3591
3592
3593
3594
/* state for one series of gratuitous arps sent for an address */
struct control_gratious_arp {
        /* daemon context; its event loop is used to schedule the sends */
        struct ctdb_context *ctdb;
        /* address handed to ctdb_sys_send_arp() */
        ctdb_sock_addr addr;
        /* name of the interface handed to ctdb_sys_send_arp() */
        const char *iface;
        /* number of sends performed so far; the series ends when this
           reaches CTDB_ARP_REPEAT */
        int count;
};
3601
3602 /*
3603   send a control_gratuitous arp
3604  */
3605 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3606                                   struct timeval t, void *private_data)
3607 {
3608         int ret;
3609         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3610                                                         struct control_gratious_arp);
3611
3612         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3613         if (ret != 0) {
3614                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3615                                  arp->iface, strerror(errno)));
3616         }
3617
3618
3619         arp->count++;
3620         if (arp->count == CTDB_ARP_REPEAT) {
3621                 talloc_free(arp);
3622                 return;
3623         }
3624
3625         event_add_timed(arp->ctdb->ev, arp, 
3626                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3627                         send_gratious_arp, arp);
3628 }
3629
3630
3631 /*
3632   send a gratious arp 
3633  */
3634 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3635 {
3636         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3637         struct control_gratious_arp *arp;
3638
3639         /* verify the size of indata */
3640         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3641                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3642                                  (unsigned)indata.dsize, 
3643                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3644                 return -1;
3645         }
3646         if (indata.dsize != 
3647                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3648                 + gratious_arp->len ) ){
3649
3650                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3651                         "but should be %u bytes\n", 
3652                          (unsigned)indata.dsize, 
3653                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3654                 return -1;
3655         }
3656
3657
3658         arp = talloc(ctdb, struct control_gratious_arp);
3659         CTDB_NO_MEMORY(ctdb, arp);
3660
3661         arp->ctdb  = ctdb;
3662         arp->addr   = gratious_arp->addr;
3663         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3664         CTDB_NO_MEMORY(ctdb, arp->iface);
3665         arp->count = 0;
3666         
3667         event_add_timed(arp->ctdb->ev, arp, 
3668                         timeval_zero(), send_gratious_arp, arp);
3669
3670         return 0;
3671 }
3672
3673 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3674 {
3675         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3676         int ret;
3677
3678         /* verify the size of indata */
3679         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3680                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3681                 return -1;
3682         }
3683         if (indata.dsize != 
3684                 ( offsetof(struct ctdb_control_ip_iface, iface)
3685                 + pub->len ) ){
3686
3687                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3688                         "but should be %u bytes\n", 
3689                          (unsigned)indata.dsize, 
3690                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3691                 return -1;
3692         }
3693
3694         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3695
3696         if (ret != 0) {
3697                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3698                 return -1;
3699         }
3700
3701         return 0;
3702 }
3703
3704 /*
3705   called when releaseip event finishes for del_public_address
3706  */
3707 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
3708                                 void *private_data)
3709 {
3710         talloc_free(private_data);
3711 }
3712
3713 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3714 {
3715         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3716         struct ctdb_vnn *vnn;
3717         int ret;
3718
3719         /* verify the size of indata */
3720         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3721                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3722                 return -1;
3723         }
3724         if (indata.dsize != 
3725                 ( offsetof(struct ctdb_control_ip_iface, iface)
3726                 + pub->len ) ){
3727
3728                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3729                         "but should be %u bytes\n", 
3730                          (unsigned)indata.dsize, 
3731                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3732                 return -1;
3733         }
3734
3735         /* walk over all public addresses until we find a match */
3736         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3737                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3738                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3739
3740                         DLIST_REMOVE(ctdb->vnn, vnn);
3741                         talloc_steal(mem_ctx, vnn);
3742                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3743                         if (vnn->pnn != ctdb->pnn) {
3744                                 if (vnn->iface != NULL) {
3745                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3746                                 }
3747                                 talloc_free(mem_ctx);
3748                                 return 0;
3749                         }
3750                         vnn->pnn = -1;
3751
3752                         ret = ctdb_event_script_callback(ctdb, 
3753                                          mem_ctx, delete_ip_callback, mem_ctx,
3754                                          false,
3755                                          CTDB_EVENT_RELEASE_IP,
3756                                          "%s %s %u",
3757                                          ctdb_vnn_iface_string(vnn),
3758                                          ctdb_addr_to_str(&vnn->public_address),
3759                                          vnn->public_netmask_bits);
3760                         if (vnn->iface != NULL) {
3761                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3762                         }
3763                         if (ret != 0) {
3764                                 return -1;
3765                         }
3766                         return 0;
3767                 }
3768         }
3769
3770         return -1;
3771 }
3772
3773 /* This function is called from the recovery daemon to verify that a remote
3774    node has the expected ip allocation.
3775    This is verified against ctdb->ip_tree
3776 */
3777 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3778 {
3779         struct ctdb_public_ip_list *tmp_ip; 
3780         int i;
3781
3782         if (ctdb->ip_tree == NULL) {
3783                 /* dont know the expected allocation yet, assume remote node
3784                    is correct. */
3785                 return 0;
3786         }
3787
3788         if (ips == NULL) {
3789                 return 0;
3790         }
3791
3792         for (i=0; i<ips->num; i++) {
3793                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3794                 if (tmp_ip == NULL) {
3795                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3796                         return -1;
3797                 }
3798
3799                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3800                         continue;
3801                 }
3802
3803                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3804                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3805                         return -1;
3806                 }
3807         }
3808
3809         return 0;
3810 }
3811
3812 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3813 {
3814         struct ctdb_public_ip_list *tmp_ip; 
3815
3816         if (ctdb->ip_tree == NULL) {
3817                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3818                 return -1;
3819         }
3820
3821         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3822         if (tmp_ip == NULL) {
3823                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3824                 return -1;
3825         }
3826
3827         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3828         tmp_ip->pnn = ip->pnn;
3829
3830         return 0;
3831 }
3832
3833
/* state kept while a reloadips child process is being waited for */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;
        /* control request that is answered from the destructor;
           set to NULL once the reply has been sent */
        struct ctdb_req_control *c;
        /* status passed in that reply */
        int status;
        /* fd[0] is read by ctdb_reloadips_child_handler() for a one byte
           result; presumably both are the ends of a pipe to the child -
           TODO confirm where the handle is created */
        int fd[2];
        /* process that gets SIGKILL from the destructor */
        pid_t child;
        /* not referenced by the functions next to this struct; presumably
           the fd event watching fd[0] - TODO confirm */
        struct fd_event *fde;
};
3842
3843 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3844 {
3845         if (h == h->ctdb->reload_ips) {
3846                 h->ctdb->reload_ips = NULL;
3847         }
3848         if (h->c != NULL) {
3849                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3850                 h->c = NULL;
3851         }
3852         ctdb_kill(h->ctdb, h->child, SIGKILL);
3853         return 0;
3854 }
3855
3856 static void ctdb_reloadips_timeout_event(struct event_context *ev,
3857                                 struct timed_event *te,
3858                                 struct timeval t, void *private_data)
3859 {
3860         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3861
3862         talloc_free(h);
3863 }       
3864
3865 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
3866                              uint16_t flags, void *private_data)
3867 {
3868         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3869
3870         char res;
3871         int ret;
3872
3873         ret = read(h->fd[0], &res, 1);
3874         if (ret < 1 || res != 0) {
3875                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3876                 res = 1;
3877         }
3878         h->status = res;
3879
3880         talloc_free(h);
3881 }
3882
3883 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3884 {
3885         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3886         struct ctdb_all_public_ips *ips;
3887         struct ctdb_vnn *vnn;
3888         int i, ret;
3889
3890         /* read the ip allocation from the local node */
3891         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
3892         if (ret != 0) {
3893                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
3894                 talloc_free(mem_ctx);
3895                 return -1;
3896         }
3897
3898         /* re-read the public ips file */
3899         ctdb->vnn = NULL;
3900         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3901                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3902                 talloc_free(mem_ctx);
3903                 return -1;
3904         }               
3905
3906
3907         /* check the previous list of ips and scan for ips that have been
3908            dropped.
3909          */
3910         for (i = 0; i < ips->num; i++) {
3911                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3912                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3913                                 break;
3914                         }
3915                 }
3916
3917                 /* we need to delete this ip, no longer available on this node */
3918                 if (vnn == NULL) {
3919                         struct ctdb_control_ip_iface pub;
3920
3921                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3922                         pub.addr  = ips->ips[i].addr;
3923                         pub.mask  = 0;
3924                         pub.len   = 0;
3925
3926                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3927                         if (ret != 0) {
3928                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3929                                 return -1;
3930                         }
3931                 }
3932         }
3933
3934
3935         /* loop over all new ones and check the ones we need to add */
3936         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3937                 for (i = 0; i < ips->num; i++) {
3938                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3939                                 break;
3940                         }
3941                 }
3942                 if (i == ips->num) {
3943                         struct ctdb_control_ip_iface pub;
3944                         const char *ifaces = NULL;
3945                         int iface = 0;
3946
3947                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
3948
3949                         pub.addr  = vnn->public_address;
3950                         pub.mask  = vnn->public_netmask_bits;
3951
3952
3953                         ifaces = vnn->ifaces[0];
3954                         iface = 1;
3955                         while (vnn->ifaces[iface] != NULL) {
3956                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
3957                                 iface++;
3958                         }
3959                         pub.len   = strlen(ifaces)+1;
3960                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
3961
3962                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3963                         if (ret != 0) {
3964                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
3965                                 return -1;
3966                         }
3967                 }
3968         }
3969
3970         return 0;
3971 }
3972
3973 /* This control is sent to force the node to re-read the public addresses file
3974    and drop any addresses we should nnot longer host, and add new addresses
3975    that we are now able to host
3976 */
3977 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
3978 {
3979         struct ctdb_reloadips_handle *h;
3980         pid_t parent = getpid();
3981
3982         if (ctdb->reload_ips != NULL) {
3983                 talloc_free(ctdb->reload_ips);
3984                 ctdb->reload_ips = NULL;
3985         }
3986
3987         h = talloc(ctdb, struct ctdb_reloadips_handle);
3988         CTDB_NO_MEMORY(ctdb, h);
3989         h->ctdb     = ctdb;
3990         h->c        = NULL;
3991         h->status   = -1;
3992         
3993         if (pipe(h->fd) == -1) {
3994                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3995                 talloc_free(h);
3996                 return -1;
3997         }
3998
3999         h->child = ctdb_fork(ctdb);
4000         if (h->child == (pid_t)-1) {
4001                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4002                 close(h->fd[0]);
4003                 close(h->fd[1]);
4004                 talloc_free(h);
4005                 return -1;
4006         }
4007
4008         /* child process */
4009         if (h->child == 0) {
4010                 signed char res = 0;
4011
4012                 close(h->fd[0]);
4013                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4014
4015                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4016                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4017                         res = -1;
4018                 } else {
4019                         res = ctdb_reloadips_child(ctdb);
4020                         if (res != 0) {
4021                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4022                         }
4023                 }
4024
4025                 write(h->fd[1], &res, 1);
4026                 /* make sure we die when our parent dies */
4027                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4028                         sleep(5);
4029                 }
4030                 _exit(0);
4031         }
4032
4033         h->c             = talloc_steal(h, c);
4034
4035         close(h->fd[1]);
4036         set_close_on_exec(h->fd[0]);
4037
4038         talloc_set_destructor(h, ctdb_reloadips_destructor);
4039
4040
4041         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4042                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4043                         (void *)h);
4044         tevent_fd_set_auto_close(h->fde);
4045
4046         event_add_timed(ctdb->ev, h,
4047                         timeval_current_ofs(120, 0),
4048                         ctdb_reloadips_timeout_event, h);
4049
4050         /* we reply later */
4051         *async_reply = true;
4052         return 0;
4053 }