ctdbd: Clean up orphaned interfaces when an IP is deleted
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value derived from the takeover_timeout tunable.  Relies on
 * a variable named "ctdb" being in scope where the macro is expanded.
 * NOTE(review): presumably an absolute "now + N seconds" timeval -
 * confirm against timeval_current_ofs().
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP schedule used by ctdb_control_send_arp():
 * CTDB_ARP_INTERVAL is the first argument given to
 * timeval_current_ofs() when re-arming the timer and CTDB_ARP_REPEAT
 * is the number of rounds sent before the ARP state is freed.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
 * A local network interface that public addresses can be assigned to.
 * Entries live on the doubly-linked list rooted at ctdb->ifaces.
 */
struct ctdb_iface {
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	const char *name;	/* interface name, matched with strcmp() */
	bool link_up;		/* only interfaces with link up are chosen */
	uint32_t references;	/* number of VNNs currently assigned here */
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
278 struct ctdb_takeover_arp {
279         struct ctdb_context *ctdb;
280         uint32_t count;
281         ctdb_sock_addr addr;
282         struct ctdb_tcp_array *tcparray;
283         struct ctdb_vnn *vnn;
284 };
285
286
287 /*
288   lists of tcp endpoints
289  */
290 struct ctdb_tcp_list {
291         struct ctdb_tcp_list *prev, *next;
292         struct ctdb_tcp_connection connection;
293 };
294
295 /*
296   list of clients to kill on IP release
297  */
298 struct ctdb_client_ip {
299         struct ctdb_client_ip *prev, *next;
300         struct ctdb_context *ctdb;
301         ctdb_sock_addr addr;
302         uint32_t client_id;
303 };
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/* Private data carried from ctdb_do_takeip() to its event callback. */
struct ctdb_do_takeip_state {
	struct ctdb_req_control *c;	/* control to reply to when done */
	struct ctdb_vnn *vnn;		/* address being taken over */
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/* Private data carried from ctdb_do_updateip() to its event callback. */
struct ctdb_do_updateip_state {
	struct ctdb_req_control *c;	/* control to reply to when done */
	struct ctdb_iface *old;		/* interface the address is leaving */
	struct ctdb_vnn *vnn;		/* address being moved */
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Check if this is a public ip known to the node, i.e. can that
1303    node takeover this ip ?
1304 */
1305 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                 struct ctdb_public_ip_list *ip)
1307 {
1308         struct ctdb_all_public_ips *public_ips;
1309         int i;
1310
1311         public_ips = ctdb->nodes[pnn]->available_public_ips;
1312
1313         if (public_ips == NULL) {
1314                 return -1;
1315         }
1316
1317         for (i=0;i<public_ips->num;i++) {
1318                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1319                         /* yes, this node can serve this public ip */
1320                         return 0;
1321                 }
1322         }
1323
1324         return -1;
1325 }
1326
1327
1328 /* search the node lists list for a node to takeover this ip.
1329    pick the node that currently are serving the least number of ips
1330    so that the ips get spread out evenly.
1331 */
1332 static int find_takeover_node(struct ctdb_context *ctdb, 
1333                 struct ctdb_node_map *nodemap, uint32_t mask, 
1334                 struct ctdb_public_ip_list *ip,
1335                 struct ctdb_public_ip_list *all_ips)
1336 {
1337         int pnn, min=0, num;
1338         int i;
1339
1340         pnn    = -1;
1341         for (i=0;i<nodemap->num;i++) {
1342                 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1343                         /* This node is not allowed to takeover any addresses
1344                         */
1345                         continue;
1346                 }
1347
1348                 if (nodemap->nodes[i].flags & mask) {
1349                         /* This node is not healty and can not be used to serve
1350                            a public address 
1351                         */
1352                         continue;
1353                 }
1354
1355                 /* verify that this node can serve this ip */
1356                 if (can_node_serve_ip(ctdb, i, ip)) {
1357                         /* no it couldnt   so skip to the next node */
1358                         continue;
1359                 }
1360
1361                 num = node_ip_coverage(ctdb, i, all_ips);
1362                 /* was this the first node we checked ? */
1363                 if (pnn == -1) {
1364                         pnn = i;
1365                         min  = num;
1366                 } else {
1367                         if (num < min) {
1368                                 pnn = i;
1369                                 min  = num;
1370                         }
1371                 }
1372         }       
1373         if (pnn == -1) {
1374                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1375                         ctdb_addr_to_str(&ip->addr)));
1376
1377                 return -1;
1378         }
1379
1380         ip->pnn = pnn;
1381         return 0;
1382 }
1383
1384 #define IP_KEYLEN       4
1385 static uint32_t *ip_key(ctdb_sock_addr *ip)
1386 {
1387         static uint32_t key[IP_KEYLEN];
1388
1389         bzero(key, sizeof(key));
1390
1391         switch (ip->sa.sa_family) {
1392         case AF_INET:
1393                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1394                 break;
1395         case AF_INET6: {
1396                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1397                 key[0]  = htonl(s6_a32[0]);
1398                 key[1]  = htonl(s6_a32[1]);
1399                 key[2]  = htonl(s6_a32[2]);
1400                 key[3]  = htonl(s6_a32[3]);
1401                 break;
1402         }
1403         default:
1404                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1405                 return key;
1406         }
1407
1408         return key;
1409 }
1410
1411 static void *add_ip_callback(void *parm, void *data)
1412 {
1413         struct ctdb_public_ip_list *this_ip = parm; 
1414         struct ctdb_public_ip_list *prev_ip = data; 
1415
1416         if (prev_ip == NULL) {
1417                 return parm;
1418         }
1419         if (this_ip->pnn == -1) {
1420                 this_ip->pnn = prev_ip->pnn;
1421         }
1422
1423         return parm;
1424 }
1425
1426 static int getips_count_callback(void *param, void *data)
1427 {
1428         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1429         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1430
1431         new_ip->next = *ip_list;
1432         *ip_list     = new_ip;
1433         return 0;
1434 }
1435
1436 static struct ctdb_public_ip_list *
1437 create_merged_ip_list(struct ctdb_context *ctdb)
1438 {
1439         int i, j;
1440         struct ctdb_public_ip_list *ip_list;
1441         struct ctdb_all_public_ips *public_ips;
1442
1443         if (ctdb->ip_tree != NULL) {
1444                 talloc_free(ctdb->ip_tree);
1445                 ctdb->ip_tree = NULL;
1446         }
1447         ctdb->ip_tree = trbt_create(ctdb, 0);
1448
1449         for (i=0;i<ctdb->num_nodes;i++) {
1450                 public_ips = ctdb->nodes[i]->known_public_ips;
1451
1452                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1453                         continue;
1454                 }
1455
1456                 /* there were no public ips for this node */
1457                 if (public_ips == NULL) {
1458                         continue;
1459                 }               
1460
1461                 for (j=0;j<public_ips->num;j++) {
1462                         struct ctdb_public_ip_list *tmp_ip; 
1463
1464                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1465                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1466                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1467                         tmp_ip->addr = public_ips->ips[j].addr;
1468                         tmp_ip->next = NULL;
1469
1470                         trbt_insertarray32_callback(ctdb->ip_tree,
1471                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1472                                 add_ip_callback,
1473                                 tmp_ip);
1474                 }
1475         }
1476
1477         ip_list = NULL;
1478         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1479
1480         return ip_list;
1481 }
1482
1483 /* 
1484  * This is the length of the longtest common prefix between the IPs.
1485  * It is calculated by XOR-ing the 2 IPs together and counting the
1486  * number of leading zeroes.  The implementation means that all
1487  * addresses end up being 128 bits long.
1488  *
1489  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1490  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1491  * lots of nodes and IP addresses?
1492  */
1493 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1494 {
1495         uint32_t ip1_k[IP_KEYLEN];
1496         uint32_t *t;
1497         int i;
1498         uint32_t x;
1499
1500         uint32_t distance = 0;
1501
1502         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1503         t = ip_key(ip2);
1504         for (i=0; i<IP_KEYLEN; i++) {
1505                 x = ip1_k[i] ^ t[i];
1506                 if (x == 0) {
1507                         distance += 32;
1508                 } else {
1509                         /* Count number of leading zeroes. 
1510                          * FIXME? This could be optimised...
1511                          */
1512                         while ((x & (1 << 31)) == 0) {
1513                                 x <<= 1;
1514                                 distance += 1;
1515                         }
1516                 }
1517         }
1518
1519         return distance;
1520 }
1521
1522 /* Calculate the IP distance for the given IP relative to IPs on the
1523    given node.  The ips argument is generally the all_ips variable
1524    used in the main part of the algorithm.
1525  */
1526 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1527                                   struct ctdb_public_ip_list *ips,
1528                                   int pnn)
1529 {
1530         struct ctdb_public_ip_list *t;
1531         uint32_t d;
1532
1533         uint32_t sum = 0;
1534
1535         for (t=ips; t != NULL; t=t->next) {
1536                 if (t->pnn != pnn) {
1537                         continue;
1538                 }
1539
1540                 /* Optimisation: We never calculate the distance
1541                  * between an address and itself.  This allows us to
1542                  * calculate the effect of removing an address from a
1543                  * node by simply calculating the distance between
1544                  * that address and all of the exitsing addresses.
1545                  * Moreover, we assume that we're only ever dealing
1546                  * with addresses from all_ips so we can identify an
1547                  * address via a pointer rather than doing a more
1548                  * expensive address comparison. */
1549                 if (&(t->addr) == ip) {
1550                         continue;
1551                 }
1552
1553                 d = ip_distance(ip, &(t->addr));
1554                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1555         }
1556
1557         return sum;
1558 }
1559
1560 /* Return the LCP2 imbalance metric for addresses currently assigned
1561    to the given node.
1562  */
1563 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1564 {
1565         struct ctdb_public_ip_list *t;
1566
1567         uint32_t imbalance = 0;
1568
1569         for (t=all_ips; t!=NULL; t=t->next) {
1570                 if (t->pnn != pnn) {
1571                         continue;
1572                 }
1573                 /* Pass the rest of the IPs rather than the whole
1574                    all_ips input list.
1575                 */
1576                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1577         }
1578
1579         return imbalance;
1580 }
1581
1582 /* Allocate any unassigned IPs just by looping through the IPs and
1583  * finding the best node for each.
1584  */
1585 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1586                                       struct ctdb_node_map *nodemap,
1587                                       uint32_t mask,
1588                                       struct ctdb_public_ip_list *all_ips)
1589 {
1590         struct ctdb_public_ip_list *tmp_ip;
1591
1592         /* loop over all ip's and find a physical node to cover for 
1593            each unassigned ip.
1594         */
1595         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1596                 if (tmp_ip->pnn == -1) {
1597                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1598                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1599                                         ctdb_addr_to_str(&tmp_ip->addr)));
1600                         }
1601                 }
1602         }
1603 }
1604
1605 /* Basic non-deterministic rebalancing algorithm.
1606  */
1607 static bool basic_failback(struct ctdb_context *ctdb,
1608                            struct ctdb_node_map *nodemap,
1609                            uint32_t mask,
1610                            struct ctdb_public_ip_list *all_ips,
1611                            int num_ips,
1612                            int *retries)
1613 {
1614         int i;
1615         int maxnode, maxnum=0, minnode, minnum=0, num;
1616         struct ctdb_public_ip_list *tmp_ip;
1617
1618         /* for each ip address, loop over all nodes that can serve
1619            this ip and make sure that the difference between the node
1620            serving the most and the node serving the least ip's are
1621            not greater than 1.
1622         */
1623         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1624                 if (tmp_ip->pnn == -1) {
1625                         continue;
1626                 }
1627
1628                 /* Get the highest and lowest number of ips's served by any 
1629                    valid node which can serve this ip.
1630                 */
1631                 maxnode = -1;
1632                 minnode = -1;
1633                 for (i=0;i<nodemap->num;i++) {
1634                         if (nodemap->nodes[i].flags & mask) {
1635                                 continue;
1636                         }
1637
1638                         /* Only check nodes that are allowed to takeover an ip */
1639                         if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1640                                 continue;
1641                         }
1642
1643                         /* only check nodes that can actually serve this ip */
1644                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1645                                 /* no it couldnt   so skip to the next node */
1646                                 continue;
1647                         }
1648
1649                         num = node_ip_coverage(ctdb, i, all_ips);
1650                         if (maxnode == -1) {
1651                                 maxnode = i;
1652                                 maxnum  = num;
1653                         } else {
1654                                 if (num > maxnum) {
1655                                         maxnode = i;
1656                                         maxnum  = num;
1657                                 }
1658                         }
1659                         if (minnode == -1) {
1660                                 minnode = i;
1661                                 minnum  = num;
1662                         } else {
1663                                 if (num < minnum) {
1664                                         minnode = i;
1665                                         minnum  = num;
1666                                 }
1667                         }
1668                 }
1669                 if (maxnode == -1) {
1670                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1671                                 ctdb_addr_to_str(&tmp_ip->addr)));
1672
1673                         continue;
1674                 }
1675
1676                 /* If we want deterministic IPs then dont try to reallocate 
1677                    them to spread out the load.
1678                 */
1679                 if (1 == ctdb->tunable.deterministic_public_ips) {
1680                         continue;
1681                 }
1682
1683                 /* if the spread between the smallest and largest coverage by
1684                    a node is >=2 we steal one of the ips from the node with
1685                    most coverage to even things out a bit.
1686                    try to do this a limited number of times since we dont
1687                    want to spend too much time balancing the ip coverage.
1688                 */
1689                 if ( (maxnum > minnum+1)
1690                      && (*retries < (num_ips + 5)) ){
1691                         struct ctdb_public_ip_list *tmp;
1692
1693                         /* mark one of maxnode's vnn's as unassigned and try
1694                            again
1695                         */
1696                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1697                                 if (tmp->pnn == maxnode) {
1698                                         tmp->pnn = -1;
1699                                         (*retries)++;
1700                                         return true;
1701                                 }
1702                         }
1703                 }
1704         }
1705
1706         return false;
1707 }
1708
1709 struct ctdb_rebalancenodes {
1710         struct ctdb_rebalancenodes *next;
1711         uint32_t pnn;
1712 };
1713 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1714
1715
1716 /* set this flag to force the node to be rebalanced even if it just didnt
1717    become healthy again.
1718 */
1719 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1720 {
1721         struct ctdb_rebalancenodes *rebalance;
1722
1723         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1724                 if (rebalance->pnn == pnn) {
1725                         return;
1726                 }
1727         }
1728
1729         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1730         rebalance->pnn = pnn;
1731         rebalance->next = force_rebalance_list;
1732         force_rebalance_list = rebalance;
1733 }
1734
1735 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1736  * that we can unit test it.
1737  */
1738 static void lcp2_init(struct ctdb_context * tmp_ctx,
1739                struct ctdb_node_map * nodemap,
1740                uint32_t mask,
1741                struct ctdb_public_ip_list *all_ips,
1742                uint32_t **lcp2_imbalances,
1743                bool **newly_healthy)
1744 {
1745         int i;
1746         struct ctdb_public_ip_list *tmp_ip;
1747
1748         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1750         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1751         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752
1753         for (i=0;i<nodemap->num;i++) {
1754                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1755                 /* First step: is the node "healthy"? */
1756                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1757         }
1758
1759         /* 2nd step: if a ndoe has IPs assigned then it must have been
1760          * healthy before, so we remove it from consideration... */
1761         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1762                 if (tmp_ip->pnn != -1) {
1763                         (*newly_healthy)[tmp_ip->pnn] = false;
1764                 }
1765         }
1766
1767         /* 3rd step: if a node is forced to re-balance then
1768            we allow failback onto the node */
1769         while (force_rebalance_list != NULL) {
1770                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1771
1772                 if (force_rebalance_list->pnn <= nodemap->num) {
1773                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1774                 }
1775
1776                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1777                 talloc_free(force_rebalance_list);
1778                 force_rebalance_list = next;
1779         }
1780 }
1781
1782 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1783  * the IP/node combination that will cost the least.
1784  */
1785 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1786                               struct ctdb_node_map *nodemap,
1787                               uint32_t mask,
1788                               struct ctdb_public_ip_list *all_ips,
1789                               uint32_t *lcp2_imbalances)
1790 {
1791         struct ctdb_public_ip_list *tmp_ip;
1792         int dstnode;
1793
1794         int minnode;
1795         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1796         struct ctdb_public_ip_list *minip;
1797
1798         bool should_loop = true;
1799         bool have_unassigned = true;
1800
1801         while (have_unassigned && should_loop) {
1802                 should_loop = false;
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1806
1807                 minnode = -1;
1808                 mindsum = 0;
1809                 minip = NULL;
1810
1811                 /* loop over each unassigned ip. */
1812                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1813                         if (tmp_ip->pnn != -1) {
1814                                 continue;
1815                         }
1816
1817                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1818                                 /* Only check nodes that are allowed to takeover an ip */
1819                                 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1820                                         continue;
1821                                 }
1822
1823                                 /* only check nodes that can actually serve this ip */
1824                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1825                                         /* no it couldnt   so skip to the next node */
1826                                         continue;
1827                                 }
1828                                 if (nodemap->nodes[dstnode].flags & mask) {
1829                                         continue;
1830                                 }
1831
1832                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1833                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1834                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1835                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1836                                                    dstnode,
1837                                                    dstimbl - lcp2_imbalances[dstnode]));
1838
1839
1840                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1841                                         minnode = dstnode;
1842                                         minimbl = dstimbl;
1843                                         mindsum = dstdsum;
1844                                         minip = tmp_ip;
1845                                         should_loop = true;
1846                                 }
1847                         }
1848                 }
1849
1850                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1851
1852                 /* If we found one then assign it to the given node. */
1853                 if (minnode != -1) {
1854                         minip->pnn = minnode;
1855                         lcp2_imbalances[minnode] = minimbl;
1856                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1857                                           ctdb_addr_to_str(&(minip->addr)),
1858                                           minnode,
1859                                           mindsum));
1860                 }
1861
1862                 /* There might be a better way but at least this is clear. */
1863                 have_unassigned = false;
1864                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1865                         if (tmp_ip->pnn == -1) {
1866                                 have_unassigned = true;
1867                         }
1868                 }
1869         }
1870
1871         /* We know if we have an unassigned addresses so we might as
1872          * well optimise.
1873          */
1874         if (have_unassigned) {
1875                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876                         if (tmp_ip->pnn == -1) {
1877                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1878                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1879                         }
1880                 }
1881         }
1882 }
1883
1884 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1885  * to move IPs from, determines the best IP/destination node
1886  * combination to move from the source node.
1887  */
1888 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1889                                     struct ctdb_node_map *nodemap,
1890                                     struct ctdb_public_ip_list *all_ips,
1891                                     int srcnode,
1892                                     uint32_t candimbl,
1893                                     uint32_t *lcp2_imbalances,
1894                                     bool *newly_healthy)
1895 {
1896         int dstnode, mindstnode;
1897         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1898         uint32_t minsrcimbl, mindstimbl;
1899         struct ctdb_public_ip_list *minip;
1900         struct ctdb_public_ip_list *tmp_ip;
1901
1902         /* Find an IP and destination node that best reduces imbalance. */
1903         minip = NULL;
1904         minsrcimbl = 0;
1905         mindstnode = -1;
1906         mindstimbl = 0;
1907
1908         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1909         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1910
1911         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1912                 /* Only consider addresses on srcnode. */
1913                 if (tmp_ip->pnn != srcnode) {
1914                         continue;
1915                 }
1916
1917                 /* What is this IP address costing the source node? */
1918                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1919                 srcimbl = candimbl - srcdsum;
1920
1921                 /* Consider this IP address would cost each potential
1922                  * destination node.  Destination nodes are limited to
1923                  * those that are newly healthy, since we don't want
1924                  * to do gratuitous failover of IPs just to make minor
1925                  * balance improvements.
1926                  */
1927                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1928                         if (! newly_healthy[dstnode]) {
1929                                 continue;
1930                         }
1931
1932                         /* Only check nodes that are allowed to takeover an ip */
1933                         if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1934                                 continue;
1935                         }
1936
1937                         /* only check nodes that can actually serve this ip */
1938                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1939                                 /* no it couldnt   so skip to the next node */
1940                                 continue;
1941                         }
1942
1943                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1944                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1945                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1946                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1947                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1948                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1949
1950                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1951                             ((mindstnode == -1) ||                              \
1952                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1953
1954                                 minip = tmp_ip;
1955                                 minsrcimbl = srcimbl;
1956                                 mindstnode = dstnode;
1957                                 mindstimbl = dstimbl;
1958                         }
1959                 }
1960         }
1961         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1962
1963         if (mindstnode != -1) {
1964                 /* We found a move that makes things better... */
1965                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1966                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1967                                   ctdb_addr_to_str(&(minip->addr)),
1968                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1969
1970
1971                 lcp2_imbalances[srcnode] = srcimbl;
1972                 lcp2_imbalances[mindstnode] = mindstimbl;
1973                 minip->pnn = mindstnode;
1974
1975                 return true;
1976         }
1977
1978         return false;
1979         
1980 }
1981
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.
 *
 * Orders by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.  The pnn field
 * does not take part in the comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2000
2001 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2002  * node with the highest LCP2 imbalance, and then determines the best
2003  * IP/destination node combination to move from the source node.
2004  */
2005 static bool lcp2_failback(struct ctdb_context *ctdb,
2006                           struct ctdb_node_map *nodemap,
2007                           uint32_t mask,
2008                           struct ctdb_public_ip_list *all_ips,
2009                           uint32_t *lcp2_imbalances,
2010                           bool *newly_healthy)
2011 {
2012         int i, num_newly_healthy;
2013         struct lcp2_imbalance_pnn * lips;
2014         bool ret;
2015
2016         /* It is only worth continuing if we have suitable target
2017          * nodes to transfer IPs to.  This check is much cheaper than
2018          * continuing on...
2019          */
2020         num_newly_healthy = 0;
2021         for (i = 0; i < nodemap->num; i++) {
2022                 if (newly_healthy[i]) {
2023                         num_newly_healthy++;
2024                 }
2025         }
2026         if (num_newly_healthy == 0) {
2027                 return false;
2028         }
2029
2030         /* Put the imbalances and nodes into an array, sort them and
2031          * iterate through candidates.  Usually the 1st one will be
2032          * used, so this doesn't cost much...
2033          */
2034         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2035         for (i = 0; i < nodemap->num; i++) {
2036                 lips[i].imbalance = lcp2_imbalances[i];
2037                 lips[i].pnn = i;
2038         }
2039         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2040               lcp2_cmp_imbalance_pnn);
2041
2042         ret = false;
2043         for (i = 0; i < nodemap->num; i++) {
2044                 /* This means that all nodes had 0 or 1 addresses, so
2045                  * can't be imbalanced.
2046                  */
2047                 if (lips[i].imbalance == 0) {
2048                         break;
2049                 }
2050
2051                 if (lcp2_failback_candidate(ctdb,
2052                                             nodemap,
2053                                             all_ips,
2054                                             lips[i].pnn,
2055                                             lips[i].imbalance,
2056                                             lcp2_imbalances,
2057                                             newly_healthy)) {
2058                         ret = true;
2059                         break;
2060                 }
2061         }
2062
2063         talloc_free(lips);
2064         return ret;
2065 }
2066
2067 /* The calculation part of the IP allocation algorithm. */
2068 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2069                                    struct ctdb_node_map *nodemap,
2070                                    struct ctdb_public_ip_list **all_ips_p)
2071 {
2072         int i, num_healthy, retries, num_ips;
2073         uint32_t mask;
2074         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2075         uint32_t *lcp2_imbalances;
2076         bool *newly_healthy;
2077
2078         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2079
2080         /* Count how many completely healthy nodes we have */
2081         num_healthy = 0;
2082         for (i=0;i<nodemap->num;i++) {
2083                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2084                         num_healthy++;
2085                 }
2086         }
2087
2088         /* If we have healthy nodes then we will only consider them
2089            for serving public addresses
2090         */
2091         mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
2092         if ((num_healthy == 0) &&
2093             (ctdb->tunable.no_ip_takeover_on_disabled == 0)) {
2094                 /* We didnt have any completely healthy nodes so
2095                    use "disabled" nodes as a fallback
2096                 */
2097                 mask = NODE_FLAGS_INACTIVE;
2098         }
2099
2100         /* since nodes only know about those public addresses that
2101            can be served by that particular node, no single node has
2102            a full list of all public addresses that exist in the cluster.
2103            Walk over all node structures and create a merged list of
2104            all public addresses that exist in the cluster.
2105
2106            keep the tree of ips around as ctdb->ip_tree
2107         */
2108         all_ips = create_merged_ip_list(ctdb);
2109         *all_ips_p = all_ips; /* minimal code changes */
2110
2111         /* Count how many ips we have */
2112         num_ips = 0;
2113         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2114                 num_ips++;
2115         }
2116
2117         /* If we want deterministic ip allocations, i.e. that the ip addresses
2118            will always be allocated the same way for a specific set of
2119            available/unavailable nodes.
2120         */
2121         if (1 == ctdb->tunable.deterministic_public_ips) {              
2122                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2123                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2124                         tmp_ip->pnn = i%nodemap->num;
2125                 }
2126         }
2127
2128
2129         /* mark all public addresses with a masked node as being served by
2130            node -1
2131         */
2132         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2133                 if (tmp_ip->pnn == -1) {
2134                         continue;
2135                 }
2136                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
2137                         tmp_ip->pnn = -1;
2138                 }
2139         }
2140
2141         /* verify that the assigned nodes can serve that public ip
2142            and set it to -1 if not
2143         */
2144         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2145                 if (tmp_ip->pnn == -1) {
2146                         continue;
2147                 }
2148                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2149                         /* this node can not serve this ip. */
2150                         tmp_ip->pnn = -1;
2151                 }
2152         }
2153
2154         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2155                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2156         }
2157
2158         /* now we must redistribute all public addresses with takeover node
2159            -1 among the nodes available
2160         */
2161         retries = 0;
2162 try_again:
2163         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2164                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2165         } else {
2166                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2167         }
2168
2169         /* If we dont want ips to fail back after a node becomes healthy
2170            again, we wont even try to reallocat the ip addresses so that
2171            they are evenly spread out.
2172            This can NOT be used at the same time as DeterministicIPs !
2173         */
2174         if (1 == ctdb->tunable.no_ip_failback) {
2175                 if (1 == ctdb->tunable.deterministic_public_ips) {
2176                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
2177                 }
2178                 goto finished;
2179         }
2180
2181
2182         /* now, try to make sure the ip adresses are evenly distributed
2183            across the node.
2184         */
2185         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2186                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
2187                         goto try_again;
2188                 }
2189         } else {
2190                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
2191                         goto try_again;
2192                 }
2193         }
2194
2195         /* finished distributing the public addresses, now just send the 
2196            info out to the nodes */
2197 finished:
2198         /* at this point ->pnn is the node which will own each IP
2199            or -1 if there is no node that can cover this ip
2200         */
2201
2202         return;
2203 }
2204
2205 static void noiptakeover_cb(struct ctdb_context *ctdb, uint32_t pnn, int32_t res, TDB_DATA outdata, void *callback)
2206 {
2207         struct ctdb_node_map *nodemap = (struct ctdb_node_map *)callback;
2208
2209         if (res != 0) {
2210                 DEBUG(DEBUG_ERR,("Failure to read NoIPTakeover tunable from remote node %d\n", pnn));
2211                 return;
2212         }
2213
2214         if (outdata.dsize != sizeof(uint32_t)) {
2215                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading NoIPTakeover tunable from node %d. Expected %d bytes but received %d bytes\n", pnn, (int)sizeof(uint32_t), (int)outdata.dsize));
2216                 return;
2217         }
2218
2219         if (pnn >= nodemap->num) {
2220                 DEBUG(DEBUG_ERR,("Got NoIPTakeover reply from node %d but nodemap only has %d entries\n", pnn, nodemap->num));
2221                 return;
2222         }
2223
2224         if (*(uint32_t *)outdata.dptr != 0) {
2225                 nodemap->nodes[pnn].flags |= NODE_FLAGS_NOIPTAKEOVER;
2226         }
2227 }
2228
2229 /*
2230   make any IP alias changes for public addresses that are necessary 
2231  */
2232 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2233                       client_async_callback fail_callback, void *callback_data)
2234 {
2235         int i;
2236         struct ctdb_public_ip ip;
2237         struct ctdb_public_ipv4 ipv4;
2238         struct ctdb_control_get_tunable *t;
2239         uint32_t *nodes;
2240         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2241         TDB_DATA data;
2242         struct timeval timeout;
2243         struct client_async_data *async_data;
2244         struct ctdb_client_control_state *state;
2245         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2246         uint32_t disable_timeout;
2247
2248         /*
2249          * ip failover is completely disabled, just send out the 
2250          * ipreallocated event.
2251          */
2252         if (ctdb->tunable.disable_ip_failover != 0) {
2253                 goto ipreallocated;
2254         }
2255
2256
2257         /* assume all nodes do support failback */
2258         for (i=0;i<nodemap->num;i++) {
2259                 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
2260         }
2261         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen("NoIPTakeover") + 1;
2262         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2263         t = (struct ctdb_control_get_tunable *)data.dptr;
2264         t->length = strlen("NoIPTakeover")+1;
2265         memcpy(t->name, "NoIPTakeover", t->length);
2266         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2267         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2268                                       nodes, 0, TAKEOVER_TIMEOUT(),
2269                                       false, data,
2270                                       noiptakeover_cb, NULL,
2271                                       nodemap) != 0) {
2272                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get noiptakeover tunable failed\n"));
2273         }
2274         talloc_free(nodes);
2275         talloc_free(data.dptr);
2276
2277
2278         ZERO_STRUCT(ip);
2279
2280         /* Do the IP reassignment calculations */
2281         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2282
2283         /* The recovery daemon does regular sanity checks of the IPs.
2284          * However, sometimes it is overzealous and thinks changes are
2285          * required when they're already underway.  This stops the
2286          * checks for a while before we start moving IPs.
2287          */
2288         disable_timeout = ctdb->tunable.takeover_timeout;
2289         data.dptr  = (uint8_t*)&disable_timeout;
2290         data.dsize = sizeof(disable_timeout);
2291         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2292                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2293                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2294         }
2295
2296         /* now tell all nodes to delete any alias that they should not
2297            have.  This will be a NOOP on nodes that don't currently
2298            hold the given alias */
2299         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2300         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2301
2302         async_data->fail_callback = fail_callback;
2303         async_data->callback_data = callback_data;
2304
2305         for (i=0;i<nodemap->num;i++) {
2306                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2307                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2308                         continue;
2309                 }
2310
2311                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2312                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2313                                 /* This node should be serving this
2314                                    vnn so dont tell it to release the ip
2315                                 */
2316                                 continue;
2317                         }
2318                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2319                                 ipv4.pnn = tmp_ip->pnn;
2320                                 ipv4.sin = tmp_ip->addr.ip;
2321
2322                                 timeout = TAKEOVER_TIMEOUT();
2323                                 data.dsize = sizeof(ipv4);
2324                                 data.dptr  = (uint8_t *)&ipv4;
2325                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2326                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2327                                                 data, async_data,
2328                                                 &timeout, NULL);
2329                         } else {
2330                                 ip.pnn  = tmp_ip->pnn;
2331                                 ip.addr = tmp_ip->addr;
2332
2333                                 timeout = TAKEOVER_TIMEOUT();
2334                                 data.dsize = sizeof(ip);
2335                                 data.dptr  = (uint8_t *)&ip;
2336                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2337                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2338                                                 data, async_data,
2339                                                 &timeout, NULL);
2340                         }
2341
2342                         if (state == NULL) {
2343                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2344                                 talloc_free(tmp_ctx);
2345                                 return -1;
2346                         }
2347                 
2348                         ctdb_client_async_add(async_data, state);
2349                 }
2350         }
2351         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2352                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2353                 talloc_free(tmp_ctx);
2354                 return -1;
2355         }
2356         talloc_free(async_data);
2357
2358
2359         /* tell all nodes to get their own IPs */
2360         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2361         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2362
2363         async_data->fail_callback = fail_callback;
2364         async_data->callback_data = callback_data;
2365
2366         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2367                 if (tmp_ip->pnn == -1) {
2368                         /* this IP won't be taken over */
2369                         continue;
2370                 }
2371
2372                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2373                         ipv4.pnn = tmp_ip->pnn;
2374                         ipv4.sin = tmp_ip->addr.ip;
2375
2376                         timeout = TAKEOVER_TIMEOUT();
2377                         data.dsize = sizeof(ipv4);
2378                         data.dptr  = (uint8_t *)&ipv4;
2379                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2380                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2381                                         data, async_data,
2382                                         &timeout, NULL);
2383                 } else {
2384                         ip.pnn  = tmp_ip->pnn;
2385                         ip.addr = tmp_ip->addr;
2386
2387                         timeout = TAKEOVER_TIMEOUT();
2388                         data.dsize = sizeof(ip);
2389                         data.dptr  = (uint8_t *)&ip;
2390                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2391                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2392                                         data, async_data,
2393                                         &timeout, NULL);
2394                 }
2395                 if (state == NULL) {
2396                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2397                         talloc_free(tmp_ctx);
2398                         return -1;
2399                 }
2400                 
2401                 ctdb_client_async_add(async_data, state);
2402         }
2403         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2404                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2405                 talloc_free(tmp_ctx);
2406                 return -1;
2407         }
2408
2409 ipreallocated:
2410         /* 
2411          * Tell all nodes to run eventscripts to process the
2412          * "ipreallocated" event.  This can do a lot of things,
2413          * including restarting services to reconfigure them if public
2414          * IPs have moved.  Once upon a time this event only used to
2415          * update natwg.
2416          */
2417         data.dptr  = discard_const("ipreallocated");
2418         data.dsize = strlen((char *)data.dptr) + 1; 
2419         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2420         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2421                                       nodes, 0, TAKEOVER_TIMEOUT(),
2422                                       false, data,
2423                                       NULL, fail_callback,
2424                                       callback_data) != 0) {
2425                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2426         }
2427
2428         talloc_free(tmp_ctx);
2429         return 0;
2430 }
2431
2432
2433 /*
2434   destroy a ctdb_client_ip structure
2435  */
2436 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2437 {
2438         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2439                 ctdb_addr_to_str(&ip->addr),
2440                 ntohs(ip->addr.ip.sin_port),
2441                 ip->client_id));
2442
2443         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2444         return 0;
2445 }
2446
2447 /*
2448   called by a client to inform us of a TCP connection that it is managing
2449   that should tickled with an ACK when IP takeover is done
2450   we handle both the old ipv4 style of packets as well as the new ipv4/6
2451   pdus.
2452  */
2453 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2454                                 TDB_DATA indata)
2455 {
2456         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2457         struct ctdb_control_tcp *old_addr = NULL;
2458         struct ctdb_control_tcp_addr new_addr;
2459         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2460         struct ctdb_tcp_list *tcp;
2461         struct ctdb_tcp_connection t;
2462         int ret;
2463         TDB_DATA data;
2464         struct ctdb_client_ip *ip;
2465         struct ctdb_vnn *vnn;
2466         ctdb_sock_addr addr;
2467
2468         switch (indata.dsize) {
2469         case sizeof(struct ctdb_control_tcp):
2470                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2471                 ZERO_STRUCT(new_addr);
2472                 tcp_sock = &new_addr;
2473                 tcp_sock->src.ip  = old_addr->src;
2474                 tcp_sock->dest.ip = old_addr->dest;
2475                 break;
2476         case sizeof(struct ctdb_control_tcp_addr):
2477                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2478                 break;
2479         default:
2480                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2481                                  "to ctdb_control_tcp_client. size was %d but "
2482                                  "only allowed sizes are %lu and %lu\n",
2483                                  (int)indata.dsize,
2484                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2485                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2486                 return -1;
2487         }
2488
2489         addr = tcp_sock->src;
2490         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2491         addr = tcp_sock->dest;
2492         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2493
2494         ZERO_STRUCT(addr);
2495         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2496         vnn = find_public_ip_vnn(ctdb, &addr);
2497         if (vnn == NULL) {
2498                 switch (addr.sa.sa_family) {
2499                 case AF_INET:
2500                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2501                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2502                                         ctdb_addr_to_str(&addr)));
2503                         }
2504                         break;
2505                 case AF_INET6:
2506                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2507                                 ctdb_addr_to_str(&addr)));
2508                         break;
2509                 default:
2510                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2511                 }
2512
2513                 return 0;
2514         }
2515
2516         if (vnn->pnn != ctdb->pnn) {
2517                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2518                         ctdb_addr_to_str(&addr),
2519                         client_id, client->pid));
2520                 /* failing this call will tell smbd to die */
2521                 return -1;
2522         }
2523
2524         ip = talloc(client, struct ctdb_client_ip);
2525         CTDB_NO_MEMORY(ctdb, ip);
2526
2527         ip->ctdb      = ctdb;
2528         ip->addr      = addr;
2529         ip->client_id = client_id;
2530         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2531         DLIST_ADD(ctdb->client_ip_list, ip);
2532
2533         tcp = talloc(client, struct ctdb_tcp_list);
2534         CTDB_NO_MEMORY(ctdb, tcp);
2535
2536         tcp->connection.src_addr = tcp_sock->src;
2537         tcp->connection.dst_addr = tcp_sock->dest;
2538
2539         DLIST_ADD(client->tcp_list, tcp);
2540
2541         t.src_addr = tcp_sock->src;
2542         t.dst_addr = tcp_sock->dest;
2543
2544         data.dptr = (uint8_t *)&t;
2545         data.dsize = sizeof(t);
2546
2547         switch (addr.sa.sa_family) {
2548         case AF_INET:
2549                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2550                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2551                         ctdb_addr_to_str(&tcp_sock->src),
2552                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2553                 break;
2554         case AF_INET6:
2555                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2556                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2557                         ctdb_addr_to_str(&tcp_sock->src),
2558                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2559                 break;
2560         default:
2561                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2562         }
2563
2564
2565         /* tell all nodes about this tcp connection */
2566         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2567                                        CTDB_CONTROL_TCP_ADD,
2568                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2569         if (ret != 0) {
2570                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2571                 return -1;
2572         }
2573
2574         return 0;
2575 }
2576
2577 /*
2578   find a tcp address on a list
2579  */
2580 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2581                                            struct ctdb_tcp_connection *tcp)
2582 {
2583         int i;
2584
2585         if (array == NULL) {
2586                 return NULL;
2587         }
2588
2589         for (i=0;i<array->num;i++) {
2590                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2591                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2592                         return &array->connections[i];
2593                 }
2594         }
2595         return NULL;
2596 }
2597
2598
2599
2600 /*
2601   called by a daemon to inform us of a TCP connection that one of its
2602   clients managing that should tickled with an ACK when IP takeover is
2603   done
2604  */
2605 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2606 {
2607         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2608         struct ctdb_tcp_array *tcparray;
2609         struct ctdb_tcp_connection tcp;
2610         struct ctdb_vnn *vnn;
2611
2612         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2613         if (vnn == NULL) {
2614                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2615                         ctdb_addr_to_str(&p->dst_addr)));
2616
2617                 return -1;
2618         }
2619
2620
2621         tcparray = vnn->tcp_array;
2622
2623         /* If this is the first tickle */
2624         if (tcparray == NULL) {
2625                 tcparray = talloc_size(ctdb->nodes, 
2626                         offsetof(struct ctdb_tcp_array, connections) +
2627                         sizeof(struct ctdb_tcp_connection) * 1);
2628                 CTDB_NO_MEMORY(ctdb, tcparray);
2629                 vnn->tcp_array = tcparray;
2630
2631                 tcparray->num = 0;
2632                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2633                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2634
2635                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2636                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2637                 tcparray->num++;
2638
2639                 if (tcp_update_needed) {
2640                         vnn->tcp_update_needed = true;
2641                 }
2642                 return 0;
2643         }
2644
2645
2646         /* Do we already have this tickle ?*/
2647         tcp.src_addr = p->src_addr;
2648         tcp.dst_addr = p->dst_addr;
2649         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2650                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2651                         ctdb_addr_to_str(&tcp.dst_addr),
2652                         ntohs(tcp.dst_addr.ip.sin_port),
2653                         vnn->pnn));
2654                 return 0;
2655         }
2656
2657         /* A new tickle, we must add it to the array */
2658         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2659                                         struct ctdb_tcp_connection,
2660                                         tcparray->num+1);
2661         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2662
2663         vnn->tcp_array = tcparray;
2664         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2665         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2666         tcparray->num++;
2667                                 
2668         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2669                 ctdb_addr_to_str(&tcp.dst_addr),
2670                 ntohs(tcp.dst_addr.ip.sin_port),
2671                 vnn->pnn));
2672
2673         if (tcp_update_needed) {
2674                 vnn->tcp_update_needed = true;
2675         }
2676
2677         return 0;
2678 }
2679
2680
2681 /*
2682   called by a daemon to inform us of a TCP connection that one of its
2683   clients managing that should tickled with an ACK when IP takeover is
2684   done
2685  */
2686 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2687 {
2688         struct ctdb_tcp_connection *tcpp;
2689         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2690
2691         if (vnn == NULL) {
2692                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2693                         ctdb_addr_to_str(&conn->dst_addr)));
2694                 return;
2695         }
2696
2697         /* if the array is empty we cant remove it
2698            and we dont need to do anything
2699          */
2700         if (vnn->tcp_array == NULL) {
2701                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2702                         ctdb_addr_to_str(&conn->dst_addr),
2703                         ntohs(conn->dst_addr.ip.sin_port)));
2704                 return;
2705         }
2706
2707
2708         /* See if we know this connection
2709            if we dont know this connection  then we dont need to do anything
2710          */
2711         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2712         if (tcpp == NULL) {
2713                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2714                         ctdb_addr_to_str(&conn->dst_addr),
2715                         ntohs(conn->dst_addr.ip.sin_port)));
2716                 return;
2717         }
2718
2719
2720         /* We need to remove this entry from the array.
2721            Instead of allocating a new array and copying data to it
2722            we cheat and just copy the last entry in the existing array
2723            to the entry that is to be removed and just shring the 
2724            ->num field
2725          */
2726         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2727         vnn->tcp_array->num--;
2728
2729         /* If we deleted the last entry we also need to remove the entire array
2730          */
2731         if (vnn->tcp_array->num == 0) {
2732                 talloc_free(vnn->tcp_array);
2733                 vnn->tcp_array = NULL;
2734         }               
2735
2736         vnn->tcp_update_needed = true;
2737
2738         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2739                 ctdb_addr_to_str(&conn->src_addr),
2740                 ntohs(conn->src_addr.ip.sin_port)));
2741 }
2742
2743
2744 /*
2745   called by a daemon to inform us of a TCP connection that one of its
2746   clients used are no longer needed in the tickle database
2747  */
2748 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2749 {
2750         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2751
2752         ctdb_remove_tcp_connection(ctdb, conn);
2753
2754         return 0;
2755 }
2756
2757
/*
  called when a daemon restarts.  The intent is to immediately send the
  new node all tickles for the public addresses we are serving; that is
  not implemented yet, so this is currently a no-op returning 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        return 0;
}
2767
2768
2769 /*
2770   called when a client structure goes away - hook to remove
2771   elements from the tcp_list in all daemons
2772  */
2773 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2774 {
2775         while (client->tcp_list) {
2776                 struct ctdb_tcp_list *tcp = client->tcp_list;
2777                 DLIST_REMOVE(client->tcp_list, tcp);
2778                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2779         }
2780 }
2781
2782
2783 /*
2784   release all IPs on shutdown
2785  */
2786 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2787 {
2788         struct ctdb_vnn *vnn;
2789
2790         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2791                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2792                         ctdb_vnn_unassign_iface(ctdb, vnn);
2793                         continue;
2794                 }
2795                 if (!vnn->iface) {
2796                         continue;
2797                 }
2798                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2799                                   ctdb_vnn_iface_string(vnn),
2800                                   ctdb_addr_to_str(&vnn->public_address),
2801                                   vnn->public_netmask_bits);
2802                 release_kill_clients(ctdb, &vnn->public_address);
2803                 ctdb_vnn_unassign_iface(ctdb, vnn);
2804         }
2805 }
2806
2807
2808 /*
2809   get list of public IPs
2810  */
2811 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2812                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2813 {
2814         int i, num, len;
2815         struct ctdb_all_public_ips *ips;
2816         struct ctdb_vnn *vnn;
2817         bool only_available = false;
2818
2819         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2820                 only_available = true;
2821         }
2822
2823         /* count how many public ip structures we have */
2824         num = 0;
2825         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2826                 num++;
2827         }
2828
2829         len = offsetof(struct ctdb_all_public_ips, ips) + 
2830                 num*sizeof(struct ctdb_public_ip);
2831         ips = talloc_zero_size(outdata, len);
2832         CTDB_NO_MEMORY(ctdb, ips);
2833
2834         i = 0;
2835         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2836                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2837                         continue;
2838                 }
2839                 ips->ips[i].pnn  = vnn->pnn;
2840                 ips->ips[i].addr = vnn->public_address;
2841                 i++;
2842         }
2843         ips->num = i;
2844         len = offsetof(struct ctdb_all_public_ips, ips) +
2845                 i*sizeof(struct ctdb_public_ip);
2846
2847         outdata->dsize = len;
2848         outdata->dptr  = (uint8_t *)ips;
2849
2850         return 0;
2851 }
2852
2853
2854 /*
2855   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2856  */
2857 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2858                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2859 {
2860         int i, num, len;
2861         struct ctdb_all_public_ipsv4 *ips;
2862         struct ctdb_vnn *vnn;
2863
2864         /* count how many public ip structures we have */
2865         num = 0;
2866         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2867                 if (vnn->public_address.sa.sa_family != AF_INET) {
2868                         continue;
2869                 }
2870                 num++;
2871         }
2872
2873         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2874                 num*sizeof(struct ctdb_public_ipv4);
2875         ips = talloc_zero_size(outdata, len);
2876         CTDB_NO_MEMORY(ctdb, ips);
2877
2878         outdata->dsize = len;
2879         outdata->dptr  = (uint8_t *)ips;
2880
2881         ips->num = num;
2882         i = 0;
2883         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2884                 if (vnn->public_address.sa.sa_family != AF_INET) {
2885                         continue;
2886                 }
2887                 ips->ips[i].pnn = vnn->pnn;
2888                 ips->ips[i].sin = vnn->public_address.ip;
2889                 i++;
2890         }
2891
2892         return 0;
2893 }
2894
2895 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2896                                         struct ctdb_req_control *c,
2897                                         TDB_DATA indata,
2898                                         TDB_DATA *outdata)
2899 {
2900         int i, num, len;
2901         ctdb_sock_addr *addr;
2902         struct ctdb_control_public_ip_info *info;
2903         struct ctdb_vnn *vnn;
2904
2905         addr = (ctdb_sock_addr *)indata.dptr;
2906
2907         vnn = find_public_ip_vnn(ctdb, addr);
2908         if (vnn == NULL) {
2909                 /* if it is not a public ip   it could be our 'single ip' */
2910                 if (ctdb->single_ip_vnn) {
2911                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2912                                 vnn = ctdb->single_ip_vnn;
2913                         }
2914                 }
2915         }
2916         if (vnn == NULL) {
2917                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2918                                  "'%s'not a public address\n",
2919                                  ctdb_addr_to_str(addr)));
2920                 return -1;
2921         }
2922
2923         /* count how many public ip structures we have */
2924         num = 0;
2925         for (;vnn->ifaces[num];) {
2926                 num++;
2927         }
2928
2929         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2930                 num*sizeof(struct ctdb_control_iface_info);
2931         info = talloc_zero_size(outdata, len);
2932         CTDB_NO_MEMORY(ctdb, info);
2933
2934         info->ip.addr = vnn->public_address;
2935         info->ip.pnn = vnn->pnn;
2936         info->active_idx = 0xFFFFFFFF;
2937
2938         for (i=0; vnn->ifaces[i]; i++) {
2939                 struct ctdb_iface *cur;
2940
2941                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2942                 if (cur == NULL) {
2943                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2944                                            vnn->ifaces[i]));
2945                         return -1;
2946                 }
2947                 if (vnn->iface == cur) {
2948                         info->active_idx = i;
2949                 }
2950                 strcpy(info->ifaces[i].name, cur->name);
2951                 info->ifaces[i].link_state = cur->link_up;
2952                 info->ifaces[i].references = cur->references;
2953         }
2954         info->num = i;
2955         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2956                 i*sizeof(struct ctdb_control_iface_info);
2957
2958         outdata->dsize = len;
2959         outdata->dptr  = (uint8_t *)info;
2960
2961         return 0;
2962 }
2963
2964 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2965                                 struct ctdb_req_control *c,
2966                                 TDB_DATA *outdata)
2967 {
2968         int i, num, len;
2969         struct ctdb_control_get_ifaces *ifaces;
2970         struct ctdb_iface *cur;
2971
2972         /* count how many public ip structures we have */
2973         num = 0;
2974         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2975                 num++;
2976         }
2977
2978         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2979                 num*sizeof(struct ctdb_control_iface_info);
2980         ifaces = talloc_zero_size(outdata, len);
2981         CTDB_NO_MEMORY(ctdb, ifaces);
2982
2983         i = 0;
2984         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2985                 strcpy(ifaces->ifaces[i].name, cur->name);
2986                 ifaces->ifaces[i].link_state = cur->link_up;
2987                 ifaces->ifaces[i].references = cur->references;
2988                 i++;
2989         }
2990         ifaces->num = i;
2991         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2992                 i*sizeof(struct ctdb_control_iface_info);
2993
2994         outdata->dsize = len;
2995         outdata->dptr  = (uint8_t *)ifaces;
2996
2997         return 0;
2998 }
2999
3000 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3001                                     struct ctdb_req_control *c,
3002                                     TDB_DATA indata)
3003 {
3004         struct ctdb_control_iface_info *info;
3005         struct ctdb_iface *iface;
3006         bool link_up = false;
3007
3008         info = (struct ctdb_control_iface_info *)indata.dptr;
3009
3010         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3011                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3012                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3013                                   len, len, info->name));
3014                 return -1;
3015         }
3016
3017         switch (info->link_state) {
3018         case 0:
3019                 link_up = false;
3020                 break;
3021         case 1:
3022                 link_up = true;
3023                 break;
3024         default:
3025                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3026                                   (unsigned int)info->link_state));
3027                 return -1;
3028         }
3029
3030         if (info->references != 0) {
3031                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3032                                   (unsigned int)info->references));
3033                 return -1;
3034         }
3035
3036         iface = ctdb_find_iface(ctdb, info->name);
3037         if (iface == NULL) {
3038                 return -1;
3039         }
3040
3041         if (link_up == iface->link_up) {
3042                 return 0;
3043         }
3044
3045         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3046               ("iface[%s] has changed it's link status %s => %s\n",
3047                iface->name,
3048                iface->link_up?"up":"down",
3049                link_up?"up":"down"));
3050
3051         iface->link_up = link_up;
3052         return 0;
3053 }
3054
3055
3056 /* 
3057    structure containing the listening socket and the list of tcp connections
3058    that the ctdb daemon is to kill
3059 */
3060 struct ctdb_kill_tcp {
3061         struct ctdb_vnn *vnn;
3062         struct ctdb_context *ctdb;
3063         int capture_fd;
3064         struct fd_event *fde;
3065         trbt_tree_t *connections;
3066         void *private_data;
3067 };
3068
3069 /*
3070   a tcp connection that is to be killed
3071  */
3072 struct ctdb_killtcp_con {
3073         ctdb_sock_addr src_addr;
3074         ctdb_sock_addr dst_addr;
3075         int count;
3076         struct ctdb_kill_tcp *killtcp;
3077 };
3078
3079 /* this function is used to create a key to represent this socketpair
3080    in the killtcp tree.
3081    this key is used to insert and lookup matching socketpairs that are
3082    to be tickled and RST
3083 */
3084 #define KILLTCP_KEYLEN  10
3085 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3086 {
3087         static uint32_t key[KILLTCP_KEYLEN];
3088
3089         bzero(key, sizeof(key));
3090
3091         if (src->sa.sa_family != dst->sa.sa_family) {
3092                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3093                 return key;
3094         }
3095         
3096         switch (src->sa.sa_family) {
3097         case AF_INET:
3098                 key[0]  = dst->ip.sin_addr.s_addr;
3099                 key[1]  = src->ip.sin_addr.s_addr;
3100                 key[2]  = dst->ip.sin_port;
3101                 key[3]  = src->ip.sin_port;
3102                 break;
3103         case AF_INET6: {
3104                 uint32_t *dst6_addr32 =
3105                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3106                 uint32_t *src6_addr32 =
3107                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3108                 key[0]  = dst6_addr32[3];
3109                 key[1]  = src6_addr32[3];
3110                 key[2]  = dst6_addr32[2];
3111                 key[3]  = src6_addr32[2];
3112                 key[4]  = dst6_addr32[1];
3113                 key[5]  = src6_addr32[1];
3114                 key[6]  = dst6_addr32[0];
3115                 key[7]  = src6_addr32[0];
3116                 key[8]  = dst->ip6.sin6_port;
3117                 key[9]  = src->ip6.sin6_port;
3118                 break;
3119         }
3120         default:
3121                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3122                 return key;
3123         }
3124
3125         return key;
3126 }
3127
3128 /*
3129   called when we get a read event on the raw socket
3130  */
3131 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3132                                 uint16_t flags, void *private_data)
3133 {
3134         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3135         struct ctdb_killtcp_con *con;
3136         ctdb_sock_addr src, dst;
3137         uint32_t ack_seq, seq;
3138
3139         if (!(flags & EVENT_FD_READ)) {
3140                 return;
3141         }
3142
3143         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3144                                 killtcp->private_data,
3145                                 &src, &dst,
3146                                 &ack_seq, &seq) != 0) {
3147                 /* probably a non-tcp ACK packet */
3148                 return;
3149         }
3150
3151         /* check if we have this guy in our list of connections
3152            to kill
3153         */
3154         con = trbt_lookuparray32(killtcp->connections, 
3155                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3156         if (con == NULL) {
3157                 /* no this was some other packet we can just ignore */
3158                 return;
3159         }
3160
3161         /* This one has been tickled !
3162            now reset him and remove him from the list.
3163          */
3164         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3165                 ntohs(con->dst_addr.ip.sin_port),
3166                 ctdb_addr_to_str(&con->src_addr),
3167                 ntohs(con->src_addr.ip.sin_port)));
3168
3169         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3170         talloc_free(con);
3171 }
3172
3173
3174 /* when traversing the list of all tcp connections to send tickle acks to
3175    (so that we can capture the ack coming back and kill the connection
3176     by a RST)
3177    this callback is called for each connection we are currently trying to kill
3178 */
3179 static int tickle_connection_traverse(void *param, void *data)
3180 {
3181         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3182
3183         /* have tried too many times, just give up */
3184         if (con->count >= 5) {
3185                 /* can't delete in traverse: reparent to delete_cons */
3186                 talloc_steal(param, con);
3187                 return 0;
3188         }
3189
3190         /* othervise, try tickling it again */
3191         con->count++;
3192         ctdb_sys_send_tcp(
3193                 (ctdb_sock_addr *)&con->dst_addr,
3194                 (ctdb_sock_addr *)&con->src_addr,
3195                 0, 0, 0);
3196         return 0;
3197 }
3198
3199
3200 /* 
3201    called every second until all sentenced connections have been reset
3202  */
3203 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3204                                               struct timeval t, void *private_data)
3205 {
3206         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3207         void *delete_cons = talloc_new(NULL);
3208
3209         /* loop over all connections sending tickle ACKs */
3210         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3211
3212         /* now we've finished traverse, it's safe to do deletion. */
3213         talloc_free(delete_cons);
3214
3215         /* If there are no more connections to kill we can remove the
3216            entire killtcp structure
3217          */
3218         if ( (killtcp->connections == NULL) || 
3219              (killtcp->connections->root == NULL) ) {
3220                 talloc_free(killtcp);
3221                 return;
3222         }
3223
3224         /* try tickling them again in a seconds time
3225          */
3226         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3227                         ctdb_tickle_sentenced_connections, killtcp);
3228 }
3229
3230 /*
3231   destroy the killtcp structure
3232  */
3233 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3234 {
3235         struct ctdb_vnn *tmpvnn;
3236
3237         /* verify that this vnn is still active */
3238         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3239                 if (tmpvnn == killtcp->vnn) {
3240                         break;
3241                 }
3242         }
3243
3244         if (tmpvnn == NULL) {
3245                 return 0;
3246         }
3247
3248         if (killtcp->vnn->killtcp != killtcp) {
3249                 return 0;
3250         }
3251
3252         killtcp->vnn->killtcp = NULL;
3253
3254         return 0;
3255 }
3256
3257
/* insert callback for the killtcp connection tree.

   Whatever was stored under the key before ('data') is ignored and the
   new connection structure ('parm') is returned as the value to use.

   The old entry is deliberately not freed here: according to the original
   author it is talloc_stolen by the same tree node and is deleted
   together with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3269
3270 /*
3271   add a tcp socket to the list of connections we want to RST
3272  */
3273 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3274                                        ctdb_sock_addr *s,
3275                                        ctdb_sock_addr *d)
3276 {
3277         ctdb_sock_addr src, dst;
3278         struct ctdb_kill_tcp *killtcp;
3279         struct ctdb_killtcp_con *con;
3280         struct ctdb_vnn *vnn;
3281
3282         ctdb_canonicalize_ip(s, &src);
3283         ctdb_canonicalize_ip(d, &dst);
3284
3285         vnn = find_public_ip_vnn(ctdb, &dst);
3286         if (vnn == NULL) {
3287                 vnn = find_public_ip_vnn(ctdb, &src);
3288         }
3289         if (vnn == NULL) {
3290                 /* if it is not a public ip   it could be our 'single ip' */
3291                 if (ctdb->single_ip_vnn) {
3292                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3293                                 vnn = ctdb->single_ip_vnn;
3294                         }
3295                 }
3296         }
3297         if (vnn == NULL) {
3298                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3299                 return -1;
3300         }
3301
3302         killtcp = vnn->killtcp;
3303         
3304         /* If this is the first connection to kill we must allocate
3305            a new structure
3306          */
3307         if (killtcp == NULL) {
3308                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3309                 CTDB_NO_MEMORY(ctdb, killtcp);
3310
3311                 killtcp->vnn         = vnn;
3312                 killtcp->ctdb        = ctdb;
3313                 killtcp->capture_fd  = -1;
3314                 killtcp->connections = trbt_create(killtcp, 0);
3315
3316                 vnn->killtcp         = killtcp;
3317                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3318         }
3319
3320
3321
3322         /* create a structure that describes this connection we want to
3323            RST and store it in killtcp->connections
3324         */
3325         con = talloc(killtcp, struct ctdb_killtcp_con);
3326         CTDB_NO_MEMORY(ctdb, con);
3327         con->src_addr = src;
3328         con->dst_addr = dst;
3329         con->count    = 0;
3330         con->killtcp  = killtcp;
3331
3332
3333         trbt_insertarray32_callback(killtcp->connections,
3334                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3335                         add_killtcp_callback, con);
3336
3337         /* 
3338            If we dont have a socket to listen on yet we must create it
3339          */
3340         if (killtcp->capture_fd == -1) {
3341                 const char *iface = ctdb_vnn_iface_string(vnn);
3342                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3343                 if (killtcp->capture_fd == -1) {
3344                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3345                                           "socket on iface '%s' for killtcp (%s)\n",
3346                                           iface, strerror(errno)));
3347                         goto failed;
3348                 }
3349         }
3350
3351
3352         if (killtcp->fde == NULL) {
3353                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3354                                             EVENT_FD_READ,
3355                                             capture_tcp_handler, killtcp);
3356                 tevent_fd_set_auto_close(killtcp->fde);
3357
3358                 /* We also need to set up some events to tickle all these connections
3359                    until they are all reset
3360                 */
3361                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3362                                 ctdb_tickle_sentenced_connections, killtcp);
3363         }
3364
3365         /* tickle him once now */
3366         ctdb_sys_send_tcp(
3367                 &con->dst_addr,
3368                 &con->src_addr,
3369                 0, 0, 0);
3370
3371         return 0;
3372
3373 failed:
3374         talloc_free(vnn->killtcp);
3375         vnn->killtcp = NULL;
3376         return -1;
3377 }
3378
3379 /*
3380   kill a TCP connection.
3381  */
3382 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3383 {
3384         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3385
3386         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3387 }
3388
3389 /*
3390   called by a daemon to inform us of the entire list of TCP tickles for
3391   a particular public address.
3392   this control should only be sent by the node that is currently serving
3393   that public address.
3394  */
3395 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3396 {
3397         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3398         struct ctdb_tcp_array *tcparray;
3399         struct ctdb_vnn *vnn;
3400
3401         /* We must at least have tickles.num or else we cant verify the size
3402            of the received data blob
3403          */
3404         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3405                                         tickles.connections)) {
3406                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3407                 return -1;
3408         }
3409
3410         /* verify that the size of data matches what we expect */
3411         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3412                                 tickles.connections)
3413                          + sizeof(struct ctdb_tcp_connection)
3414                                  * list->tickles.num) {
3415                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3416                 return -1;
3417         }       
3418
3419         vnn = find_public_ip_vnn(ctdb, &list->addr);
3420         if (vnn == NULL) {
3421                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3422                         ctdb_addr_to_str(&list->addr)));
3423
3424                 return 1;
3425         }
3426
3427         /* remove any old ticklelist we might have */
3428         talloc_free(vnn->tcp_array);
3429         vnn->tcp_array = NULL;
3430
3431         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3432         CTDB_NO_MEMORY(ctdb, tcparray);
3433
3434         tcparray->num = list->tickles.num;
3435
3436         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3437         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3438
3439         memcpy(tcparray->connections, &list->tickles.connections[0], 
3440                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3441
3442         /* We now have a new fresh tickle list array for this vnn */
3443         vnn->tcp_array = talloc_steal(vnn, tcparray);
3444         
3445         return 0;
3446 }
3447
3448 /*
3449   called to return the full list of tickles for the puclic address associated 
3450   with the provided vnn
3451  */
3452 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3453 {
3454         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3455         struct ctdb_control_tcp_tickle_list *list;
3456         struct ctdb_tcp_array *tcparray;
3457         int num;
3458         struct ctdb_vnn *vnn;
3459
3460         vnn = find_public_ip_vnn(ctdb, addr);
3461         if (vnn == NULL) {
3462                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3463                         ctdb_addr_to_str(addr)));
3464
3465                 return 1;
3466         }
3467
3468         tcparray = vnn->tcp_array;
3469         if (tcparray) {
3470                 num = tcparray->num;
3471         } else {
3472                 num = 0;
3473         }
3474
3475         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3476                                 tickles.connections)
3477                         + sizeof(struct ctdb_tcp_connection) * num;
3478
3479         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3480         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3481         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3482
3483         list->addr = *addr;
3484         list->tickles.num = num;
3485         if (num) {
3486                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3487                         sizeof(struct ctdb_tcp_connection) * num);
3488         }
3489
3490         return 0;
3491 }
3492
3493
3494 /*
3495   set the list of all tcp tickles for a public address
3496  */
3497 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3498                               struct timeval timeout, uint32_t destnode, 
3499                               ctdb_sock_addr *addr,
3500                               struct ctdb_tcp_array *tcparray)
3501 {
3502         int ret, num;
3503         TDB_DATA data;
3504         struct ctdb_control_tcp_tickle_list *list;
3505
3506         if (tcparray) {
3507                 num = tcparray->num;
3508         } else {
3509                 num = 0;
3510         }
3511
3512         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3513                                 tickles.connections) +
3514                         sizeof(struct ctdb_tcp_connection) * num;
3515         data.dptr = talloc_size(ctdb, data.dsize);
3516         CTDB_NO_MEMORY(ctdb, data.dptr);
3517
3518         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3519         list->addr = *addr;
3520         list->tickles.num = num;
3521         if (tcparray) {
3522                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3523         }
3524
3525         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3526                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3527                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3528         if (ret != 0) {
3529                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3530                 return -1;
3531         }
3532
3533         talloc_free(data.dptr);
3534
3535         return ret;
3536 }
3537
3538
3539 /*
3540   perform tickle updates if required
3541  */
3542 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3543                                 struct timed_event *te, 
3544                                 struct timeval t, void *private_data)
3545 {
3546         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3547         int ret;
3548         struct ctdb_vnn *vnn;
3549
3550         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3551                 /* we only send out updates for public addresses that 
3552                    we have taken over
3553                  */
3554                 if (ctdb->pnn != vnn->pnn) {
3555                         continue;
3556                 }
3557                 /* We only send out the updates if we need to */
3558                 if (!vnn->tcp_update_needed) {
3559                         continue;
3560                 }
3561                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3562                                 TAKEOVER_TIMEOUT(),
3563                                 CTDB_BROADCAST_CONNECTED,
3564                                 &vnn->public_address,
3565                                 vnn->tcp_array);
3566                 if (ret != 0) {
3567                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3568                                 ctdb_addr_to_str(&vnn->public_address)));
3569                 }
3570         }
3571
3572         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3573                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3574                              ctdb_update_tcp_tickles, ctdb);
3575 }               
3576         
3577
3578 /*
3579   start periodic update of tcp tickles
3580  */
3581 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3582 {
3583         ctdb->tickle_update_context = talloc_new(ctdb);
3584
3585         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3586                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3587                              ctdb_update_tcp_tickles, ctdb);
3588 }
3589
3590
3591
3592
3593 struct control_gratious_arp {
3594         struct ctdb_context *ctdb;
3595         ctdb_sock_addr addr;
3596         const char *iface;
3597         int count;
3598 };
3599
3600 /*
3601   send a control_gratuitous arp
3602  */
3603 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3604                                   struct timeval t, void *private_data)
3605 {
3606         int ret;
3607         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3608                                                         struct control_gratious_arp);
3609
3610         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3611         if (ret != 0) {
3612                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3613                                  arp->iface, strerror(errno)));
3614         }
3615
3616
3617         arp->count++;
3618         if (arp->count == CTDB_ARP_REPEAT) {
3619                 talloc_free(arp);
3620                 return;
3621         }
3622
3623         event_add_timed(arp->ctdb->ev, arp, 
3624                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3625                         send_gratious_arp, arp);
3626 }
3627
3628
3629 /*
3630   send a gratious arp 
3631  */
3632 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3633 {
3634         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3635         struct control_gratious_arp *arp;
3636
3637         /* verify the size of indata */
3638         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3639                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3640                                  (unsigned)indata.dsize, 
3641                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3642                 return -1;
3643         }
3644         if (indata.dsize != 
3645                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3646                 + gratious_arp->len ) ){
3647
3648                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3649                         "but should be %u bytes\n", 
3650                          (unsigned)indata.dsize, 
3651                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3652                 return -1;
3653         }
3654
3655
3656         arp = talloc(ctdb, struct control_gratious_arp);
3657         CTDB_NO_MEMORY(ctdb, arp);
3658
3659         arp->ctdb  = ctdb;
3660         arp->addr   = gratious_arp->addr;
3661         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3662         CTDB_NO_MEMORY(ctdb, arp->iface);
3663         arp->count = 0;
3664         
3665         event_add_timed(arp->ctdb->ev, arp, 
3666                         timeval_zero(), send_gratious_arp, arp);
3667
3668         return 0;
3669 }
3670
3671 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3672 {
3673         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3674         int ret;
3675
3676         /* verify the size of indata */
3677         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3678                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3679                 return -1;
3680         }
3681         if (indata.dsize != 
3682                 ( offsetof(struct ctdb_control_ip_iface, iface)
3683                 + pub->len ) ){
3684
3685                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3686                         "but should be %u bytes\n", 
3687                          (unsigned)indata.dsize, 
3688                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3689                 return -1;
3690         }
3691
3692         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3693
3694         if (ret != 0) {
3695                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3696                 return -1;
3697         }
3698
3699         return 0;
3700 }
3701
/*
  completion callback of the releaseip event started by
  ctdb_control_del_public_address()

  private_data is the talloc context created for the deletion; freeing
  it releases everything that was hung off it.  The event status is
  ignored.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *deletion_ctx = private_data;

	talloc_free(deletion_ctx);
}
3710
3711 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3712 {
3713         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3714         struct ctdb_vnn *vnn;
3715         int ret;
3716
3717         /* verify the size of indata */
3718         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3719                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3720                 return -1;
3721         }
3722         if (indata.dsize != 
3723                 ( offsetof(struct ctdb_control_ip_iface, iface)
3724                 + pub->len ) ){
3725
3726                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3727                         "but should be %u bytes\n", 
3728                          (unsigned)indata.dsize, 
3729                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3730                 return -1;
3731         }
3732
3733         /* walk over all public addresses until we find a match */
3734         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3735                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3736                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3737
3738                         DLIST_REMOVE(ctdb->vnn, vnn);
3739                         talloc_steal(mem_ctx, vnn);
3740                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3741                         if (vnn->pnn != ctdb->pnn) {
3742                                 if (vnn->iface != NULL) {
3743                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3744                                 }
3745                                 talloc_free(mem_ctx);
3746                                 return 0;
3747                         }
3748                         vnn->pnn = -1;
3749
3750                         ret = ctdb_event_script_callback(ctdb, 
3751                                          mem_ctx, delete_ip_callback, mem_ctx,
3752                                          false,
3753                                          CTDB_EVENT_RELEASE_IP,
3754                                          "%s %s %u",
3755                                          ctdb_vnn_iface_string(vnn),
3756                                          ctdb_addr_to_str(&vnn->public_address),
3757                                          vnn->public_netmask_bits);
3758                         if (vnn->iface != NULL) {
3759                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3760                         }
3761                         if (ret != 0) {
3762                                 return -1;
3763                         }
3764                         return 0;
3765                 }
3766         }
3767
3768         return -1;
3769 }
3770
3771 /* This function is called from the recovery daemon to verify that a remote
3772    node has the expected ip allocation.
3773    This is verified against ctdb->ip_tree
3774 */
3775 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3776 {
3777         struct ctdb_public_ip_list *tmp_ip; 
3778         int i;
3779
3780         if (ctdb->ip_tree == NULL) {
3781                 /* dont know the expected allocation yet, assume remote node
3782                    is correct. */
3783                 return 0;
3784         }
3785
3786         if (ips == NULL) {
3787                 return 0;
3788         }
3789
3790         for (i=0; i<ips->num; i++) {
3791                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3792                 if (tmp_ip == NULL) {
3793                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3794                         return -1;
3795                 }
3796
3797                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3798                         continue;
3799                 }
3800
3801                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3802                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3803                         return -1;
3804                 }
3805         }
3806
3807         return 0;
3808 }
3809
3810 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3811 {
3812         struct ctdb_public_ip_list *tmp_ip; 
3813
3814         if (ctdb->ip_tree == NULL) {
3815                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3816                 return -1;
3817         }
3818
3819         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3820         if (tmp_ip == NULL) {
3821                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3822                 return -1;
3823         }
3824
3825         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3826         tmp_ip->pnn = ip->pnn;
3827
3828         return 0;
3829 }
3830
3831
/*
  state of one reloadips run
 */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;

	/* pending control request; it is answered with 'status' when the
	   handle is destroyed */
	struct ctdb_req_control *c;

	/* result reported back to the requester */
	int status;

	/* fd[0] is where the result byte of the child is read from;
	   presumably a pipe to the child (NOTE(review): confirm) */
	int fd[2];

	/* pid of the child process, killed when the handle is destroyed */
	pid_t child;

	/* presumably the fd event watching fd[0] (NOTE(review): confirm) */
	struct fd_event *fde;
};
3840
3841 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3842 {
3843         if (h == h->ctdb->reload_ips) {
3844                 h->ctdb->reload_ips = NULL;
3845         }
3846         if (h->c != NULL) {
3847                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3848                 h->c = NULL;
3849         }
3850         ctdb_kill(h->ctdb, h->child, SIGKILL);
3851         return 0;
3852 }
3853
3854 static void ctdb_reloadips_timeout_event(struct event_context *ev,
3855                                 struct timed_event *te,
3856                                 struct timeval t, void *private_data)
3857 {
3858         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3859
3860         talloc_free(h);
3861 }       
3862
3863 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
3864                              uint16_t flags, void *private_data)
3865 {
3866         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3867
3868         char res;
3869         int ret;
3870
3871         ret = read(h->fd[0], &res, 1);
3872         if (ret < 1 || res != 0) {
3873                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3874                 res = 1;
3875         }
3876         h->status = res;
3877
3878         talloc_free(h);
3879 }
3880
3881 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3882 {
3883         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3884         struct ctdb_all_public_ips *ips;
3885         struct ctdb_vnn *vnn;
3886         int i, ret;
3887
3888         /* read the ip allocation from the local node */
3889         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
3890         if (ret != 0) {
3891                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
3892                 talloc_free(mem_ctx);
3893                 return -1;
3894         }
3895
3896         /* re-read the public ips file */
3897         ctdb->vnn = NULL;
3898         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3899                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3900                 talloc_free(mem_ctx);
3901                 return -1;
3902         }               
3903
3904
3905         /* check the previous list of ips and scan for ips that have been
3906            dropped.
3907          */
3908         for (i = 0; i < ips->num; i++) {
3909                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3910                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3911                                 break;
3912                         }
3913                 }
3914
3915                 /* we need to delete this ip, no longer available on this node */
3916                 if (vnn == NULL) {
3917                         struct ctdb_control_ip_iface pub;
3918
3919                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3920                         pub.addr  = ips->ips[i].addr;
3921                         pub.mask  = 0;
3922                         pub.len   = 0;
3923
3924                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3925                         if (ret != 0) {
3926                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3927                                 return -1;
3928                         }
3929                 }
3930         }
3931
3932
3933         /* loop over all new ones and check the ones we need to add */
3934         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3935                 for (i = 0; i < ips->num; i++) {
3936                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3937                                 break;
3938                         }
3939                 }
3940                 if (i == ips->num) {
3941                         struct ctdb_control_ip_iface pub;
3942                         const char *ifaces = NULL;
3943                         int iface = 0;
3944
3945                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
3946
3947                         pub.addr  = vnn->public_address;
3948                         pub.mask  = vnn->public_netmask_bits;
3949
3950
3951                         ifaces = vnn->ifaces[0];
3952                         iface = 1;
3953                         while (vnn->ifaces[iface] != NULL) {
3954                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
3955                                 iface++;
3956                         }
3957                         pub.len   = strlen(ifaces)+1;
3958                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
3959
3960                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3961                         if (ret != 0) {
3962                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
3963                                 return -1;
3964                         }
3965                 }
3966         }
3967
3968         return 0;
3969 }
3970
3971 /* This control is sent to force the node to re-read the public addresses file
3972    and drop any addresses we should nnot longer host, and add new addresses
3973    that we are now able to host
3974 */
3975 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
3976 {
3977         struct ctdb_reloadips_handle *h;
3978         pid_t parent = getpid();
3979
3980         if (ctdb->reload_ips != NULL) {
3981                 talloc_free(ctdb->reload_ips);
3982                 ctdb->reload_ips = NULL;
3983         }
3984
3985         h = talloc(ctdb, struct ctdb_reloadips_handle);
3986         CTDB_NO_MEMORY(ctdb, h);
3987         h->ctdb     = ctdb;
3988         h->c        = NULL;
3989         h->status   = -1;
3990         
3991         if (pipe(h->fd) == -1) {
3992                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3993                 talloc_free(h);
3994                 return -1;
3995         }
3996
3997         h->child = ctdb_fork(ctdb);
3998         if (h->child == (pid_t)-1) {
3999                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4000                 close(h->fd[0]);
4001                 close(h->fd[1]);
4002                 talloc_free(h);
4003                 return -1;
4004         }
4005
4006         /* child process */
4007         if (h->child == 0) {
4008                 signed char res = 0;
4009
4010                 close(h->fd[0]);
4011                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4012
4013                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4014                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4015                         res = -1;
4016                 } else {
4017                         res = ctdb_reloadips_child(ctdb);
4018                         if (res != 0) {
4019                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4020                         }
4021                 }
4022
4023                 write(h->fd[1], &res, 1);
4024                 /* make sure we die when our parent dies */
4025                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4026                         sleep(5);
4027                 }
4028                 _exit(0);
4029         }
4030
4031         h->c             = talloc_steal(h, c);
4032
4033         close(h->fd[1]);
4034         set_close_on_exec(h->fd[0]);
4035
4036         talloc_set_destructor(h, ctdb_reloadips_destructor);
4037
4038
4039         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4040                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4041                         (void *)h);
4042         tevent_fd_set_auto_close(h->fde);
4043
4044         event_add_timed(ctdb->ev, h,
4045                         timeval_current_ofs(120, 0),
4046                         ctdb_reloadips_timeout_event, h);
4047
4048         /* we reply later */
4049         *async_reply = true;
4050         return 0;
4051 }