recoverd: Refactor code to get NoIPTakeover tunable from all nodes
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout built from the takeover_timeout tunable.  Expands to a use of
 * a variable named "ctdb", which must be in scope at the call site.
 * NOTE(review): presumably "seconds from now" - confirm against
 * timeval_current_ofs(). */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument of the timeval_current_ofs() call that re-arms
 * ctdb_control_send_arp() for the next round. */
#define CTDB_ARP_INTERVAL 1
/* Number of rounds ctdb_control_send_arp() runs before freeing its state. */
#define CTDB_ARP_REPEAT   3
35
/*
  one local network interface, kept on the ctdb->ifaces list
 */
struct ctdb_iface {
        struct ctdb_iface *prev, *next; /* list linkage (ctdb->ifaces) */
        const char *name;               /* interface name, talloc child of this struct */
        bool link_up;                   /* only interfaces with link_up set are picked for IPs */
        uint32_t references;            /* number of vnns currently assigned to this interface */
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
/*
  state for one series of gratuitous arps / tcp tickles, created by
  ctdb_announce_vnn_iface() and consumed by ctdb_control_send_arp()
 */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        uint32_t count;                  /* rounds sent so far */
        ctdb_sock_addr addr;             /* copy of the vnn's public address */
        struct ctdb_tcp_array *tcparray; /* connections to tickle, taken over from vnn->tcp_array */
        struct ctdb_vnn *vnn;            /* vnn being announced */
};
285
286
/*
  lists of tcp endpoints
  (doubly linked via prev/next, one connection per element)
 */
struct ctdb_tcp_list {
        struct ctdb_tcp_list *prev, *next;
        struct ctdb_tcp_connection connection;
};
294
/*
  list of clients to kill on IP release
  (elements live on ctdb->client_ip_list, see release_kill_clients())
 */
struct ctdb_client_ip {
        struct ctdb_client_ip *prev, *next;
        struct ctdb_context *ctdb;
        ctdb_sock_addr addr;    /* address compared against the IP being released */
        uint32_t client_id;     /* id used to look up the struct ctdb_client */
};
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
/*
  private data handed to release_ip_callback()
 */
struct takeover_callback_state {
        struct ctdb_req_control *c;     /* NOTE(review): presumably the control to reply to - confirm */
        ctdb_sock_addr *addr;
        struct ctdb_vnn *vnn;
};
400
/*
  private data for a takeip event in flight, see ctdb_do_takeip()
 */
struct ctdb_do_takeip_state {
        struct ctdb_req_control *c;     /* control answered from ctdb_do_takeip_callback() */
        struct ctdb_vnn *vnn;           /* vnn being taken over */
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/*
  private data for an updateip event in flight, see ctdb_do_updateip()
 */
struct ctdb_do_updateip_state {
        struct ctdb_req_control *c;     /* control answered from ctdb_do_updateip_callback() */
        struct ctdb_iface *old;         /* interface the address is moving away from */
        struct ctdb_vnn *vnn;           /* vnn being moved */
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Check if this is a public ip known to the node, i.e. can that
1303    node takeover this ip ?
1304 */
1305 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                 struct ctdb_public_ip_list *ip)
1307 {
1308         struct ctdb_all_public_ips *public_ips;
1309         int i;
1310
1311         public_ips = ctdb->nodes[pnn]->available_public_ips;
1312
1313         if (public_ips == NULL) {
1314                 return -1;
1315         }
1316
1317         for (i=0;i<public_ips->num;i++) {
1318                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1319                         /* yes, this node can serve this public ip */
1320                         return 0;
1321                 }
1322         }
1323
1324         return -1;
1325 }
1326
1327
1328 /* search the node lists list for a node to takeover this ip.
1329    pick the node that currently are serving the least number of ips
1330    so that the ips get spread out evenly.
1331 */
1332 static int find_takeover_node(struct ctdb_context *ctdb, 
1333                 struct ctdb_node_map *nodemap, uint32_t mask, 
1334                 struct ctdb_public_ip_list *ip,
1335                 struct ctdb_public_ip_list *all_ips)
1336 {
1337         int pnn, min=0, num;
1338         int i;
1339
1340         pnn    = -1;
1341         for (i=0;i<nodemap->num;i++) {
1342                 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1343                         /* This node is not allowed to takeover any addresses
1344                         */
1345                         continue;
1346                 }
1347
1348                 if (nodemap->nodes[i].flags & mask) {
1349                         /* This node is not healty and can not be used to serve
1350                            a public address 
1351                         */
1352                         continue;
1353                 }
1354
1355                 /* verify that this node can serve this ip */
1356                 if (can_node_serve_ip(ctdb, i, ip)) {
1357                         /* no it couldnt   so skip to the next node */
1358                         continue;
1359                 }
1360
1361                 num = node_ip_coverage(ctdb, i, all_ips);
1362                 /* was this the first node we checked ? */
1363                 if (pnn == -1) {
1364                         pnn = i;
1365                         min  = num;
1366                 } else {
1367                         if (num < min) {
1368                                 pnn = i;
1369                                 min  = num;
1370                         }
1371                 }
1372         }       
1373         if (pnn == -1) {
1374                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1375                         ctdb_addr_to_str(&ip->addr)));
1376
1377                 return -1;
1378         }
1379
1380         ip->pnn = pnn;
1381         return 0;
1382 }
1383
1384 #define IP_KEYLEN       4
1385 static uint32_t *ip_key(ctdb_sock_addr *ip)
1386 {
1387         static uint32_t key[IP_KEYLEN];
1388
1389         bzero(key, sizeof(key));
1390
1391         switch (ip->sa.sa_family) {
1392         case AF_INET:
1393                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1394                 break;
1395         case AF_INET6: {
1396                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1397                 key[0]  = htonl(s6_a32[0]);
1398                 key[1]  = htonl(s6_a32[1]);
1399                 key[2]  = htonl(s6_a32[2]);
1400                 key[3]  = htonl(s6_a32[3]);
1401                 break;
1402         }
1403         default:
1404                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1405                 return key;
1406         }
1407
1408         return key;
1409 }
1410
1411 static void *add_ip_callback(void *parm, void *data)
1412 {
1413         struct ctdb_public_ip_list *this_ip = parm; 
1414         struct ctdb_public_ip_list *prev_ip = data; 
1415
1416         if (prev_ip == NULL) {
1417                 return parm;
1418         }
1419         if (this_ip->pnn == -1) {
1420                 this_ip->pnn = prev_ip->pnn;
1421         }
1422
1423         return parm;
1424 }
1425
1426 static int getips_count_callback(void *param, void *data)
1427 {
1428         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1429         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1430
1431         new_ip->next = *ip_list;
1432         *ip_list     = new_ip;
1433         return 0;
1434 }
1435
1436 static struct ctdb_public_ip_list *
1437 create_merged_ip_list(struct ctdb_context *ctdb)
1438 {
1439         int i, j;
1440         struct ctdb_public_ip_list *ip_list;
1441         struct ctdb_all_public_ips *public_ips;
1442
1443         if (ctdb->ip_tree != NULL) {
1444                 talloc_free(ctdb->ip_tree);
1445                 ctdb->ip_tree = NULL;
1446         }
1447         ctdb->ip_tree = trbt_create(ctdb, 0);
1448
1449         for (i=0;i<ctdb->num_nodes;i++) {
1450                 public_ips = ctdb->nodes[i]->known_public_ips;
1451
1452                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1453                         continue;
1454                 }
1455
1456                 /* there were no public ips for this node */
1457                 if (public_ips == NULL) {
1458                         continue;
1459                 }               
1460
1461                 for (j=0;j<public_ips->num;j++) {
1462                         struct ctdb_public_ip_list *tmp_ip; 
1463
1464                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1465                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1466                         /* Do not use information about IP addresses hosted
1467                          * on other nodes, it may not be accurate */
1468                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1469                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1470                         } else {
1471                                 tmp_ip->pnn = -1;
1472                         }
1473                         tmp_ip->addr = public_ips->ips[j].addr;
1474                         tmp_ip->next = NULL;
1475
1476                         trbt_insertarray32_callback(ctdb->ip_tree,
1477                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1478                                 add_ip_callback,
1479                                 tmp_ip);
1480                 }
1481         }
1482
1483         ip_list = NULL;
1484         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1485
1486         return ip_list;
1487 }
1488
1489 /* 
1490  * This is the length of the longtest common prefix between the IPs.
1491  * It is calculated by XOR-ing the 2 IPs together and counting the
1492  * number of leading zeroes.  The implementation means that all
1493  * addresses end up being 128 bits long.
1494  *
1495  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1496  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1497  * lots of nodes and IP addresses?
1498  */
1499 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1500 {
1501         uint32_t ip1_k[IP_KEYLEN];
1502         uint32_t *t;
1503         int i;
1504         uint32_t x;
1505
1506         uint32_t distance = 0;
1507
1508         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1509         t = ip_key(ip2);
1510         for (i=0; i<IP_KEYLEN; i++) {
1511                 x = ip1_k[i] ^ t[i];
1512                 if (x == 0) {
1513                         distance += 32;
1514                 } else {
1515                         /* Count number of leading zeroes. 
1516                          * FIXME? This could be optimised...
1517                          */
1518                         while ((x & (1 << 31)) == 0) {
1519                                 x <<= 1;
1520                                 distance += 1;
1521                         }
1522                 }
1523         }
1524
1525         return distance;
1526 }
1527
1528 /* Calculate the IP distance for the given IP relative to IPs on the
1529    given node.  The ips argument is generally the all_ips variable
1530    used in the main part of the algorithm.
1531  */
1532 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1533                                   struct ctdb_public_ip_list *ips,
1534                                   int pnn)
1535 {
1536         struct ctdb_public_ip_list *t;
1537         uint32_t d;
1538
1539         uint32_t sum = 0;
1540
1541         for (t=ips; t != NULL; t=t->next) {
1542                 if (t->pnn != pnn) {
1543                         continue;
1544                 }
1545
1546                 /* Optimisation: We never calculate the distance
1547                  * between an address and itself.  This allows us to
1548                  * calculate the effect of removing an address from a
1549                  * node by simply calculating the distance between
1550                  * that address and all of the exitsing addresses.
1551                  * Moreover, we assume that we're only ever dealing
1552                  * with addresses from all_ips so we can identify an
1553                  * address via a pointer rather than doing a more
1554                  * expensive address comparison. */
1555                 if (&(t->addr) == ip) {
1556                         continue;
1557                 }
1558
1559                 d = ip_distance(ip, &(t->addr));
1560                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1561         }
1562
1563         return sum;
1564 }
1565
1566 /* Return the LCP2 imbalance metric for addresses currently assigned
1567    to the given node.
1568  */
1569 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1570 {
1571         struct ctdb_public_ip_list *t;
1572
1573         uint32_t imbalance = 0;
1574
1575         for (t=all_ips; t!=NULL; t=t->next) {
1576                 if (t->pnn != pnn) {
1577                         continue;
1578                 }
1579                 /* Pass the rest of the IPs rather than the whole
1580                    all_ips input list.
1581                 */
1582                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1583         }
1584
1585         return imbalance;
1586 }
1587
1588 /* Allocate any unassigned IPs just by looping through the IPs and
1589  * finding the best node for each.
1590  */
1591 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1592                                       struct ctdb_node_map *nodemap,
1593                                       uint32_t mask,
1594                                       struct ctdb_public_ip_list *all_ips)
1595 {
1596         struct ctdb_public_ip_list *tmp_ip;
1597
1598         /* loop over all ip's and find a physical node to cover for 
1599            each unassigned ip.
1600         */
1601         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1602                 if (tmp_ip->pnn == -1) {
1603                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1604                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1605                                         ctdb_addr_to_str(&tmp_ip->addr)));
1606                         }
1607                 }
1608         }
1609 }
1610
1611 /* Basic non-deterministic rebalancing algorithm.
1612  */
1613 static void basic_failback(struct ctdb_context *ctdb,
1614                            struct ctdb_node_map *nodemap,
1615                            uint32_t mask,
1616                            struct ctdb_public_ip_list *all_ips,
1617                            int num_ips)
1618 {
1619         int i;
1620         int maxnode, maxnum, minnode, minnum, num, retries;
1621         struct ctdb_public_ip_list *tmp_ip;
1622
1623         retries = 0;
1624
1625 try_again:
1626         maxnum=0;
1627         minnum=0;
1628
1629         /* for each ip address, loop over all nodes that can serve
1630            this ip and make sure that the difference between the node
1631            serving the most and the node serving the least ip's are
1632            not greater than 1.
1633         */
1634         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1635                 if (tmp_ip->pnn == -1) {
1636                         continue;
1637                 }
1638
1639                 /* Get the highest and lowest number of ips's served by any 
1640                    valid node which can serve this ip.
1641                 */
1642                 maxnode = -1;
1643                 minnode = -1;
1644                 for (i=0;i<nodemap->num;i++) {
1645                         if (nodemap->nodes[i].flags & mask) {
1646                                 continue;
1647                         }
1648
1649                         /* Only check nodes that are allowed to takeover an ip */
1650                         if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1651                                 continue;
1652                         }
1653
1654                         /* only check nodes that can actually serve this ip */
1655                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1656                                 /* no it couldnt   so skip to the next node */
1657                                 continue;
1658                         }
1659
1660                         num = node_ip_coverage(ctdb, i, all_ips);
1661                         if (maxnode == -1) {
1662                                 maxnode = i;
1663                                 maxnum  = num;
1664                         } else {
1665                                 if (num > maxnum) {
1666                                         maxnode = i;
1667                                         maxnum  = num;
1668                                 }
1669                         }
1670                         if (minnode == -1) {
1671                                 minnode = i;
1672                                 minnum  = num;
1673                         } else {
1674                                 if (num < minnum) {
1675                                         minnode = i;
1676                                         minnum  = num;
1677                                 }
1678                         }
1679                 }
1680                 if (maxnode == -1) {
1681                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1682                                 ctdb_addr_to_str(&tmp_ip->addr)));
1683
1684                         continue;
1685                 }
1686
1687                 /* if the spread between the smallest and largest coverage by
1688                    a node is >=2 we steal one of the ips from the node with
1689                    most coverage to even things out a bit.
1690                    try to do this a limited number of times since we dont
1691                    want to spend too much time balancing the ip coverage.
1692                 */
1693                 if ( (maxnum > minnum+1)
1694                      && (retries < (num_ips + 5)) ){
1695                         struct ctdb_public_ip_list *tmp;
1696
1697                         /* Reassign one of maxnode's VNNs */
1698                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1699                                 if (tmp->pnn == maxnode) {
1700                                         (void)find_takeover_node(ctdb, nodemap, mask, tmp, all_ips);
1701                                         retries++;
1702                                         goto try_again;;
1703                                 }
1704                         }
1705                 }
1706         }
1707 }
1708
1709 struct ctdb_rebalancenodes {
1710         struct ctdb_rebalancenodes *next;
1711         uint32_t pnn;
1712 };
1713 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1714
1715
1716 /* set this flag to force the node to be rebalanced even if it just didnt
1717    become healthy again.
1718 */
1719 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1720 {
1721         struct ctdb_rebalancenodes *rebalance;
1722
1723         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1724                 if (rebalance->pnn == pnn) {
1725                         return;
1726                 }
1727         }
1728
1729         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1730         rebalance->pnn = pnn;
1731         rebalance->next = force_rebalance_list;
1732         force_rebalance_list = rebalance;
1733 }
1734
1735 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1736  * that we can unit test it.
1737  */
1738 static void lcp2_init(struct ctdb_context * tmp_ctx,
1739                struct ctdb_node_map * nodemap,
1740                uint32_t mask,
1741                struct ctdb_public_ip_list *all_ips,
1742                uint32_t **lcp2_imbalances,
1743                bool **newly_healthy)
1744 {
1745         int i;
1746         struct ctdb_public_ip_list *tmp_ip;
1747
1748         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1750         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1751         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752
1753         for (i=0;i<nodemap->num;i++) {
1754                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1755                 /* First step: is the node "healthy"? */
1756                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1757         }
1758
1759         /* 2nd step: if a ndoe has IPs assigned then it must have been
1760          * healthy before, so we remove it from consideration... */
1761         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1762                 if (tmp_ip->pnn != -1) {
1763                         (*newly_healthy)[tmp_ip->pnn] = false;
1764                 }
1765         }
1766
1767         /* 3rd step: if a node is forced to re-balance then
1768            we allow failback onto the node */
1769         while (force_rebalance_list != NULL) {
1770                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1771
1772                 if (force_rebalance_list->pnn <= nodemap->num) {
1773                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1774                 }
1775
1776                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1777                 talloc_free(force_rebalance_list);
1778                 force_rebalance_list = next;
1779         }
1780 }
1781
1782 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1783  * the IP/node combination that will cost the least.
1784  */
1785 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1786                               struct ctdb_node_map *nodemap,
1787                               uint32_t mask,
1788                               struct ctdb_public_ip_list *all_ips,
1789                               uint32_t *lcp2_imbalances)
1790 {
1791         struct ctdb_public_ip_list *tmp_ip;
1792         int dstnode;
1793
1794         int minnode;
1795         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1796         struct ctdb_public_ip_list *minip;
1797
1798         bool should_loop = true;
1799         bool have_unassigned = true;
1800
1801         while (have_unassigned && should_loop) {
1802                 should_loop = false;
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1806
1807                 minnode = -1;
1808                 mindsum = 0;
1809                 minip = NULL;
1810
1811                 /* loop over each unassigned ip. */
1812                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1813                         if (tmp_ip->pnn != -1) {
1814                                 continue;
1815                         }
1816
1817                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1818                                 /* Only check nodes that are allowed to takeover an ip */
1819                                 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1820                                         continue;
1821                                 }
1822
1823                                 /* only check nodes that can actually serve this ip */
1824                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1825                                         /* no it couldnt   so skip to the next node */
1826                                         continue;
1827                                 }
1828                                 if (nodemap->nodes[dstnode].flags & mask) {
1829                                         continue;
1830                                 }
1831
1832                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1833                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1834                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1835                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1836                                                    dstnode,
1837                                                    dstimbl - lcp2_imbalances[dstnode]));
1838
1839
1840                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1841                                         minnode = dstnode;
1842                                         minimbl = dstimbl;
1843                                         mindsum = dstdsum;
1844                                         minip = tmp_ip;
1845                                         should_loop = true;
1846                                 }
1847                         }
1848                 }
1849
1850                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1851
1852                 /* If we found one then assign it to the given node. */
1853                 if (minnode != -1) {
1854                         minip->pnn = minnode;
1855                         lcp2_imbalances[minnode] = minimbl;
1856                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1857                                           ctdb_addr_to_str(&(minip->addr)),
1858                                           minnode,
1859                                           mindsum));
1860                 }
1861
1862                 /* There might be a better way but at least this is clear. */
1863                 have_unassigned = false;
1864                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1865                         if (tmp_ip->pnn == -1) {
1866                                 have_unassigned = true;
1867                         }
1868                 }
1869         }
1870
1871         /* We know if we have an unassigned addresses so we might as
1872          * well optimise.
1873          */
1874         if (have_unassigned) {
1875                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876                         if (tmp_ip->pnn == -1) {
1877                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1878                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1879                         }
1880                 }
1881         }
1882 }
1883
1884 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1885  * to move IPs from, determines the best IP/destination node
1886  * combination to move from the source node.
1887  */
1888 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1889                                     struct ctdb_node_map *nodemap,
1890                                     struct ctdb_public_ip_list *all_ips,
1891                                     int srcnode,
1892                                     uint32_t candimbl,
1893                                     uint32_t *lcp2_imbalances,
1894                                     bool *newly_healthy)
1895 {
1896         int dstnode, mindstnode;
1897         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1898         uint32_t minsrcimbl, mindstimbl;
1899         struct ctdb_public_ip_list *minip;
1900         struct ctdb_public_ip_list *tmp_ip;
1901
1902         /* Find an IP and destination node that best reduces imbalance. */
1903         minip = NULL;
1904         minsrcimbl = 0;
1905         mindstnode = -1;
1906         mindstimbl = 0;
1907
1908         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1909         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1910
1911         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1912                 /* Only consider addresses on srcnode. */
1913                 if (tmp_ip->pnn != srcnode) {
1914                         continue;
1915                 }
1916
1917                 /* What is this IP address costing the source node? */
1918                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1919                 srcimbl = candimbl - srcdsum;
1920
1921                 /* Consider this IP address would cost each potential
1922                  * destination node.  Destination nodes are limited to
1923                  * those that are newly healthy, since we don't want
1924                  * to do gratuitous failover of IPs just to make minor
1925                  * balance improvements.
1926                  */
1927                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1928                         if (! newly_healthy[dstnode]) {
1929                                 continue;
1930                         }
1931
1932                         /* Only check nodes that are allowed to takeover an ip */
1933                         if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1934                                 continue;
1935                         }
1936
1937                         /* only check nodes that can actually serve this ip */
1938                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1939                                 /* no it couldnt   so skip to the next node */
1940                                 continue;
1941                         }
1942
1943                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1944                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1945                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1946                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1947                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1948                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1949
1950                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1951                             ((mindstnode == -1) ||                              \
1952                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1953
1954                                 minip = tmp_ip;
1955                                 minsrcimbl = srcimbl;
1956                                 mindstnode = dstnode;
1957                                 mindstimbl = dstimbl;
1958                         }
1959                 }
1960         }
1961         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1962
1963         if (mindstnode != -1) {
1964                 /* We found a move that makes things better... */
1965                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1966                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1967                                   ctdb_addr_to_str(&(minip->addr)),
1968                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1969
1970
1971                 lcp2_imbalances[srcnode] = srcimbl;
1972                 lcp2_imbalances[mindstnode] = mindstimbl;
1973                 minip->pnn = mindstnode;
1974
1975                 return true;
1976         }
1977
1978         return false;
1979         
1980 }
1981
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.  Orders entries by
 * descending imbalance: returns -1 when a's imbalance is larger, 1 when
 * it is smaller and 0 when both are equal.  pnn is not compared.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs->imbalance < rhs->imbalance) -
	       (lhs->imbalance > rhs->imbalance);
}
2000
2001 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2002  * node with the highest LCP2 imbalance, and then determines the best
2003  * IP/destination node combination to move from the source node.
2004  */
2005 static void lcp2_failback(struct ctdb_context *ctdb,
2006                           struct ctdb_node_map *nodemap,
2007                           uint32_t mask,
2008                           struct ctdb_public_ip_list *all_ips,
2009                           uint32_t *lcp2_imbalances,
2010                           bool *newly_healthy)
2011 {
2012         int i, num_newly_healthy;
2013         struct lcp2_imbalance_pnn * lips;
2014         bool again;
2015
2016 try_again:
2017
2018         /* It is only worth continuing if we have suitable target
2019          * nodes to transfer IPs to.  This check is much cheaper than
2020          * continuing on...
2021          */
2022         num_newly_healthy = 0;
2023         for (i = 0; i < nodemap->num; i++) {
2024                 if (newly_healthy[i]) {
2025                         num_newly_healthy++;
2026                 }
2027         }
2028         if (num_newly_healthy == 0) {
2029                 return;
2030         }
2031
2032         /* Put the imbalances and nodes into an array, sort them and
2033          * iterate through candidates.  Usually the 1st one will be
2034          * used, so this doesn't cost much...
2035          */
2036         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2037         for (i = 0; i < nodemap->num; i++) {
2038                 lips[i].imbalance = lcp2_imbalances[i];
2039                 lips[i].pnn = i;
2040         }
2041         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2042               lcp2_cmp_imbalance_pnn);
2043
2044         again = false;
2045         for (i = 0; i < nodemap->num; i++) {
2046                 /* This means that all nodes had 0 or 1 addresses, so
2047                  * can't be imbalanced.
2048                  */
2049                 if (lips[i].imbalance == 0) {
2050                         break;
2051                 }
2052
2053                 if (lcp2_failback_candidate(ctdb,
2054                                             nodemap,
2055                                             all_ips,
2056                                             lips[i].pnn,
2057                                             lips[i].imbalance,
2058                                             lcp2_imbalances,
2059                                             newly_healthy)) {
2060                         again = true;
2061                         break;
2062                 }
2063         }
2064
2065         talloc_free(lips);
2066         if (again) {
2067                 goto try_again;
2068         }
2069 }
2070
2071 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2072                                     struct ctdb_node_map *nodemap,
2073                                     struct ctdb_public_ip_list *all_ips,
2074                                     uint32_t mask)
2075 {
2076         struct ctdb_public_ip_list *tmp_ip;
2077
2078         /* mark all public addresses with a masked node as being served by
2079            node -1
2080         */
2081         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2082                 if (tmp_ip->pnn == -1) {
2083                         continue;
2084                 }
2085                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
2086                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2087                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2088                                            tmp_ip->pnn));
2089                         tmp_ip->pnn = -1;
2090                 }
2091         }
2092
2093         /* verify that the assigned nodes can serve that public ip
2094            and set it to -1 if not
2095         */
2096         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2097                 if (tmp_ip->pnn == -1) {
2098                         continue;
2099                 }
2100                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2101                         /* this node can not serve this ip. */
2102                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2103                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2104                                            tmp_ip->pnn));
2105                         tmp_ip->pnn = -1;
2106                 }
2107         }
2108 }
2109
2110 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2111                                        struct ctdb_node_map *nodemap,
2112                                        struct ctdb_public_ip_list *all_ips,
2113                                        uint32_t mask)
2114 {
2115         struct ctdb_public_ip_list *tmp_ip;
2116         int i;
2117
2118         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2119        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2120         *  always be allocated the same way for a specific set of
2121         *  available/unavailable nodes.
2122         */
2123
2124         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2125                 tmp_ip->pnn = i%nodemap->num;
2126         }
2127
2128         /* IP failback doesn't make sense with deterministic
2129          * IPs, since the modulo step above implicitly fails
2130          * back IPs to their "home" node.
2131          */
2132         if (1 == ctdb->tunable.no_ip_failback) {
2133                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2134         }
2135
2136         unassign_unsuitable_ips(ctdb, nodemap, all_ips, mask);
2137
2138         basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2139
2140         /* No failback here! */
2141 }
2142
2143 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2144                                           struct ctdb_node_map *nodemap,
2145                                           struct ctdb_public_ip_list *all_ips,
2146                                           uint32_t mask)
2147 {
2148         /* This should be pushed down into basic_failback. */
2149         struct ctdb_public_ip_list *tmp_ip;
2150         int num_ips = 0;
2151         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2152                 num_ips++;
2153         }
2154
2155         unassign_unsuitable_ips(ctdb, nodemap, all_ips, mask);
2156
2157         basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2158
2159         /* If we don't want IPs to fail back then don't rebalance IPs. */
2160         if (1 == ctdb->tunable.no_ip_failback) {
2161                 return;
2162         }
2163
2164         /* Now, try to make sure the ip adresses are evenly distributed
2165            across the nodes.
2166         */
2167         basic_failback(ctdb, nodemap, mask, all_ips, num_ips);
2168 }
2169
2170 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2171                           struct ctdb_node_map *nodemap,
2172                           struct ctdb_public_ip_list *all_ips,
2173                           uint32_t mask)
2174 {
2175         uint32_t *lcp2_imbalances;
2176         bool *newly_healthy;
2177
2178         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2179
2180         unassign_unsuitable_ips(ctdb, nodemap, all_ips, mask);
2181
2182         lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2183
2184         lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2185
2186         /* If we don't want IPs to fail back then don't rebalance IPs. */
2187         if (1 == ctdb->tunable.no_ip_failback) {
2188                 goto finished;
2189         }
2190
2191         /* Now, try to make sure the ip adresses are evenly distributed
2192            across the nodes.
2193         */
2194         lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy);
2195
2196 finished:
2197         talloc_free(tmp_ctx);
2198 }
2199
2200 /* The calculation part of the IP allocation algorithm. */
2201 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2202                                    struct ctdb_node_map *nodemap,
2203                                    struct ctdb_public_ip_list **all_ips_p)
2204 {
2205         int i, num_healthy;
2206         uint32_t mask;
2207
2208         /* Count how many completely healthy nodes we have */
2209         num_healthy = 0;
2210         for (i=0;i<nodemap->num;i++) {
2211                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2212                         num_healthy++;
2213                 }
2214         }
2215
2216         /* If we have healthy nodes then we will only consider them
2217            for serving public addresses
2218         */
2219         mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
2220         if ((num_healthy == 0) &&
2221             (ctdb->tunable.no_ip_takeover_on_disabled == 0)) {
2222                 /* We didnt have any completely healthy nodes so
2223                    use "disabled" nodes as a fallback
2224                 */
2225                 mask = NODE_FLAGS_INACTIVE;
2226         }
2227
2228         /* since nodes only know about those public addresses that
2229            can be served by that particular node, no single node has
2230            a full list of all public addresses that exist in the cluster.
2231            Walk over all node structures and create a merged list of
2232            all public addresses that exist in the cluster.
2233
2234            keep the tree of ips around as ctdb->ip_tree
2235         */
2236         *all_ips_p = create_merged_ip_list(ctdb);
2237
2238         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2239                 ip_alloc_lcp2(ctdb, nodemap, *all_ips_p, mask);
2240         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2241                 ip_alloc_deterministic_ips(ctdb, nodemap, *all_ips_p, mask);
2242         } else {
2243                 ip_alloc_nondeterministic_ips(ctdb, nodemap, *all_ips_p, mask);
2244         }
2245
2246         /* at this point ->pnn is the node which will own each IP
2247            or -1 if there is no node that can cover this ip
2248         */
2249
2250         return;
2251 }
2252
2253 struct get_tunable_callback_data {
2254         const char *tunable;
2255         uint32_t *out;
2256 };
2257
2258 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2259                                  int32_t res, TDB_DATA outdata,
2260                                  void *callback)
2261 {
2262         struct get_tunable_callback_data *cd =
2263                 (struct get_tunable_callback_data *)callback;
2264         int size;
2265
2266         if (res != 0) {
2267                 DEBUG(DEBUG_ERR,
2268                       ("Failure to read \"%s\" tunable from remote node %d\n",
2269                        cd->tunable, pnn));
2270                 return;
2271         }
2272
2273         if (outdata.dsize != sizeof(uint32_t)) {
2274                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2275                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2276                                  (int)outdata.dsize));
2277                 return;
2278         }
2279
2280         size = talloc_get_size(cd->out) / sizeof(uint32_t);
2281         if (pnn >= size) {
2282                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2283                                  cd->tunable, pnn, size));
2284                 return;
2285         }
2286
2287                 
2288         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2289 }
2290
2291 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2292                                         TALLOC_CTX *tmp_ctx,
2293                                         struct ctdb_node_map *nodemap,
2294                                         const char *tunable)
2295 {
2296         TDB_DATA data;
2297         struct ctdb_control_get_tunable *t;
2298         uint32_t *nodes;
2299         uint32_t *tvals;
2300         struct get_tunable_callback_data callback_data;
2301
2302         tvals = talloc_zero_array(tmp_ctx, uint32_t, nodemap->num);
2303         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2304         callback_data.out = tvals;
2305         callback_data.tunable = tunable;
2306
2307         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2308         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2309         t = (struct ctdb_control_get_tunable *)data.dptr;
2310         t->length = strlen(tunable)+1;
2311         memcpy(t->name, tunable, t->length);
2312         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2313         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2314                                       nodes, 0, TAKEOVER_TIMEOUT(),
2315                                       false, data,
2316                                       get_tunable_callback, NULL,
2317                                       &callback_data) != 0) {
2318                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get %s tunable failed\n", tunable));
2319         }
2320         talloc_free(nodes);
2321         talloc_free(data.dptr);
2322
2323         return tvals;
2324 }
2325
2326 /* Set internal flags for IP allocation:
2327  *   Clear ip flags
2328  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2329  */
2330 static void set_ipflags_internal(struct ctdb_node_map *nodemap,
2331                                  uint32_t *tval_noiptakeover)
2332 {
2333         int i;
2334
2335         /* Clear IP flags */
2336         for (i=0;i<nodemap->num;i++) {
2337                 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
2338         }
2339
2340         /* Can not take IPs on node with NoIPTakeover set */
2341         for (i=0;i<nodemap->num;i++) {
2342                 if (tval_noiptakeover[i] != 0) {
2343                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPTAKEOVER;
2344                 }
2345         }
2346 }
2347
2348 static bool set_ipflags(struct ctdb_context *ctdb,
2349                         TALLOC_CTX *tmp_ctx,
2350                         struct ctdb_node_map *nodemap)
2351 {
2352         uint32_t *tval_noiptakeover;
2353
2354         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2355                                                    "NoIPTakeover");
2356         if (tval_noiptakeover == NULL) {
2357                 return false;
2358         }
2359
2360         set_ipflags_internal(nodemap, tval_noiptakeover);
2361
2362         talloc_free(tval_noiptakeover);
2363
2364         return true;
2365 }
2366
2367 /*
2368   make any IP alias changes for public addresses that are necessary 
2369  */
2370 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2371                       client_async_callback fail_callback, void *callback_data)
2372 {
2373         int i;
2374         struct ctdb_public_ip ip;
2375         struct ctdb_public_ipv4 ipv4;
2376         uint32_t *nodes;
2377         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2378         TDB_DATA data;
2379         struct timeval timeout;
2380         struct client_async_data *async_data;
2381         struct ctdb_client_control_state *state;
2382         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2383         uint32_t disable_timeout;
2384
2385         /*
2386          * ip failover is completely disabled, just send out the 
2387          * ipreallocated event.
2388          */
2389         if (ctdb->tunable.disable_ip_failover != 0) {
2390                 goto ipreallocated;
2391         }
2392
2393
2394         if (!set_ipflags(ctdb, tmp_ctx, nodemap)) {
2395                 DEBUG(DEBUG_ERR,("Failed to set IP flags from tunables\n"));
2396                 return -1;
2397         }
2398
2399         ZERO_STRUCT(ip);
2400
2401         /* Do the IP reassignment calculations */
2402         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2403
2404         /* The recovery daemon does regular sanity checks of the IPs.
2405          * However, sometimes it is overzealous and thinks changes are
2406          * required when they're already underway.  This stops the
2407          * checks for a while before we start moving IPs.
2408          */
2409         disable_timeout = ctdb->tunable.takeover_timeout;
2410         data.dptr  = (uint8_t*)&disable_timeout;
2411         data.dsize = sizeof(disable_timeout);
2412         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2413                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2414                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2415         }
2416
2417         /* now tell all nodes to delete any alias that they should not
2418            have.  This will be a NOOP on nodes that don't currently
2419            hold the given alias */
2420         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2421         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2422
2423         async_data->fail_callback = fail_callback;
2424         async_data->callback_data = callback_data;
2425
2426         for (i=0;i<nodemap->num;i++) {
2427                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2428                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2429                         continue;
2430                 }
2431
2432                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2433                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2434                                 /* This node should be serving this
2435                                    vnn so dont tell it to release the ip
2436                                 */
2437                                 continue;
2438                         }
2439                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2440                                 ipv4.pnn = tmp_ip->pnn;
2441                                 ipv4.sin = tmp_ip->addr.ip;
2442
2443                                 timeout = TAKEOVER_TIMEOUT();
2444                                 data.dsize = sizeof(ipv4);
2445                                 data.dptr  = (uint8_t *)&ipv4;
2446                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2447                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2448                                                 data, async_data,
2449                                                 &timeout, NULL);
2450                         } else {
2451                                 ip.pnn  = tmp_ip->pnn;
2452                                 ip.addr = tmp_ip->addr;
2453
2454                                 timeout = TAKEOVER_TIMEOUT();
2455                                 data.dsize = sizeof(ip);
2456                                 data.dptr  = (uint8_t *)&ip;
2457                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2458                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2459                                                 data, async_data,
2460                                                 &timeout, NULL);
2461                         }
2462
2463                         if (state == NULL) {
2464                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2465                                 talloc_free(tmp_ctx);
2466                                 return -1;
2467                         }
2468                 
2469                         ctdb_client_async_add(async_data, state);
2470                 }
2471         }
2472         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2473                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2474                 talloc_free(tmp_ctx);
2475                 return -1;
2476         }
2477         talloc_free(async_data);
2478
2479
2480         /* tell all nodes to get their own IPs */
2481         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2482         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2483
2484         async_data->fail_callback = fail_callback;
2485         async_data->callback_data = callback_data;
2486
2487         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2488                 if (tmp_ip->pnn == -1) {
2489                         /* this IP won't be taken over */
2490                         continue;
2491                 }
2492
2493                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2494                         ipv4.pnn = tmp_ip->pnn;
2495                         ipv4.sin = tmp_ip->addr.ip;
2496
2497                         timeout = TAKEOVER_TIMEOUT();
2498                         data.dsize = sizeof(ipv4);
2499                         data.dptr  = (uint8_t *)&ipv4;
2500                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2501                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2502                                         data, async_data,
2503                                         &timeout, NULL);
2504                 } else {
2505                         ip.pnn  = tmp_ip->pnn;
2506                         ip.addr = tmp_ip->addr;
2507
2508                         timeout = TAKEOVER_TIMEOUT();
2509                         data.dsize = sizeof(ip);
2510                         data.dptr  = (uint8_t *)&ip;
2511                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2512                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2513                                         data, async_data,
2514                                         &timeout, NULL);
2515                 }
2516                 if (state == NULL) {
2517                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2518                         talloc_free(tmp_ctx);
2519                         return -1;
2520                 }
2521                 
2522                 ctdb_client_async_add(async_data, state);
2523         }
2524         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2525                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2526                 talloc_free(tmp_ctx);
2527                 return -1;
2528         }
2529
2530 ipreallocated:
2531         /* 
2532          * Tell all nodes to run eventscripts to process the
2533          * "ipreallocated" event.  This can do a lot of things,
2534          * including restarting services to reconfigure them if public
2535          * IPs have moved.  Once upon a time this event only used to
2536          * update natwg.
2537          */
2538         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2539         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2540                                       nodes, 0, TAKEOVER_TIMEOUT(),
2541                                       false, tdb_null,
2542                                       NULL, fail_callback,
2543                                       callback_data) != 0) {
2544                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2545         }
2546
2547         talloc_free(tmp_ctx);
2548         return 0;
2549 }
2550
2551
2552 /*
2553   destroy a ctdb_client_ip structure
2554  */
2555 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2556 {
2557         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2558                 ctdb_addr_to_str(&ip->addr),
2559                 ntohs(ip->addr.ip.sin_port),
2560                 ip->client_id));
2561
2562         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2563         return 0;
2564 }
2565
2566 /*
2567   called by a client to inform us of a TCP connection that it is managing
2568   that should tickled with an ACK when IP takeover is done
2569   we handle both the old ipv4 style of packets as well as the new ipv4/6
2570   pdus.
2571  */
2572 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2573                                 TDB_DATA indata)
2574 {
2575         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2576         struct ctdb_control_tcp *old_addr = NULL;
2577         struct ctdb_control_tcp_addr new_addr;
2578         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2579         struct ctdb_tcp_list *tcp;
2580         struct ctdb_tcp_connection t;
2581         int ret;
2582         TDB_DATA data;
2583         struct ctdb_client_ip *ip;
2584         struct ctdb_vnn *vnn;
2585         ctdb_sock_addr addr;
2586
2587         switch (indata.dsize) {
2588         case sizeof(struct ctdb_control_tcp):
2589                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2590                 ZERO_STRUCT(new_addr);
2591                 tcp_sock = &new_addr;
2592                 tcp_sock->src.ip  = old_addr->src;
2593                 tcp_sock->dest.ip = old_addr->dest;
2594                 break;
2595         case sizeof(struct ctdb_control_tcp_addr):
2596                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2597                 break;
2598         default:
2599                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2600                                  "to ctdb_control_tcp_client. size was %d but "
2601                                  "only allowed sizes are %lu and %lu\n",
2602                                  (int)indata.dsize,
2603                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2604                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2605                 return -1;
2606         }
2607
2608         addr = tcp_sock->src;
2609         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2610         addr = tcp_sock->dest;
2611         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2612
2613         ZERO_STRUCT(addr);
2614         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2615         vnn = find_public_ip_vnn(ctdb, &addr);
2616         if (vnn == NULL) {
2617                 switch (addr.sa.sa_family) {
2618                 case AF_INET:
2619                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2620                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2621                                         ctdb_addr_to_str(&addr)));
2622                         }
2623                         break;
2624                 case AF_INET6:
2625                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2626                                 ctdb_addr_to_str(&addr)));
2627                         break;
2628                 default:
2629                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2630                 }
2631
2632                 return 0;
2633         }
2634
2635         if (vnn->pnn != ctdb->pnn) {
2636                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2637                         ctdb_addr_to_str(&addr),
2638                         client_id, client->pid));
2639                 /* failing this call will tell smbd to die */
2640                 return -1;
2641         }
2642
2643         ip = talloc(client, struct ctdb_client_ip);
2644         CTDB_NO_MEMORY(ctdb, ip);
2645
2646         ip->ctdb      = ctdb;
2647         ip->addr      = addr;
2648         ip->client_id = client_id;
2649         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2650         DLIST_ADD(ctdb->client_ip_list, ip);
2651
2652         tcp = talloc(client, struct ctdb_tcp_list);
2653         CTDB_NO_MEMORY(ctdb, tcp);
2654
2655         tcp->connection.src_addr = tcp_sock->src;
2656         tcp->connection.dst_addr = tcp_sock->dest;
2657
2658         DLIST_ADD(client->tcp_list, tcp);
2659
2660         t.src_addr = tcp_sock->src;
2661         t.dst_addr = tcp_sock->dest;
2662
2663         data.dptr = (uint8_t *)&t;
2664         data.dsize = sizeof(t);
2665
2666         switch (addr.sa.sa_family) {
2667         case AF_INET:
2668                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2669                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2670                         ctdb_addr_to_str(&tcp_sock->src),
2671                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2672                 break;
2673         case AF_INET6:
2674                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2675                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2676                         ctdb_addr_to_str(&tcp_sock->src),
2677                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2678                 break;
2679         default:
2680                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2681         }
2682
2683
2684         /* tell all nodes about this tcp connection */
2685         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2686                                        CTDB_CONTROL_TCP_ADD,
2687                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2688         if (ret != 0) {
2689                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2690                 return -1;
2691         }
2692
2693         return 0;
2694 }
2695
2696 /*
2697   find a tcp address on a list
2698  */
2699 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2700                                            struct ctdb_tcp_connection *tcp)
2701 {
2702         int i;
2703
2704         if (array == NULL) {
2705                 return NULL;
2706         }
2707
2708         for (i=0;i<array->num;i++) {
2709                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2710                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2711                         return &array->connections[i];
2712                 }
2713         }
2714         return NULL;
2715 }
2716
2717
2718
2719 /*
2720   called by a daemon to inform us of a TCP connection that one of its
2721   clients managing that should tickled with an ACK when IP takeover is
2722   done
2723  */
2724 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2725 {
2726         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2727         struct ctdb_tcp_array *tcparray;
2728         struct ctdb_tcp_connection tcp;
2729         struct ctdb_vnn *vnn;
2730
2731         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2732         if (vnn == NULL) {
2733                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2734                         ctdb_addr_to_str(&p->dst_addr)));
2735
2736                 return -1;
2737         }
2738
2739
2740         tcparray = vnn->tcp_array;
2741
2742         /* If this is the first tickle */
2743         if (tcparray == NULL) {
2744                 tcparray = talloc_size(ctdb->nodes, 
2745                         offsetof(struct ctdb_tcp_array, connections) +
2746                         sizeof(struct ctdb_tcp_connection) * 1);
2747                 CTDB_NO_MEMORY(ctdb, tcparray);
2748                 vnn->tcp_array = tcparray;
2749
2750                 tcparray->num = 0;
2751                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2752                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2753
2754                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2755                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2756                 tcparray->num++;
2757
2758                 if (tcp_update_needed) {
2759                         vnn->tcp_update_needed = true;
2760                 }
2761                 return 0;
2762         }
2763
2764
2765         /* Do we already have this tickle ?*/
2766         tcp.src_addr = p->src_addr;
2767         tcp.dst_addr = p->dst_addr;
2768         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2769                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2770                         ctdb_addr_to_str(&tcp.dst_addr),
2771                         ntohs(tcp.dst_addr.ip.sin_port),
2772                         vnn->pnn));
2773                 return 0;
2774         }
2775
2776         /* A new tickle, we must add it to the array */
2777         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2778                                         struct ctdb_tcp_connection,
2779                                         tcparray->num+1);
2780         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2781
2782         vnn->tcp_array = tcparray;
2783         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2784         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2785         tcparray->num++;
2786                                 
2787         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2788                 ctdb_addr_to_str(&tcp.dst_addr),
2789                 ntohs(tcp.dst_addr.ip.sin_port),
2790                 vnn->pnn));
2791
2792         if (tcp_update_needed) {
2793                 vnn->tcp_update_needed = true;
2794         }
2795
2796         return 0;
2797 }
2798
2799
2800 /*
2801   called by a daemon to inform us of a TCP connection that one of its
2802   clients managing that should tickled with an ACK when IP takeover is
2803   done
2804  */
2805 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2806 {
2807         struct ctdb_tcp_connection *tcpp;
2808         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2809
2810         if (vnn == NULL) {
2811                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2812                         ctdb_addr_to_str(&conn->dst_addr)));
2813                 return;
2814         }
2815
2816         /* if the array is empty we cant remove it
2817            and we dont need to do anything
2818          */
2819         if (vnn->tcp_array == NULL) {
2820                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2821                         ctdb_addr_to_str(&conn->dst_addr),
2822                         ntohs(conn->dst_addr.ip.sin_port)));
2823                 return;
2824         }
2825
2826
2827         /* See if we know this connection
2828            if we dont know this connection  then we dont need to do anything
2829          */
2830         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2831         if (tcpp == NULL) {
2832                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2833                         ctdb_addr_to_str(&conn->dst_addr),
2834                         ntohs(conn->dst_addr.ip.sin_port)));
2835                 return;
2836         }
2837
2838
2839         /* We need to remove this entry from the array.
2840            Instead of allocating a new array and copying data to it
2841            we cheat and just copy the last entry in the existing array
2842            to the entry that is to be removed and just shring the 
2843            ->num field
2844          */
2845         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2846         vnn->tcp_array->num--;
2847
2848         /* If we deleted the last entry we also need to remove the entire array
2849          */
2850         if (vnn->tcp_array->num == 0) {
2851                 talloc_free(vnn->tcp_array);
2852                 vnn->tcp_array = NULL;
2853         }               
2854
2855         vnn->tcp_update_needed = true;
2856
2857         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2858                 ctdb_addr_to_str(&conn->src_addr),
2859                 ntohs(conn->src_addr.ip.sin_port)));
2860 }
2861
2862
2863 /*
2864   called by a daemon to inform us of a TCP connection that one of its
2865   clients used are no longer needed in the tickle database
2866  */
2867 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2868 {
2869         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2870
2871         ctdb_remove_tcp_connection(ctdb, conn);
2872
2873         return 0;
2874 }
2875
2876
/*
  called when a daemon restarts - meant to send all tickles for all
  public addresses we are serving immediately to the new node.

  Currently a stub: neither argument is used and 0 is always returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2886
2887
2888 /*
2889   called when a client structure goes away - hook to remove
2890   elements from the tcp_list in all daemons
2891  */
2892 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2893 {
2894         while (client->tcp_list) {
2895                 struct ctdb_tcp_list *tcp = client->tcp_list;
2896                 DLIST_REMOVE(client->tcp_list, tcp);
2897                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2898         }
2899 }
2900
2901
2902 /*
2903   release all IPs on shutdown
2904  */
2905 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2906 {
2907         struct ctdb_vnn *vnn;
2908
2909         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2910                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2911                         ctdb_vnn_unassign_iface(ctdb, vnn);
2912                         continue;
2913                 }
2914                 if (!vnn->iface) {
2915                         continue;
2916                 }
2917                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2918                                   ctdb_vnn_iface_string(vnn),
2919                                   ctdb_addr_to_str(&vnn->public_address),
2920                                   vnn->public_netmask_bits);
2921                 release_kill_clients(ctdb, &vnn->public_address);
2922                 ctdb_vnn_unassign_iface(ctdb, vnn);
2923         }
2924 }
2925
2926
2927 /*
2928   get list of public IPs
2929  */
2930 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2931                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2932 {
2933         int i, num, len;
2934         struct ctdb_all_public_ips *ips;
2935         struct ctdb_vnn *vnn;
2936         bool only_available = false;
2937
2938         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2939                 only_available = true;
2940         }
2941
2942         /* count how many public ip structures we have */
2943         num = 0;
2944         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2945                 num++;
2946         }
2947
2948         len = offsetof(struct ctdb_all_public_ips, ips) + 
2949                 num*sizeof(struct ctdb_public_ip);
2950         ips = talloc_zero_size(outdata, len);
2951         CTDB_NO_MEMORY(ctdb, ips);
2952
2953         i = 0;
2954         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2955                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2956                         continue;
2957                 }
2958                 ips->ips[i].pnn  = vnn->pnn;
2959                 ips->ips[i].addr = vnn->public_address;
2960                 i++;
2961         }
2962         ips->num = i;
2963         len = offsetof(struct ctdb_all_public_ips, ips) +
2964                 i*sizeof(struct ctdb_public_ip);
2965
2966         outdata->dsize = len;
2967         outdata->dptr  = (uint8_t *)ips;
2968
2969         return 0;
2970 }
2971
2972
2973 /*
2974   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2975  */
2976 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2977                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2978 {
2979         int i, num, len;
2980         struct ctdb_all_public_ipsv4 *ips;
2981         struct ctdb_vnn *vnn;
2982
2983         /* count how many public ip structures we have */
2984         num = 0;
2985         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2986                 if (vnn->public_address.sa.sa_family != AF_INET) {
2987                         continue;
2988                 }
2989                 num++;
2990         }
2991
2992         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2993                 num*sizeof(struct ctdb_public_ipv4);
2994         ips = talloc_zero_size(outdata, len);
2995         CTDB_NO_MEMORY(ctdb, ips);
2996
2997         outdata->dsize = len;
2998         outdata->dptr  = (uint8_t *)ips;
2999
3000         ips->num = num;
3001         i = 0;
3002         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3003                 if (vnn->public_address.sa.sa_family != AF_INET) {
3004                         continue;
3005                 }
3006                 ips->ips[i].pnn = vnn->pnn;
3007                 ips->ips[i].sin = vnn->public_address.ip;
3008                 i++;
3009         }
3010
3011         return 0;
3012 }
3013
3014 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3015                                         struct ctdb_req_control *c,
3016                                         TDB_DATA indata,
3017                                         TDB_DATA *outdata)
3018 {
3019         int i, num, len;
3020         ctdb_sock_addr *addr;
3021         struct ctdb_control_public_ip_info *info;
3022         struct ctdb_vnn *vnn;
3023
3024         addr = (ctdb_sock_addr *)indata.dptr;
3025
3026         vnn = find_public_ip_vnn(ctdb, addr);
3027         if (vnn == NULL) {
3028                 /* if it is not a public ip   it could be our 'single ip' */
3029                 if (ctdb->single_ip_vnn) {
3030                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3031                                 vnn = ctdb->single_ip_vnn;
3032                         }
3033                 }
3034         }
3035         if (vnn == NULL) {
3036                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3037                                  "'%s'not a public address\n",
3038                                  ctdb_addr_to_str(addr)));
3039                 return -1;
3040         }
3041
3042         /* count how many public ip structures we have */
3043         num = 0;
3044         for (;vnn->ifaces[num];) {
3045                 num++;
3046         }
3047
3048         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3049                 num*sizeof(struct ctdb_control_iface_info);
3050         info = talloc_zero_size(outdata, len);
3051         CTDB_NO_MEMORY(ctdb, info);
3052
3053         info->ip.addr = vnn->public_address;
3054         info->ip.pnn = vnn->pnn;
3055         info->active_idx = 0xFFFFFFFF;
3056
3057         for (i=0; vnn->ifaces[i]; i++) {
3058                 struct ctdb_iface *cur;
3059
3060                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3061                 if (cur == NULL) {
3062                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3063                                            vnn->ifaces[i]));
3064                         return -1;
3065                 }
3066                 if (vnn->iface == cur) {
3067                         info->active_idx = i;
3068                 }
3069                 strcpy(info->ifaces[i].name, cur->name);
3070                 info->ifaces[i].link_state = cur->link_up;
3071                 info->ifaces[i].references = cur->references;
3072         }
3073         info->num = i;
3074         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3075                 i*sizeof(struct ctdb_control_iface_info);
3076
3077         outdata->dsize = len;
3078         outdata->dptr  = (uint8_t *)info;
3079
3080         return 0;
3081 }
3082
3083 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3084                                 struct ctdb_req_control *c,
3085                                 TDB_DATA *outdata)
3086 {
3087         int i, num, len;
3088         struct ctdb_control_get_ifaces *ifaces;
3089         struct ctdb_iface *cur;
3090
3091         /* count how many public ip structures we have */
3092         num = 0;
3093         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3094                 num++;
3095         }
3096
3097         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3098                 num*sizeof(struct ctdb_control_iface_info);
3099         ifaces = talloc_zero_size(outdata, len);
3100         CTDB_NO_MEMORY(ctdb, ifaces);
3101
3102         i = 0;
3103         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3104                 strcpy(ifaces->ifaces[i].name, cur->name);
3105                 ifaces->ifaces[i].link_state = cur->link_up;
3106                 ifaces->ifaces[i].references = cur->references;
3107                 i++;
3108         }
3109         ifaces->num = i;
3110         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3111                 i*sizeof(struct ctdb_control_iface_info);
3112
3113         outdata->dsize = len;
3114         outdata->dptr  = (uint8_t *)ifaces;
3115
3116         return 0;
3117 }
3118
3119 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3120                                     struct ctdb_req_control *c,
3121                                     TDB_DATA indata)
3122 {
3123         struct ctdb_control_iface_info *info;
3124         struct ctdb_iface *iface;
3125         bool link_up = false;
3126
3127         info = (struct ctdb_control_iface_info *)indata.dptr;
3128
3129         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3130                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3131                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3132                                   len, len, info->name));
3133                 return -1;
3134         }
3135
3136         switch (info->link_state) {
3137         case 0:
3138                 link_up = false;
3139                 break;
3140         case 1:
3141                 link_up = true;
3142                 break;
3143         default:
3144                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3145                                   (unsigned int)info->link_state));
3146                 return -1;
3147         }
3148
3149         if (info->references != 0) {
3150                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3151                                   (unsigned int)info->references));
3152                 return -1;
3153         }
3154
3155         iface = ctdb_find_iface(ctdb, info->name);
3156         if (iface == NULL) {
3157                 return -1;
3158         }
3159
3160         if (link_up == iface->link_up) {
3161                 return 0;
3162         }
3163
3164         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3165               ("iface[%s] has changed it's link status %s => %s\n",
3166                iface->name,
3167                iface->link_up?"up":"down",
3168                link_up?"up":"down"));
3169
3170         iface->link_up = link_up;
3171         return 0;
3172 }
3173
3174
3175 /* 
3176    structure containing the listening socket and the list of tcp connections
3177    that the ctdb daemon is to kill
3178 */
3179 struct ctdb_kill_tcp {
3180         struct ctdb_vnn *vnn;
3181         struct ctdb_context *ctdb;
3182         int capture_fd;
3183         struct fd_event *fde;
3184         trbt_tree_t *connections;
3185         void *private_data;
3186 };
3187
3188 /*
3189   a tcp connection that is to be killed
3190  */
3191 struct ctdb_killtcp_con {
3192         ctdb_sock_addr src_addr;
3193         ctdb_sock_addr dst_addr;
3194         int count;
3195         struct ctdb_kill_tcp *killtcp;
3196 };
3197
3198 /* this function is used to create a key to represent this socketpair
3199    in the killtcp tree.
3200    this key is used to insert and lookup matching socketpairs that are
3201    to be tickled and RST
3202 */
3203 #define KILLTCP_KEYLEN  10
3204 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3205 {
3206         static uint32_t key[KILLTCP_KEYLEN];
3207
3208         bzero(key, sizeof(key));
3209
3210         if (src->sa.sa_family != dst->sa.sa_family) {
3211                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3212                 return key;
3213         }
3214         
3215         switch (src->sa.sa_family) {
3216         case AF_INET:
3217                 key[0]  = dst->ip.sin_addr.s_addr;
3218                 key[1]  = src->ip.sin_addr.s_addr;
3219                 key[2]  = dst->ip.sin_port;
3220                 key[3]  = src->ip.sin_port;
3221                 break;
3222         case AF_INET6: {
3223                 uint32_t *dst6_addr32 =
3224                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3225                 uint32_t *src6_addr32 =
3226                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3227                 key[0]  = dst6_addr32[3];
3228                 key[1]  = src6_addr32[3];
3229                 key[2]  = dst6_addr32[2];
3230                 key[3]  = src6_addr32[2];
3231                 key[4]  = dst6_addr32[1];
3232                 key[5]  = src6_addr32[1];
3233                 key[6]  = dst6_addr32[0];
3234                 key[7]  = src6_addr32[0];
3235                 key[8]  = dst->ip6.sin6_port;
3236                 key[9]  = src->ip6.sin6_port;
3237                 break;
3238         }
3239         default:
3240                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3241                 return key;
3242         }
3243
3244         return key;
3245 }
3246
3247 /*
3248   called when we get a read event on the raw socket
3249  */
3250 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3251                                 uint16_t flags, void *private_data)
3252 {
3253         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3254         struct ctdb_killtcp_con *con;
3255         ctdb_sock_addr src, dst;
3256         uint32_t ack_seq, seq;
3257
3258         if (!(flags & EVENT_FD_READ)) {
3259                 return;
3260         }
3261
3262         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3263                                 killtcp->private_data,
3264                                 &src, &dst,
3265                                 &ack_seq, &seq) != 0) {
3266                 /* probably a non-tcp ACK packet */
3267                 return;
3268         }
3269
3270         /* check if we have this guy in our list of connections
3271            to kill
3272         */
3273         con = trbt_lookuparray32(killtcp->connections, 
3274                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3275         if (con == NULL) {
3276                 /* no this was some other packet we can just ignore */
3277                 return;
3278         }
3279
3280         /* This one has been tickled !
3281            now reset him and remove him from the list.
3282          */
3283         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3284                 ntohs(con->dst_addr.ip.sin_port),
3285                 ctdb_addr_to_str(&con->src_addr),
3286                 ntohs(con->src_addr.ip.sin_port)));
3287
3288         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3289         talloc_free(con);
3290 }
3291
3292
3293 /* when traversing the list of all tcp connections to send tickle acks to
3294    (so that we can capture the ack coming back and kill the connection
3295     by a RST)
3296    this callback is called for each connection we are currently trying to kill
3297 */
3298 static int tickle_connection_traverse(void *param, void *data)
3299 {
3300         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3301
3302         /* have tried too many times, just give up */
3303         if (con->count >= 5) {
3304                 /* can't delete in traverse: reparent to delete_cons */
3305                 talloc_steal(param, con);
3306                 return 0;
3307         }
3308
3309         /* othervise, try tickling it again */
3310         con->count++;
3311         ctdb_sys_send_tcp(
3312                 (ctdb_sock_addr *)&con->dst_addr,
3313                 (ctdb_sock_addr *)&con->src_addr,
3314                 0, 0, 0);
3315         return 0;
3316 }
3317
3318
3319 /* 
3320    called every second until all sentenced connections have been reset
3321  */
3322 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3323                                               struct timeval t, void *private_data)
3324 {
3325         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3326         void *delete_cons = talloc_new(NULL);
3327
3328         /* loop over all connections sending tickle ACKs */
3329         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3330
3331         /* now we've finished traverse, it's safe to do deletion. */
3332         talloc_free(delete_cons);
3333
3334         /* If there are no more connections to kill we can remove the
3335            entire killtcp structure
3336          */
3337         if ( (killtcp->connections == NULL) || 
3338              (killtcp->connections->root == NULL) ) {
3339                 talloc_free(killtcp);
3340                 return;
3341         }
3342
3343         /* try tickling them again in a seconds time
3344          */
3345         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3346                         ctdb_tickle_sentenced_connections, killtcp);
3347 }
3348
3349 /*
3350   destroy the killtcp structure
3351  */
3352 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3353 {
3354         struct ctdb_vnn *tmpvnn;
3355
3356         /* verify that this vnn is still active */
3357         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3358                 if (tmpvnn == killtcp->vnn) {
3359                         break;
3360                 }
3361         }
3362
3363         if (tmpvnn == NULL) {
3364                 return 0;
3365         }
3366
3367         if (killtcp->vnn->killtcp != killtcp) {
3368                 return 0;
3369         }
3370
3371         killtcp->vnn->killtcp = NULL;
3372
3373         return 0;
3374 }
3375
3376
/* Insert callback for the killtcp tree: unconditionally let the new
   connection structure (parm) replace whatever was stored (data).

   The old structure is deliberately not freed here.  According to the
   original author it is talloc_stolen by the same tree node and is
   deleted together with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
3388
3389 /*
3390   add a tcp socket to the list of connections we want to RST
3391  */
3392 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3393                                        ctdb_sock_addr *s,
3394                                        ctdb_sock_addr *d)
3395 {
3396         ctdb_sock_addr src, dst;
3397         struct ctdb_kill_tcp *killtcp;
3398         struct ctdb_killtcp_con *con;
3399         struct ctdb_vnn *vnn;
3400
3401         ctdb_canonicalize_ip(s, &src);
3402         ctdb_canonicalize_ip(d, &dst);
3403
3404         vnn = find_public_ip_vnn(ctdb, &dst);
3405         if (vnn == NULL) {
3406                 vnn = find_public_ip_vnn(ctdb, &src);
3407         }
3408         if (vnn == NULL) {
3409                 /* if it is not a public ip   it could be our 'single ip' */
3410                 if (ctdb->single_ip_vnn) {
3411                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3412                                 vnn = ctdb->single_ip_vnn;
3413                         }
3414                 }
3415         }
3416         if (vnn == NULL) {
3417                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3418                 return -1;
3419         }
3420
3421         killtcp = vnn->killtcp;
3422         
3423         /* If this is the first connection to kill we must allocate
3424            a new structure
3425          */
3426         if (killtcp == NULL) {
3427                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3428                 CTDB_NO_MEMORY(ctdb, killtcp);
3429
3430                 killtcp->vnn         = vnn;
3431                 killtcp->ctdb        = ctdb;
3432                 killtcp->capture_fd  = -1;
3433                 killtcp->connections = trbt_create(killtcp, 0);
3434
3435                 vnn->killtcp         = killtcp;
3436                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3437         }
3438
3439
3440
3441         /* create a structure that describes this connection we want to
3442            RST and store it in killtcp->connections
3443         */
3444         con = talloc(killtcp, struct ctdb_killtcp_con);
3445         CTDB_NO_MEMORY(ctdb, con);
3446         con->src_addr = src;
3447         con->dst_addr = dst;
3448         con->count    = 0;
3449         con->killtcp  = killtcp;
3450
3451
3452         trbt_insertarray32_callback(killtcp->connections,
3453                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3454                         add_killtcp_callback, con);
3455
3456         /* 
3457            If we dont have a socket to listen on yet we must create it
3458          */
3459         if (killtcp->capture_fd == -1) {
3460                 const char *iface = ctdb_vnn_iface_string(vnn);
3461                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3462                 if (killtcp->capture_fd == -1) {
3463                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3464                                           "socket on iface '%s' for killtcp (%s)\n",
3465                                           iface, strerror(errno)));
3466                         goto failed;
3467                 }
3468         }
3469
3470
3471         if (killtcp->fde == NULL) {
3472                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3473                                             EVENT_FD_READ,
3474                                             capture_tcp_handler, killtcp);
3475                 tevent_fd_set_auto_close(killtcp->fde);
3476
3477                 /* We also need to set up some events to tickle all these connections
3478                    until they are all reset
3479                 */
3480                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3481                                 ctdb_tickle_sentenced_connections, killtcp);
3482         }
3483
3484         /* tickle him once now */
3485         ctdb_sys_send_tcp(
3486                 &con->dst_addr,
3487                 &con->src_addr,
3488                 0, 0, 0);
3489
3490         return 0;
3491
3492 failed:
3493         talloc_free(vnn->killtcp);
3494         vnn->killtcp = NULL;
3495         return -1;
3496 }
3497
3498 /*
3499   kill a TCP connection.
3500  */
3501 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3502 {
3503         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3504
3505         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3506 }
3507
3508 /*
3509   called by a daemon to inform us of the entire list of TCP tickles for
3510   a particular public address.
3511   this control should only be sent by the node that is currently serving
3512   that public address.
3513  */
3514 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3515 {
3516         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3517         struct ctdb_tcp_array *tcparray;
3518         struct ctdb_vnn *vnn;
3519
3520         /* We must at least have tickles.num or else we cant verify the size
3521            of the received data blob
3522          */
3523         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3524                                         tickles.connections)) {
3525                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3526                 return -1;
3527         }
3528
3529         /* verify that the size of data matches what we expect */
3530         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3531                                 tickles.connections)
3532                          + sizeof(struct ctdb_tcp_connection)
3533                                  * list->tickles.num) {
3534                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3535                 return -1;
3536         }       
3537
3538         vnn = find_public_ip_vnn(ctdb, &list->addr);
3539         if (vnn == NULL) {
3540                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3541                         ctdb_addr_to_str(&list->addr)));
3542
3543                 return 1;
3544         }
3545
3546         /* remove any old ticklelist we might have */
3547         talloc_free(vnn->tcp_array);
3548         vnn->tcp_array = NULL;
3549
3550         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3551         CTDB_NO_MEMORY(ctdb, tcparray);
3552
3553         tcparray->num = list->tickles.num;
3554
3555         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3556         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3557
3558         memcpy(tcparray->connections, &list->tickles.connections[0], 
3559                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3560
3561         /* We now have a new fresh tickle list array for this vnn */
3562         vnn->tcp_array = talloc_steal(vnn, tcparray);
3563         
3564         return 0;
3565 }
3566
3567 /*
3568   called to return the full list of tickles for the puclic address associated 
3569   with the provided vnn
3570  */
3571 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3572 {
3573         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3574         struct ctdb_control_tcp_tickle_list *list;
3575         struct ctdb_tcp_array *tcparray;
3576         int num;
3577         struct ctdb_vnn *vnn;
3578
3579         vnn = find_public_ip_vnn(ctdb, addr);
3580         if (vnn == NULL) {
3581                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3582                         ctdb_addr_to_str(addr)));
3583
3584                 return 1;
3585         }
3586
3587         tcparray = vnn->tcp_array;
3588         if (tcparray) {
3589                 num = tcparray->num;
3590         } else {
3591                 num = 0;
3592         }
3593
3594         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3595                                 tickles.connections)
3596                         + sizeof(struct ctdb_tcp_connection) * num;
3597
3598         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3599         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3600         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3601
3602         list->addr = *addr;
3603         list->tickles.num = num;
3604         if (num) {
3605                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3606                         sizeof(struct ctdb_tcp_connection) * num);
3607         }
3608
3609         return 0;
3610 }
3611
3612
3613 /*
3614   set the list of all tcp tickles for a public address
3615  */
3616 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3617                               struct timeval timeout, uint32_t destnode, 
3618                               ctdb_sock_addr *addr,
3619                               struct ctdb_tcp_array *tcparray)
3620 {
3621         int ret, num;
3622         TDB_DATA data;
3623         struct ctdb_control_tcp_tickle_list *list;
3624
3625         if (tcparray) {
3626                 num = tcparray->num;
3627         } else {
3628                 num = 0;
3629         }
3630
3631         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3632                                 tickles.connections) +
3633                         sizeof(struct ctdb_tcp_connection) * num;
3634         data.dptr = talloc_size(ctdb, data.dsize);
3635         CTDB_NO_MEMORY(ctdb, data.dptr);
3636
3637         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3638         list->addr = *addr;
3639         list->tickles.num = num;
3640         if (tcparray) {
3641                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3642         }
3643
3644         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3645                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3646                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3647         if (ret != 0) {
3648                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3649                 return -1;
3650         }
3651
3652         talloc_free(data.dptr);
3653
3654         return ret;
3655 }
3656
3657
3658 /*
3659   perform tickle updates if required
3660  */
3661 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3662                                 struct timed_event *te, 
3663                                 struct timeval t, void *private_data)
3664 {
3665         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3666         int ret;
3667         struct ctdb_vnn *vnn;
3668
3669         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3670                 /* we only send out updates for public addresses that 
3671                    we have taken over
3672                  */
3673                 if (ctdb->pnn != vnn->pnn) {
3674                         continue;
3675                 }
3676                 /* We only send out the updates if we need to */
3677                 if (!vnn->tcp_update_needed) {
3678                         continue;
3679                 }
3680                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3681                                 TAKEOVER_TIMEOUT(),
3682                                 CTDB_BROADCAST_CONNECTED,
3683                                 &vnn->public_address,
3684                                 vnn->tcp_array);
3685                 if (ret != 0) {
3686                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3687                                 ctdb_addr_to_str(&vnn->public_address)));
3688                 }
3689         }
3690
3691         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3692                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3693                              ctdb_update_tcp_tickles, ctdb);
3694 }               
3695         
3696
3697 /*
3698   start periodic update of tcp tickles
3699  */
3700 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3701 {
3702         ctdb->tickle_update_context = talloc_new(ctdb);
3703
3704         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3705                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3706                              ctdb_update_tcp_tickles, ctdb);
3707 }
3708
3709
3710
3711
3712 struct control_gratious_arp {
3713         struct ctdb_context *ctdb;
3714         ctdb_sock_addr addr;
3715         const char *iface;
3716         int count;
3717 };
3718
3719 /*
3720   send a control_gratuitous arp
3721  */
3722 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3723                                   struct timeval t, void *private_data)
3724 {
3725         int ret;
3726         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3727                                                         struct control_gratious_arp);
3728
3729         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3730         if (ret != 0) {
3731                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3732                                  arp->iface, strerror(errno)));
3733         }
3734
3735
3736         arp->count++;
3737         if (arp->count == CTDB_ARP_REPEAT) {
3738                 talloc_free(arp);
3739                 return;
3740         }
3741
3742         event_add_timed(arp->ctdb->ev, arp, 
3743                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3744                         send_gratious_arp, arp);
3745 }
3746
3747
3748 /*
3749   send a gratious arp 
3750  */
3751 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3752 {
3753         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3754         struct control_gratious_arp *arp;
3755
3756         /* verify the size of indata */
3757         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3758                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3759                                  (unsigned)indata.dsize, 
3760                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3761                 return -1;
3762         }
3763         if (indata.dsize != 
3764                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3765                 + gratious_arp->len ) ){
3766
3767                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3768                         "but should be %u bytes\n", 
3769                          (unsigned)indata.dsize, 
3770                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3771                 return -1;
3772         }
3773
3774
3775         arp = talloc(ctdb, struct control_gratious_arp);
3776         CTDB_NO_MEMORY(ctdb, arp);
3777
3778         arp->ctdb  = ctdb;
3779         arp->addr   = gratious_arp->addr;
3780         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3781         CTDB_NO_MEMORY(ctdb, arp->iface);
3782         arp->count = 0;
3783         
3784         event_add_timed(arp->ctdb->ev, arp, 
3785                         timeval_zero(), send_gratious_arp, arp);
3786
3787         return 0;
3788 }
3789
3790 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3791 {
3792         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3793         int ret;
3794
3795         /* verify the size of indata */
3796         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3797                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3798                 return -1;
3799         }
3800         if (indata.dsize != 
3801                 ( offsetof(struct ctdb_control_ip_iface, iface)
3802                 + pub->len ) ){
3803
3804                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3805                         "but should be %u bytes\n", 
3806                          (unsigned)indata.dsize, 
3807                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3808                 return -1;
3809         }
3810
3811         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3812
3813         if (ret != 0) {
3814                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3815                 return -1;
3816         }
3817
3818         return 0;
3819 }
3820
/*
  Completion callback of the releaseip event started by
  ctdb_control_del_public_address(): frees the talloc context that was
  passed in as private_data.  The event status is ignored.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	(void)ctdb;
	(void)status;

	talloc_free(private_data);
}
3829
3830 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3831 {
3832         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3833         struct ctdb_vnn *vnn;
3834         int ret;
3835
3836         /* verify the size of indata */
3837         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3838                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3839                 return -1;
3840         }
3841         if (indata.dsize != 
3842                 ( offsetof(struct ctdb_control_ip_iface, iface)
3843                 + pub->len ) ){
3844
3845                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3846                         "but should be %u bytes\n", 
3847                          (unsigned)indata.dsize, 
3848                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3849                 return -1;
3850         }
3851
3852         /* walk over all public addresses until we find a match */
3853         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3854                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3855                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3856
3857                         DLIST_REMOVE(ctdb->vnn, vnn);
3858                         talloc_steal(mem_ctx, vnn);
3859                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3860                         if (vnn->pnn != ctdb->pnn) {
3861                                 if (vnn->iface != NULL) {
3862                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3863                                 }
3864                                 talloc_free(mem_ctx);
3865                                 return 0;
3866                         }
3867                         vnn->pnn = -1;
3868
3869                         ret = ctdb_event_script_callback(ctdb, 
3870                                          mem_ctx, delete_ip_callback, mem_ctx,
3871                                          false,
3872                                          CTDB_EVENT_RELEASE_IP,
3873                                          "%s %s %u",
3874                                          ctdb_vnn_iface_string(vnn),
3875                                          ctdb_addr_to_str(&vnn->public_address),
3876                                          vnn->public_netmask_bits);
3877                         if (vnn->iface != NULL) {
3878                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3879                         }
3880                         if (ret != 0) {
3881                                 return -1;
3882                         }
3883                         return 0;
3884                 }
3885         }
3886
3887         return -1;
3888 }
3889
3890
/*
  State carried from ctdb_control_ipreallocated() to its completion
  callback
 */
struct ipreallocated_callback_state {
	/* the control request that is answered once the event has finished */
	struct ctdb_req_control *c;
};
3894
3895 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3896                                         int status, void *p)
3897 {
3898         struct ipreallocated_callback_state *state =
3899                 talloc_get_type(p, struct ipreallocated_callback_state);
3900
3901         if (status != 0) {
3902                 DEBUG(DEBUG_ERR,
3903                       (" \"ipreallocated\" event script failed (status %d)\n",
3904                        status));
3905                 if (status == -ETIME) {
3906                         ctdb_ban_self(ctdb);
3907                 }
3908         }
3909
3910         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3911         talloc_free(state);
3912 }
3913
3914 /* A control to run the ipreallocated event */
3915 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3916                                    struct ctdb_req_control *c,
3917                                    bool *async_reply)
3918 {
3919         int ret;
3920         struct ipreallocated_callback_state *state;
3921
3922         state = talloc(ctdb, struct ipreallocated_callback_state);
3923         CTDB_NO_MEMORY(ctdb, state);
3924
3925         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3926
3927         ret = ctdb_event_script_callback(ctdb, state,
3928                                          ctdb_ipreallocated_callback, state,
3929                                          false, CTDB_EVENT_IPREALLOCATED,
3930                                          "%s", "");
3931
3932         if (ret != 0) {
3933                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3934                 talloc_free(state);
3935                 return -1;
3936         }
3937
3938         /* tell the control that we will be reply asynchronously */
3939         state->c    = talloc_steal(state, c);
3940         *async_reply = true;
3941
3942         return 0;
3943 }
3944
3945
3946 /* This function is called from the recovery daemon to verify that a remote
3947    node has the expected ip allocation.
3948    This is verified against ctdb->ip_tree
3949 */
3950 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3951 {
3952         struct ctdb_public_ip_list *tmp_ip; 
3953         int i;
3954
3955         if (ctdb->ip_tree == NULL) {
3956                 /* dont know the expected allocation yet, assume remote node
3957                    is correct. */
3958                 return 0;
3959         }
3960
3961         if (ips == NULL) {
3962                 return 0;
3963         }
3964
3965         for (i=0; i<ips->num; i++) {
3966                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3967                 if (tmp_ip == NULL) {
3968                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3969                         return -1;
3970                 }
3971
3972                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3973                         continue;
3974                 }
3975
3976                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3977                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3978                         return -1;
3979                 }
3980         }
3981
3982         return 0;
3983 }
3984
3985 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3986 {
3987         struct ctdb_public_ip_list *tmp_ip; 
3988
3989         if (ctdb->ip_tree == NULL) {
3990                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3991                 return -1;
3992         }
3993
3994         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3995         if (tmp_ip == NULL) {
3996                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3997                 return -1;
3998         }
3999
4000         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4001         tmp_ip->pnn = ip->pnn;
4002
4003         return 0;
4004 }
4005
4006
/* State for one in-flight "reload public ips" request.  The work is done
   by a forked child; this handle lives in the parent until the child
   reports back or the request times out. */
struct ctdb_reloadips_handle {
	/* daemon context this request belongs to */
	struct ctdb_context *ctdb;
	/* control request to reply to when the handle is destroyed;
	   NULL when there is nothing (left) to reply to */
	struct ctdb_req_control *c;
	/* status sent in the reply; starts at -1 and is overwritten with
	   the child's result */
	int status;
	/* pipe between child and parent: the parent reads from fd[0],
	   the child writes its one-byte result to fd[1] */
	int fd[2];
	/* pid of the forked child; it is sent SIGKILL by the destructor */
	pid_t child;
	/* fd event watching fd[0] for the child's result */
	struct fd_event *fde;
};
4015
4016 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4017 {
4018         if (h == h->ctdb->reload_ips) {
4019                 h->ctdb->reload_ips = NULL;
4020         }
4021         if (h->c != NULL) {
4022                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4023                 h->c = NULL;
4024         }
4025         ctdb_kill(h->ctdb, h->child, SIGKILL);
4026         return 0;
4027 }
4028
4029 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4030                                 struct timed_event *te,
4031                                 struct timeval t, void *private_data)
4032 {
4033         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4034
4035         talloc_free(h);
4036 }       
4037
4038 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4039                              uint16_t flags, void *private_data)
4040 {
4041         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4042
4043         char res;
4044         int ret;
4045
4046         ret = read(h->fd[0], &res, 1);
4047         if (ret < 1 || res != 0) {
4048                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4049                 res = 1;
4050         }
4051         h->status = res;
4052
4053         talloc_free(h);
4054 }
4055
4056 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4057 {
4058         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4059         struct ctdb_all_public_ips *ips;
4060         struct ctdb_vnn *vnn;
4061         int i, ret;
4062
4063         /* read the ip allocation from the local node */
4064         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4065         if (ret != 0) {
4066                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4067                 talloc_free(mem_ctx);
4068                 return -1;
4069         }
4070
4071         /* re-read the public ips file */
4072         ctdb->vnn = NULL;
4073         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4074                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4075                 talloc_free(mem_ctx);
4076                 return -1;
4077         }               
4078
4079
4080         /* check the previous list of ips and scan for ips that have been
4081            dropped.
4082          */
4083         for (i = 0; i < ips->num; i++) {
4084                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4085                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4086                                 break;
4087                         }
4088                 }
4089
4090                 /* we need to delete this ip, no longer available on this node */
4091                 if (vnn == NULL) {
4092                         struct ctdb_control_ip_iface pub;
4093
4094                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4095                         pub.addr  = ips->ips[i].addr;
4096                         pub.mask  = 0;
4097                         pub.len   = 0;
4098
4099                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4100                         if (ret != 0) {
4101                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4102                                 return -1;
4103                         }
4104                 }
4105         }
4106
4107
4108         /* loop over all new ones and check the ones we need to add */
4109         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4110                 for (i = 0; i < ips->num; i++) {
4111                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4112                                 break;
4113                         }
4114                 }
4115                 if (i == ips->num) {
4116                         struct ctdb_control_ip_iface pub;
4117                         const char *ifaces = NULL;
4118                         int iface = 0;
4119
4120                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4121
4122                         pub.addr  = vnn->public_address;
4123                         pub.mask  = vnn->public_netmask_bits;
4124
4125
4126                         ifaces = vnn->ifaces[0];
4127                         iface = 1;
4128                         while (vnn->ifaces[iface] != NULL) {
4129                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4130                                 iface++;
4131                         }
4132                         pub.len   = strlen(ifaces)+1;
4133                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4134
4135                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4136                         if (ret != 0) {
4137                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4138                                 return -1;
4139                         }
4140                 }
4141         }
4142
4143         return 0;
4144 }
4145
4146 /* This control is sent to force the node to re-read the public addresses file
4147    and drop any addresses we should nnot longer host, and add new addresses
4148    that we are now able to host
4149 */
4150 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4151 {
4152         struct ctdb_reloadips_handle *h;
4153         pid_t parent = getpid();
4154
4155         if (ctdb->reload_ips != NULL) {
4156                 talloc_free(ctdb->reload_ips);
4157                 ctdb->reload_ips = NULL;
4158         }
4159
4160         h = talloc(ctdb, struct ctdb_reloadips_handle);
4161         CTDB_NO_MEMORY(ctdb, h);
4162         h->ctdb     = ctdb;
4163         h->c        = NULL;
4164         h->status   = -1;
4165         
4166         if (pipe(h->fd) == -1) {
4167                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4168                 talloc_free(h);
4169                 return -1;
4170         }
4171
4172         h->child = ctdb_fork(ctdb);
4173         if (h->child == (pid_t)-1) {
4174                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4175                 close(h->fd[0]);
4176                 close(h->fd[1]);
4177                 talloc_free(h);
4178                 return -1;
4179         }
4180
4181         /* child process */
4182         if (h->child == 0) {
4183                 signed char res = 0;
4184
4185                 close(h->fd[0]);
4186                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4187
4188                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4189                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4190                         res = -1;
4191                 } else {
4192                         res = ctdb_reloadips_child(ctdb);
4193                         if (res != 0) {
4194                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4195                         }
4196                 }
4197
4198                 write(h->fd[1], &res, 1);
4199                 /* make sure we die when our parent dies */
4200                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4201                         sleep(5);
4202                 }
4203                 _exit(0);
4204         }
4205
4206         h->c             = talloc_steal(h, c);
4207
4208         close(h->fd[1]);
4209         set_close_on_exec(h->fd[0]);
4210
4211         talloc_set_destructor(h, ctdb_reloadips_destructor);
4212
4213
4214         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4215                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4216                         (void *)h);
4217         tevent_fd_set_auto_close(h->fde);
4218
4219         event_add_timed(ctdb->ev, h,
4220                         timeval_current_ofs(120, 0),
4221                         ctdb_reloadips_timeout_event, h);
4222
4223         /* we reply later */
4224         *async_reply = true;
4225         return 0;
4226 }