recoverd: Clear IP flags after IP allocation algorithm has run
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Absolute timeout built from the takeover_timeout tunable.  The
 * expansion reads a variable called `ctdb`, so one must be in scope
 * wherever this macro is used.
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument given to timeval_current_ofs() between ARP rounds. */
#define CTDB_ARP_INTERVAL 1
/* Number of ARP rounds sent for one announcement. */
#define CTDB_ARP_REPEAT   3
35
/* One local network interface, kept on the ctdb->ifaces list. */
struct ctdb_iface {
	/* Doubly-linked list pointers. */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* Interface name; lookups compare it with strcmp(). */
	const char *name;
	/* Interfaces whose link is not up are skipped during selection. */
	bool link_up;
	/* Number of VNNs whose ->iface currently points at this entry. */
	uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
278 struct ctdb_takeover_arp {
279         struct ctdb_context *ctdb;
280         uint32_t count;
281         ctdb_sock_addr addr;
282         struct ctdb_tcp_array *tcparray;
283         struct ctdb_vnn *vnn;
284 };
285
286
287 /*
288   lists of tcp endpoints
289  */
290 struct ctdb_tcp_list {
291         struct ctdb_tcp_list *prev, *next;
292         struct ctdb_tcp_connection connection;
293 };
294
295 /*
296   list of clients to kill on IP release
297  */
298 struct ctdb_client_ip {
299         struct ctdb_client_ip *prev, *next;
300         struct ctdb_context *ctdb;
301         ctdb_sock_addr addr;
302         uint32_t client_id;
303 };
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/* State kept while the "takeip" event script for a VNN is running. */
struct ctdb_do_takeip_state {
	/* Control request to reply to when the event finishes. */
	struct ctdb_req_control *c;
	/* VNN being taken over. */
	struct ctdb_vnn *vnn;
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/* State kept while the "updateip" event script for a VNN is running. */
struct ctdb_do_updateip_state {
	/* Control request to reply to when the event finishes. */
	struct ctdb_req_control *c;
	/* Interface the address is moving away from; restored on failure. */
	struct ctdb_iface *old;
	/* VNN being moved. */
	struct ctdb_vnn *vnn;
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Can the given node host the given IP: is the public IP known to the
1303  * node and is NOIPHOST unset?
1304 */
1305 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                              struct ctdb_node_map *nodemap,
1307                              struct ctdb_public_ip_list *ip)
1308 {
1309         struct ctdb_all_public_ips *public_ips;
1310         int i;
1311
1312         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPHOST) {
1313                 return false;
1314         }
1315
1316         public_ips = ctdb->nodes[pnn]->available_public_ips;
1317
1318         if (public_ips == NULL) {
1319                 return false;
1320         }
1321
1322         for (i=0;i<public_ips->num;i++) {
1323                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1324                         /* yes, this node can serve this public ip */
1325                         return true;
1326                 }
1327         }
1328
1329         return false;
1330 }
1331
1332 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1333                                  struct ctdb_node_map *nodemap,
1334                                  struct ctdb_public_ip_list *ip)
1335 {
1336         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPTAKEOVER) {
1337                 return false;
1338         }
1339
1340         return can_node_host_ip(ctdb, pnn, nodemap, ip);
1341 }
1342
1343 /* search the node lists list for a node to takeover this ip.
1344    pick the node that currently are serving the least number of ips
1345    so that the ips get spread out evenly.
1346 */
1347 static int find_takeover_node(struct ctdb_context *ctdb, 
1348                 struct ctdb_node_map *nodemap,
1349                 struct ctdb_public_ip_list *ip,
1350                 struct ctdb_public_ip_list *all_ips)
1351 {
1352         int pnn, min=0, num;
1353         int i;
1354
1355         pnn    = -1;
1356         for (i=0;i<nodemap->num;i++) {
1357                 /* verify that this node can serve this ip */
1358                 if (!can_node_takeover_ip(ctdb, i, nodemap, ip)) {
1359                         /* no it couldnt   so skip to the next node */
1360                         continue;
1361                 }
1362
1363                 num = node_ip_coverage(ctdb, i, all_ips);
1364                 /* was this the first node we checked ? */
1365                 if (pnn == -1) {
1366                         pnn = i;
1367                         min  = num;
1368                 } else {
1369                         if (num < min) {
1370                                 pnn = i;
1371                                 min  = num;
1372                         }
1373                 }
1374         }       
1375         if (pnn == -1) {
1376                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1377                         ctdb_addr_to_str(&ip->addr)));
1378
1379                 return -1;
1380         }
1381
1382         ip->pnn = pnn;
1383         return 0;
1384 }
1385
1386 #define IP_KEYLEN       4
1387 static uint32_t *ip_key(ctdb_sock_addr *ip)
1388 {
1389         static uint32_t key[IP_KEYLEN];
1390
1391         bzero(key, sizeof(key));
1392
1393         switch (ip->sa.sa_family) {
1394         case AF_INET:
1395                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1396                 break;
1397         case AF_INET6: {
1398                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1399                 key[0]  = htonl(s6_a32[0]);
1400                 key[1]  = htonl(s6_a32[1]);
1401                 key[2]  = htonl(s6_a32[2]);
1402                 key[3]  = htonl(s6_a32[3]);
1403                 break;
1404         }
1405         default:
1406                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1407                 return key;
1408         }
1409
1410         return key;
1411 }
1412
1413 static void *add_ip_callback(void *parm, void *data)
1414 {
1415         struct ctdb_public_ip_list *this_ip = parm; 
1416         struct ctdb_public_ip_list *prev_ip = data; 
1417
1418         if (prev_ip == NULL) {
1419                 return parm;
1420         }
1421         if (this_ip->pnn == -1) {
1422                 this_ip->pnn = prev_ip->pnn;
1423         }
1424
1425         return parm;
1426 }
1427
1428 static int getips_count_callback(void *param, void *data)
1429 {
1430         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1431         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1432
1433         new_ip->next = *ip_list;
1434         *ip_list     = new_ip;
1435         return 0;
1436 }
1437
1438 static struct ctdb_public_ip_list *
1439 create_merged_ip_list(struct ctdb_context *ctdb)
1440 {
1441         int i, j;
1442         struct ctdb_public_ip_list *ip_list;
1443         struct ctdb_all_public_ips *public_ips;
1444
1445         if (ctdb->ip_tree != NULL) {
1446                 talloc_free(ctdb->ip_tree);
1447                 ctdb->ip_tree = NULL;
1448         }
1449         ctdb->ip_tree = trbt_create(ctdb, 0);
1450
1451         for (i=0;i<ctdb->num_nodes;i++) {
1452                 public_ips = ctdb->nodes[i]->known_public_ips;
1453
1454                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1455                         continue;
1456                 }
1457
1458                 /* there were no public ips for this node */
1459                 if (public_ips == NULL) {
1460                         continue;
1461                 }               
1462
1463                 for (j=0;j<public_ips->num;j++) {
1464                         struct ctdb_public_ip_list *tmp_ip; 
1465
1466                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1467                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1468                         /* Do not use information about IP addresses hosted
1469                          * on other nodes, it may not be accurate */
1470                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1471                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1472                         } else {
1473                                 tmp_ip->pnn = -1;
1474                         }
1475                         tmp_ip->addr = public_ips->ips[j].addr;
1476                         tmp_ip->next = NULL;
1477
1478                         trbt_insertarray32_callback(ctdb->ip_tree,
1479                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1480                                 add_ip_callback,
1481                                 tmp_ip);
1482                 }
1483         }
1484
1485         ip_list = NULL;
1486         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1487
1488         return ip_list;
1489 }
1490
1491 /* 
1492  * This is the length of the longtest common prefix between the IPs.
1493  * It is calculated by XOR-ing the 2 IPs together and counting the
1494  * number of leading zeroes.  The implementation means that all
1495  * addresses end up being 128 bits long.
1496  *
1497  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1498  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1499  * lots of nodes and IP addresses?
1500  */
1501 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1502 {
1503         uint32_t ip1_k[IP_KEYLEN];
1504         uint32_t *t;
1505         int i;
1506         uint32_t x;
1507
1508         uint32_t distance = 0;
1509
1510         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1511         t = ip_key(ip2);
1512         for (i=0; i<IP_KEYLEN; i++) {
1513                 x = ip1_k[i] ^ t[i];
1514                 if (x == 0) {
1515                         distance += 32;
1516                 } else {
1517                         /* Count number of leading zeroes. 
1518                          * FIXME? This could be optimised...
1519                          */
1520                         while ((x & (1 << 31)) == 0) {
1521                                 x <<= 1;
1522                                 distance += 1;
1523                         }
1524                 }
1525         }
1526
1527         return distance;
1528 }
1529
1530 /* Calculate the IP distance for the given IP relative to IPs on the
1531    given node.  The ips argument is generally the all_ips variable
1532    used in the main part of the algorithm.
1533  */
1534 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1535                                   struct ctdb_public_ip_list *ips,
1536                                   int pnn)
1537 {
1538         struct ctdb_public_ip_list *t;
1539         uint32_t d;
1540
1541         uint32_t sum = 0;
1542
1543         for (t=ips; t != NULL; t=t->next) {
1544                 if (t->pnn != pnn) {
1545                         continue;
1546                 }
1547
1548                 /* Optimisation: We never calculate the distance
1549                  * between an address and itself.  This allows us to
1550                  * calculate the effect of removing an address from a
1551                  * node by simply calculating the distance between
1552                  * that address and all of the exitsing addresses.
1553                  * Moreover, we assume that we're only ever dealing
1554                  * with addresses from all_ips so we can identify an
1555                  * address via a pointer rather than doing a more
1556                  * expensive address comparison. */
1557                 if (&(t->addr) == ip) {
1558                         continue;
1559                 }
1560
1561                 d = ip_distance(ip, &(t->addr));
1562                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1563         }
1564
1565         return sum;
1566 }
1567
1568 /* Return the LCP2 imbalance metric for addresses currently assigned
1569    to the given node.
1570  */
1571 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1572 {
1573         struct ctdb_public_ip_list *t;
1574
1575         uint32_t imbalance = 0;
1576
1577         for (t=all_ips; t!=NULL; t=t->next) {
1578                 if (t->pnn != pnn) {
1579                         continue;
1580                 }
1581                 /* Pass the rest of the IPs rather than the whole
1582                    all_ips input list.
1583                 */
1584                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1585         }
1586
1587         return imbalance;
1588 }
1589
1590 /* Allocate any unassigned IPs just by looping through the IPs and
1591  * finding the best node for each.
1592  */
1593 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1594                                       struct ctdb_node_map *nodemap,
1595                                       struct ctdb_public_ip_list *all_ips)
1596 {
1597         struct ctdb_public_ip_list *tmp_ip;
1598
1599         /* loop over all ip's and find a physical node to cover for 
1600            each unassigned ip.
1601         */
1602         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1603                 if (tmp_ip->pnn == -1) {
1604                         if (find_takeover_node(ctdb, nodemap, tmp_ip, all_ips)) {
1605                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1606                                         ctdb_addr_to_str(&tmp_ip->addr)));
1607                         }
1608                 }
1609         }
1610 }
1611
1612 /* Basic non-deterministic rebalancing algorithm.
1613  */
1614 static void basic_failback(struct ctdb_context *ctdb,
1615                            struct ctdb_node_map *nodemap,
1616                            struct ctdb_public_ip_list *all_ips,
1617                            int num_ips)
1618 {
1619         int i;
1620         int maxnode, maxnum, minnode, minnum, num, retries;
1621         struct ctdb_public_ip_list *tmp_ip;
1622
1623         retries = 0;
1624
1625 try_again:
1626         maxnum=0;
1627         minnum=0;
1628
1629         /* for each ip address, loop over all nodes that can serve
1630            this ip and make sure that the difference between the node
1631            serving the most and the node serving the least ip's are
1632            not greater than 1.
1633         */
1634         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1635                 if (tmp_ip->pnn == -1) {
1636                         continue;
1637                 }
1638
1639                 /* Get the highest and lowest number of ips's served by any 
1640                    valid node which can serve this ip.
1641                 */
1642                 maxnode = -1;
1643                 minnode = -1;
1644                 for (i=0;i<nodemap->num;i++) {
1645                         /* only check nodes that can actually serve this ip */
1646                         if (!can_node_takeover_ip(ctdb, i, nodemap, tmp_ip)) {
1647                                 /* no it couldnt   so skip to the next node */
1648                                 continue;
1649                         }
1650
1651                         num = node_ip_coverage(ctdb, i, all_ips);
1652                         if (maxnode == -1) {
1653                                 maxnode = i;
1654                                 maxnum  = num;
1655                         } else {
1656                                 if (num > maxnum) {
1657                                         maxnode = i;
1658                                         maxnum  = num;
1659                                 }
1660                         }
1661                         if (minnode == -1) {
1662                                 minnode = i;
1663                                 minnum  = num;
1664                         } else {
1665                                 if (num < minnum) {
1666                                         minnode = i;
1667                                         minnum  = num;
1668                                 }
1669                         }
1670                 }
1671                 if (maxnode == -1) {
1672                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1673                                 ctdb_addr_to_str(&tmp_ip->addr)));
1674
1675                         continue;
1676                 }
1677
1678                 /* if the spread between the smallest and largest coverage by
1679                    a node is >=2 we steal one of the ips from the node with
1680                    most coverage to even things out a bit.
1681                    try to do this a limited number of times since we dont
1682                    want to spend too much time balancing the ip coverage.
1683                 */
1684                 if ( (maxnum > minnum+1)
1685                      && (retries < (num_ips + 5)) ){
1686                         struct ctdb_public_ip_list *tmp;
1687
1688                         /* Reassign one of maxnode's VNNs */
1689                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1690                                 if (tmp->pnn == maxnode) {
1691                                         (void)find_takeover_node(ctdb, nodemap, tmp, all_ips);
1692                                         retries++;
1693                                         goto try_again;;
1694                                 }
1695                         }
1696                 }
1697         }
1698 }
1699
1700 struct ctdb_rebalancenodes {
1701         struct ctdb_rebalancenodes *next;
1702         uint32_t pnn;
1703 };
1704 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1705
1706
1707 /* set this flag to force the node to be rebalanced even if it just didnt
1708    become healthy again.
1709 */
1710 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1711 {
1712         struct ctdb_rebalancenodes *rebalance;
1713
1714         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1715                 if (rebalance->pnn == pnn) {
1716                         return;
1717                 }
1718         }
1719
1720         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1721         rebalance->pnn = pnn;
1722         rebalance->next = force_rebalance_list;
1723         force_rebalance_list = rebalance;
1724 }
1725
1726 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1727  * that we can unit test it.
1728  */
1729 static void lcp2_init(struct ctdb_context * tmp_ctx,
1730                struct ctdb_node_map * nodemap,
1731                struct ctdb_public_ip_list *all_ips,
1732                uint32_t **lcp2_imbalances,
1733                bool **rebalance_candidates)
1734 {
1735         int i;
1736         struct ctdb_public_ip_list *tmp_ip;
1737
1738         *rebalance_candidates = talloc_array(tmp_ctx, bool, nodemap->num);
1739         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1740         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1741         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1742
1743         for (i=0;i<nodemap->num;i++) {
1744                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1745                 /* First step: assume all nodes are candidates */
1746                 (*rebalance_candidates)[i] = true;
1747         }
1748
1749         /* 2nd step: if a node has IPs assigned then it must have been
1750          * healthy before, so we remove it from consideration.  This
1751          * is overkill but is all we have because we don't maintain
1752          * state between takeover runs.  An alternative would be to
1753          * keep state and invalidate it every time the recovery master
1754          * changes.
1755          */
1756         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1757                 if (tmp_ip->pnn != -1) {
1758                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1759                 }
1760         }
1761
1762         /* 3rd step: if a node is forced to re-balance then
1763            we allow failback onto the node */
1764         while (force_rebalance_list != NULL) {
1765                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1766
1767                 if (force_rebalance_list->pnn <= nodemap->num) {
1768                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1769                 }
1770
1771                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1772                 talloc_free(force_rebalance_list);
1773                 force_rebalance_list = next;
1774         }
1775 }
1776
1777 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1778  * the IP/node combination that will cost the least.
1779  */
1780 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1781                                      struct ctdb_node_map *nodemap,
1782                                      struct ctdb_public_ip_list *all_ips,
1783                                      uint32_t *lcp2_imbalances)
1784 {
1785         struct ctdb_public_ip_list *tmp_ip;
1786         int dstnode;
1787
1788         int minnode;
1789         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1790         struct ctdb_public_ip_list *minip;
1791
1792         bool should_loop = true;
1793         bool have_unassigned = true;
1794
1795         while (have_unassigned && should_loop) {
1796                 should_loop = false;
1797
1798                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1799                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1800
1801                 minnode = -1;
1802                 mindsum = 0;
1803                 minip = NULL;
1804
1805                 /* loop over each unassigned ip. */
1806                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1807                         if (tmp_ip->pnn != -1) {
1808                                 continue;
1809                         }
1810
1811                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1812                                 /* only check nodes that can actually takeover this ip */
1813                                 if (!can_node_takeover_ip(ctdb, dstnode,
1814                                                           nodemap, tmp_ip)) {
1815                                         /* no it couldnt   so skip to the next node */
1816                                         continue;
1817                                 }
1818
1819                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1820                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1821                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1822                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1823                                                    dstnode,
1824                                                    dstimbl - lcp2_imbalances[dstnode]));
1825
1826
1827                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1828                                         minnode = dstnode;
1829                                         minimbl = dstimbl;
1830                                         mindsum = dstdsum;
1831                                         minip = tmp_ip;
1832                                         should_loop = true;
1833                                 }
1834                         }
1835                 }
1836
1837                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1838
1839                 /* If we found one then assign it to the given node. */
1840                 if (minnode != -1) {
1841                         minip->pnn = minnode;
1842                         lcp2_imbalances[minnode] = minimbl;
1843                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1844                                           ctdb_addr_to_str(&(minip->addr)),
1845                                           minnode,
1846                                           mindsum));
1847                 }
1848
1849                 /* There might be a better way but at least this is clear. */
1850                 have_unassigned = false;
1851                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1852                         if (tmp_ip->pnn == -1) {
1853                                 have_unassigned = true;
1854                         }
1855                 }
1856         }
1857
1858         /* We know if we have an unassigned addresses so we might as
1859          * well optimise.
1860          */
1861         if (have_unassigned) {
1862                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1863                         if (tmp_ip->pnn == -1) {
1864                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1865                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1866                         }
1867                 }
1868         }
1869 }
1870
1871 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1872  * to move IPs from, determines the best IP/destination node
1873  * combination to move from the source node.
1874  */
1875 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1876                                     struct ctdb_node_map *nodemap,
1877                                     struct ctdb_public_ip_list *all_ips,
1878                                     int srcnode,
1879                                     uint32_t candimbl,
1880                                     uint32_t *lcp2_imbalances,
1881                                     bool *rebalance_candidates)
1882 {
1883         int dstnode, mindstnode;
1884         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1885         uint32_t minsrcimbl, mindstimbl;
1886         struct ctdb_public_ip_list *minip;
1887         struct ctdb_public_ip_list *tmp_ip;
1888
1889         /* Find an IP and destination node that best reduces imbalance. */
1890         minip = NULL;
1891         minsrcimbl = 0;
1892         mindstnode = -1;
1893         mindstimbl = 0;
1894
1895         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1896         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1897
1898         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1899                 /* Only consider addresses on srcnode. */
1900                 if (tmp_ip->pnn != srcnode) {
1901                         continue;
1902                 }
1903
1904                 /* What is this IP address costing the source node? */
1905                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1906                 srcimbl = candimbl - srcdsum;
1907
1908                 /* Consider this IP address would cost each potential
1909                  * destination node.  Destination nodes are limited to
1910                  * those that are newly healthy, since we don't want
1911                  * to do gratuitous failover of IPs just to make minor
1912                  * balance improvements.
1913                  */
1914                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1915                         if (!rebalance_candidates[dstnode]) {
1916                                 continue;
1917                         }
1918
1919                         /* only check nodes that can actually takeover this ip */
1920                         if (!can_node_takeover_ip(ctdb, dstnode,
1921                                                   nodemap, tmp_ip)) {
1922                                 /* no it couldnt   so skip to the next node */
1923                                 continue;
1924                         }
1925
1926                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1927                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1928                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1929                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1930                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1931                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1932
1933                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1934                             ((mindstnode == -1) ||                              \
1935                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1936
1937                                 minip = tmp_ip;
1938                                 minsrcimbl = srcimbl;
1939                                 mindstnode = dstnode;
1940                                 mindstimbl = dstimbl;
1941                         }
1942                 }
1943         }
1944         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1945
1946         if (mindstnode != -1) {
1947                 /* We found a move that makes things better... */
1948                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1949                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1950                                   ctdb_addr_to_str(&(minip->addr)),
1951                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1952
1953
1954                 lcp2_imbalances[srcnode] = srcimbl;
1955                 lcp2_imbalances[mindstnode] = mindstimbl;
1956                 minip->pnn = mindstnode;
1957
1958                 return true;
1959         }
1960
1961         return false;
1962         
1963 }
1964
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator producing a descending order by imbalance:
 * returns -1 when a has the larger imbalance, 1 when b has, and 0
 * when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
	return (rhs->imbalance > lhs->imbalance) -
	       (rhs->imbalance < lhs->imbalance);
}
1983
1984 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1985  * node with the highest LCP2 imbalance, and then determines the best
1986  * IP/destination node combination to move from the source node.
1987  */
1988 static void lcp2_failback(struct ctdb_context *ctdb,
1989                           struct ctdb_node_map *nodemap,
1990                           struct ctdb_public_ip_list *all_ips,
1991                           uint32_t *lcp2_imbalances,
1992                           bool *rebalance_candidates)
1993 {
1994         int i, num_rebalance_candidates;
1995         struct lcp2_imbalance_pnn * lips;
1996         bool again;
1997
1998 try_again:
1999
2000         /* It is only worth continuing if we have suitable target
2001          * nodes to transfer IPs to.  This check is much cheaper than
2002          * continuing on...
2003          */
2004         num_rebalance_candidates = 0;
2005         for (i = 0; i < nodemap->num; i++) {
2006                 if (rebalance_candidates[i]) {
2007                         num_rebalance_candidates++;
2008                 }
2009         }
2010         if (num_rebalance_candidates == 0) {
2011                 return;
2012         }
2013
2014         /* Put the imbalances and nodes into an array, sort them and
2015          * iterate through candidates.  Usually the 1st one will be
2016          * used, so this doesn't cost much...
2017          */
2018         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2019         for (i = 0; i < nodemap->num; i++) {
2020                 lips[i].imbalance = lcp2_imbalances[i];
2021                 lips[i].pnn = i;
2022         }
2023         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2024               lcp2_cmp_imbalance_pnn);
2025
2026         again = false;
2027         for (i = 0; i < nodemap->num; i++) {
2028                 /* This means that all nodes had 0 or 1 addresses, so
2029                  * can't be imbalanced.
2030                  */
2031                 if (lips[i].imbalance == 0) {
2032                         break;
2033                 }
2034
2035                 if (lcp2_failback_candidate(ctdb,
2036                                             nodemap,
2037                                             all_ips,
2038                                             lips[i].pnn,
2039                                             lips[i].imbalance,
2040                                             lcp2_imbalances,
2041                                             rebalance_candidates)) {
2042                         again = true;
2043                         break;
2044                 }
2045         }
2046
2047         talloc_free(lips);
2048         if (again) {
2049                 goto try_again;
2050         }
2051 }
2052
2053 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2054                                     struct ctdb_node_map *nodemap,
2055                                     struct ctdb_public_ip_list *all_ips)
2056 {
2057         struct ctdb_public_ip_list *tmp_ip;
2058
2059         /* verify that the assigned nodes can serve that public ip
2060            and set it to -1 if not
2061         */
2062         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2063                 if (tmp_ip->pnn == -1) {
2064                         continue;
2065                 }
2066                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2067                                       nodemap, tmp_ip) != 0) {
2068                         /* this node can not serve this ip. */
2069                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2070                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2071                                            tmp_ip->pnn));
2072                         tmp_ip->pnn = -1;
2073                 }
2074         }
2075 }
2076
2077 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2078                                        struct ctdb_node_map *nodemap,
2079                                        struct ctdb_public_ip_list *all_ips)
2080 {
2081         struct ctdb_public_ip_list *tmp_ip;
2082         int i;
2083
2084         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2085        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2086         *  always be allocated the same way for a specific set of
2087         *  available/unavailable nodes.
2088         */
2089
2090         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2091                 tmp_ip->pnn = i%nodemap->num;
2092         }
2093
2094         /* IP failback doesn't make sense with deterministic
2095          * IPs, since the modulo step above implicitly fails
2096          * back IPs to their "home" node.
2097          */
2098         if (1 == ctdb->tunable.no_ip_failback) {
2099                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2100         }
2101
2102         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2103
2104         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2105
2106         /* No failback here! */
2107 }
2108
2109 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2110                                           struct ctdb_node_map *nodemap,
2111                                           struct ctdb_public_ip_list *all_ips)
2112 {
2113         /* This should be pushed down into basic_failback. */
2114         struct ctdb_public_ip_list *tmp_ip;
2115         int num_ips = 0;
2116         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2117                 num_ips++;
2118         }
2119
2120         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2121
2122         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2123
2124         /* If we don't want IPs to fail back then don't rebalance IPs. */
2125         if (1 == ctdb->tunable.no_ip_failback) {
2126                 return;
2127         }
2128
2129         /* Now, try to make sure the ip adresses are evenly distributed
2130            across the nodes.
2131         */
2132         basic_failback(ctdb, nodemap, all_ips, num_ips);
2133 }
2134
2135 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2136                           struct ctdb_node_map *nodemap,
2137                           struct ctdb_public_ip_list *all_ips)
2138 {
2139         uint32_t *lcp2_imbalances;
2140         bool *rebalance_candidates;
2141
2142         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2143
2144         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2145
2146         lcp2_init(tmp_ctx, nodemap, all_ips,
2147                   &lcp2_imbalances, &rebalance_candidates);
2148
2149         lcp2_allocate_unassigned(ctdb, nodemap, all_ips, lcp2_imbalances);
2150
2151         /* If we don't want IPs to fail back then don't rebalance IPs. */
2152         if (1 == ctdb->tunable.no_ip_failback) {
2153                 goto finished;
2154         }
2155
2156         /* Now, try to make sure the ip adresses are evenly distributed
2157            across the nodes.
2158         */
2159         lcp2_failback(ctdb, nodemap, all_ips,
2160                       lcp2_imbalances, rebalance_candidates);
2161
2162 finished:
2163         talloc_free(tmp_ctx);
2164 }
2165
2166 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2167 {
2168         int i, num_healthy;
2169
2170         /* Count how many completely healthy nodes we have */
2171         num_healthy = 0;
2172         for (i=0;i<nodemap->num;i++) {
2173                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2174                         num_healthy++;
2175                 }
2176         }
2177
2178         return num_healthy == 0;
2179 }
2180
2181 /* The calculation part of the IP allocation algorithm. */
2182 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2183                                    struct ctdb_node_map *nodemap,
2184                                    struct ctdb_public_ip_list **all_ips_p)
2185 {
2186         /* since nodes only know about those public addresses that
2187            can be served by that particular node, no single node has
2188            a full list of all public addresses that exist in the cluster.
2189            Walk over all node structures and create a merged list of
2190            all public addresses that exist in the cluster.
2191
2192            keep the tree of ips around as ctdb->ip_tree
2193         */
2194         *all_ips_p = create_merged_ip_list(ctdb);
2195
2196         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2197                 ip_alloc_lcp2(ctdb, nodemap, *all_ips_p);
2198         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2199                 ip_alloc_deterministic_ips(ctdb, nodemap, *all_ips_p);
2200         } else {
2201                 ip_alloc_nondeterministic_ips(ctdb, nodemap, *all_ips_p);
2202         }
2203
2204         /* at this point ->pnn is the node which will own each IP
2205            or -1 if there is no node that can cover this ip
2206         */
2207
2208         return;
2209 }
2210
2211 struct get_tunable_callback_data {
2212         const char *tunable;
2213         uint32_t *out;
2214 };
2215
2216 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2217                                  int32_t res, TDB_DATA outdata,
2218                                  void *callback)
2219 {
2220         struct get_tunable_callback_data *cd =
2221                 (struct get_tunable_callback_data *)callback;
2222         int size;
2223
2224         if (res != 0) {
2225                 DEBUG(DEBUG_ERR,
2226                       ("Failure to read \"%s\" tunable from remote node %d\n",
2227                        cd->tunable, pnn));
2228                 return;
2229         }
2230
2231         if (outdata.dsize != sizeof(uint32_t)) {
2232                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2233                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2234                                  (int)outdata.dsize));
2235                 return;
2236         }
2237
2238         size = talloc_get_size(cd->out) / sizeof(uint32_t);
2239         if (pnn >= size) {
2240                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2241                                  cd->tunable, pnn, size));
2242                 return;
2243         }
2244
2245                 
2246         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2247 }
2248
2249 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2250                                         TALLOC_CTX *tmp_ctx,
2251                                         struct ctdb_node_map *nodemap,
2252                                         const char *tunable)
2253 {
2254         TDB_DATA data;
2255         struct ctdb_control_get_tunable *t;
2256         uint32_t *nodes;
2257         uint32_t *tvals;
2258         struct get_tunable_callback_data callback_data;
2259
2260         tvals = talloc_zero_array(tmp_ctx, uint32_t, nodemap->num);
2261         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2262         callback_data.out = tvals;
2263         callback_data.tunable = tunable;
2264
2265         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2266         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2267         t = (struct ctdb_control_get_tunable *)data.dptr;
2268         t->length = strlen(tunable)+1;
2269         memcpy(t->name, tunable, t->length);
2270         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2271         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2272                                       nodes, 0, TAKEOVER_TIMEOUT(),
2273                                       false, data,
2274                                       get_tunable_callback, NULL,
2275                                       &callback_data) != 0) {
2276                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get %s tunable failed\n", tunable));
2277         }
2278         talloc_free(nodes);
2279         talloc_free(data.dptr);
2280
2281         return tvals;
2282 }
2283
2284 static void clear_ipflags(struct ctdb_node_map *nodemap)
2285 {
2286         int i;
2287
2288         for (i=0;i<nodemap->num;i++) {
2289                 nodemap->nodes[i].flags &=
2290                         ~(NODE_FLAGS_NOIPTAKEOVER|NODE_FLAGS_NOIPHOST);
2291         }
2292 }
2293
2294
2295 /* Set internal flags for IP allocation:
2296  *   Clear ip flags
2297  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2298  *   Set NOIPHOST ip flag for each INACTIVE node
2299  *   if all nodes are disabled:
2300  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2301  *   else
2302  *     Set NOIPHOST ip flags for disabled nodes
2303  */
2304 static void set_ipflags_internal(struct ctdb_node_map *nodemap,
2305                                  uint32_t *tval_noiptakeover,
2306                                  uint32_t *tval_noiphostonalldisabled)
2307 {
2308         int i;
2309
2310         clear_ipflags(nodemap);
2311
2312         for (i=0;i<nodemap->num;i++) {
2313                 /* Can not take IPs on node with NoIPTakeover set */
2314                 if (tval_noiptakeover[i] != 0) {
2315                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPTAKEOVER;
2316                 }
2317
2318                 /* Can not host IPs on INACTIVE node */
2319                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2320                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2321                 }
2322         }
2323
2324         if (all_nodes_are_disabled(nodemap)) {
2325                 /* If all nodes are disabled, can not host IPs on node
2326                  * with NoIPHostOnAllDisabled set
2327                  */
2328                 for (i=0;i<nodemap->num;i++) {
2329                         if (tval_noiphostonalldisabled[i] != 0) {
2330                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2331                         }
2332                 }
2333         } else {
2334                 /* If some nodes are not disabled, then can not host
2335                  * IPs on DISABLED node
2336                  */
2337                 for (i=0;i<nodemap->num;i++) {
2338                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2339                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2340                         }
2341                 }
2342         }
2343 }
2344
2345 static bool set_ipflags(struct ctdb_context *ctdb,
2346                         TALLOC_CTX *tmp_ctx,
2347                         struct ctdb_node_map *nodemap)
2348 {
2349         uint32_t *tval_noiptakeover;
2350         uint32_t *tval_noiphostonalldisabled;
2351
2352         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2353                                                    "NoIPTakeover");
2354         if (tval_noiptakeover == NULL) {
2355                 return false;
2356         }
2357
2358         tval_noiphostonalldisabled =
2359                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2360                                        "NoIPHostOnAllDisabled");
2361         if (tval_noiphostonalldisabled == NULL) {
2362                 return false;
2363         }
2364
2365         set_ipflags_internal(nodemap,
2366                              tval_noiptakeover, tval_noiphostonalldisabled);
2367
2368         talloc_free(tval_noiptakeover);
2369         talloc_free(tval_noiphostonalldisabled);
2370
2371         return true;
2372 }
2373
2374 /*
2375   make any IP alias changes for public addresses that are necessary 
2376  */
2377 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2378                       client_async_callback fail_callback, void *callback_data)
2379 {
2380         int i;
2381         struct ctdb_public_ip ip;
2382         struct ctdb_public_ipv4 ipv4;
2383         uint32_t *nodes;
2384         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2385         TDB_DATA data;
2386         struct timeval timeout;
2387         struct client_async_data *async_data;
2388         struct ctdb_client_control_state *state;
2389         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2390         uint32_t disable_timeout;
2391
2392         /*
2393          * ip failover is completely disabled, just send out the 
2394          * ipreallocated event.
2395          */
2396         if (ctdb->tunable.disable_ip_failover != 0) {
2397                 goto ipreallocated;
2398         }
2399
2400         if (!set_ipflags(ctdb, tmp_ctx, nodemap)) {
2401                 DEBUG(DEBUG_ERR,("Failed to set IP flags from tunables\n"));
2402                 return -1;
2403         }
2404
2405         ZERO_STRUCT(ip);
2406
2407         /* Do the IP reassignment calculations */
2408         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2409
2410         /* The IP flags need to be cleared because they should never
2411          * be seen outside the IP allocation code.
2412          */
2413         clear_ipflags(nodemap);
2414
2415         /* The recovery daemon does regular sanity checks of the IPs.
2416          * However, sometimes it is overzealous and thinks changes are
2417          * required when they're already underway.  This stops the
2418          * checks for a while before we start moving IPs.
2419          */
2420         disable_timeout = ctdb->tunable.takeover_timeout;
2421         data.dptr  = (uint8_t*)&disable_timeout;
2422         data.dsize = sizeof(disable_timeout);
2423         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2424                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2425                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2426         }
2427
2428         /* now tell all nodes to delete any alias that they should not
2429            have.  This will be a NOOP on nodes that don't currently
2430            hold the given alias */
2431         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2432         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2433
2434         async_data->fail_callback = fail_callback;
2435         async_data->callback_data = callback_data;
2436
2437         for (i=0;i<nodemap->num;i++) {
2438                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2439                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2440                         continue;
2441                 }
2442
2443                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2444                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2445                                 /* This node should be serving this
2446                                    vnn so dont tell it to release the ip
2447                                 */
2448                                 continue;
2449                         }
2450                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2451                                 ipv4.pnn = tmp_ip->pnn;
2452                                 ipv4.sin = tmp_ip->addr.ip;
2453
2454                                 timeout = TAKEOVER_TIMEOUT();
2455                                 data.dsize = sizeof(ipv4);
2456                                 data.dptr  = (uint8_t *)&ipv4;
2457                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2458                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2459                                                 data, async_data,
2460                                                 &timeout, NULL);
2461                         } else {
2462                                 ip.pnn  = tmp_ip->pnn;
2463                                 ip.addr = tmp_ip->addr;
2464
2465                                 timeout = TAKEOVER_TIMEOUT();
2466                                 data.dsize = sizeof(ip);
2467                                 data.dptr  = (uint8_t *)&ip;
2468                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2469                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2470                                                 data, async_data,
2471                                                 &timeout, NULL);
2472                         }
2473
2474                         if (state == NULL) {
2475                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2476                                 talloc_free(tmp_ctx);
2477                                 return -1;
2478                         }
2479                 
2480                         ctdb_client_async_add(async_data, state);
2481                 }
2482         }
2483         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2484                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2485                 talloc_free(tmp_ctx);
2486                 return -1;
2487         }
2488         talloc_free(async_data);
2489
2490
2491         /* tell all nodes to get their own IPs */
2492         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2493         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2494
2495         async_data->fail_callback = fail_callback;
2496         async_data->callback_data = callback_data;
2497
2498         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2499                 if (tmp_ip->pnn == -1) {
2500                         /* this IP won't be taken over */
2501                         continue;
2502                 }
2503
2504                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2505                         ipv4.pnn = tmp_ip->pnn;
2506                         ipv4.sin = tmp_ip->addr.ip;
2507
2508                         timeout = TAKEOVER_TIMEOUT();
2509                         data.dsize = sizeof(ipv4);
2510                         data.dptr  = (uint8_t *)&ipv4;
2511                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2512                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2513                                         data, async_data,
2514                                         &timeout, NULL);
2515                 } else {
2516                         ip.pnn  = tmp_ip->pnn;
2517                         ip.addr = tmp_ip->addr;
2518
2519                         timeout = TAKEOVER_TIMEOUT();
2520                         data.dsize = sizeof(ip);
2521                         data.dptr  = (uint8_t *)&ip;
2522                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2523                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2524                                         data, async_data,
2525                                         &timeout, NULL);
2526                 }
2527                 if (state == NULL) {
2528                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2529                         talloc_free(tmp_ctx);
2530                         return -1;
2531                 }
2532                 
2533                 ctdb_client_async_add(async_data, state);
2534         }
2535         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2536                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2537                 talloc_free(tmp_ctx);
2538                 return -1;
2539         }
2540
2541 ipreallocated:
2542         /* 
2543          * Tell all nodes to run eventscripts to process the
2544          * "ipreallocated" event.  This can do a lot of things,
2545          * including restarting services to reconfigure them if public
2546          * IPs have moved.  Once upon a time this event only used to
2547          * update natwg.
2548          */
2549         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2550         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2551                                       nodes, 0, TAKEOVER_TIMEOUT(),
2552                                       false, tdb_null,
2553                                       NULL, fail_callback,
2554                                       callback_data) != 0) {
2555                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2556         }
2557
2558         talloc_free(tmp_ctx);
2559         return 0;
2560 }
2561
2562
2563 /*
2564   destroy a ctdb_client_ip structure
2565  */
2566 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2567 {
2568         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2569                 ctdb_addr_to_str(&ip->addr),
2570                 ntohs(ip->addr.ip.sin_port),
2571                 ip->client_id));
2572
2573         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2574         return 0;
2575 }
2576
2577 /*
2578   called by a client to inform us of a TCP connection that it is managing
2579   that should tickled with an ACK when IP takeover is done
2580   we handle both the old ipv4 style of packets as well as the new ipv4/6
2581   pdus.
2582  */
2583 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2584                                 TDB_DATA indata)
2585 {
2586         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2587         struct ctdb_control_tcp *old_addr = NULL;
2588         struct ctdb_control_tcp_addr new_addr;
2589         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2590         struct ctdb_tcp_list *tcp;
2591         struct ctdb_tcp_connection t;
2592         int ret;
2593         TDB_DATA data;
2594         struct ctdb_client_ip *ip;
2595         struct ctdb_vnn *vnn;
2596         ctdb_sock_addr addr;
2597
2598         switch (indata.dsize) {
2599         case sizeof(struct ctdb_control_tcp):
2600                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2601                 ZERO_STRUCT(new_addr);
2602                 tcp_sock = &new_addr;
2603                 tcp_sock->src.ip  = old_addr->src;
2604                 tcp_sock->dest.ip = old_addr->dest;
2605                 break;
2606         case sizeof(struct ctdb_control_tcp_addr):
2607                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2608                 break;
2609         default:
2610                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2611                                  "to ctdb_control_tcp_client. size was %d but "
2612                                  "only allowed sizes are %lu and %lu\n",
2613                                  (int)indata.dsize,
2614                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2615                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2616                 return -1;
2617         }
2618
2619         addr = tcp_sock->src;
2620         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2621         addr = tcp_sock->dest;
2622         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2623
2624         ZERO_STRUCT(addr);
2625         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2626         vnn = find_public_ip_vnn(ctdb, &addr);
2627         if (vnn == NULL) {
2628                 switch (addr.sa.sa_family) {
2629                 case AF_INET:
2630                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2631                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2632                                         ctdb_addr_to_str(&addr)));
2633                         }
2634                         break;
2635                 case AF_INET6:
2636                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2637                                 ctdb_addr_to_str(&addr)));
2638                         break;
2639                 default:
2640                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2641                 }
2642
2643                 return 0;
2644         }
2645
2646         if (vnn->pnn != ctdb->pnn) {
2647                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2648                         ctdb_addr_to_str(&addr),
2649                         client_id, client->pid));
2650                 /* failing this call will tell smbd to die */
2651                 return -1;
2652         }
2653
2654         ip = talloc(client, struct ctdb_client_ip);
2655         CTDB_NO_MEMORY(ctdb, ip);
2656
2657         ip->ctdb      = ctdb;
2658         ip->addr      = addr;
2659         ip->client_id = client_id;
2660         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2661         DLIST_ADD(ctdb->client_ip_list, ip);
2662
2663         tcp = talloc(client, struct ctdb_tcp_list);
2664         CTDB_NO_MEMORY(ctdb, tcp);
2665
2666         tcp->connection.src_addr = tcp_sock->src;
2667         tcp->connection.dst_addr = tcp_sock->dest;
2668
2669         DLIST_ADD(client->tcp_list, tcp);
2670
2671         t.src_addr = tcp_sock->src;
2672         t.dst_addr = tcp_sock->dest;
2673
2674         data.dptr = (uint8_t *)&t;
2675         data.dsize = sizeof(t);
2676
2677         switch (addr.sa.sa_family) {
2678         case AF_INET:
2679                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2680                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2681                         ctdb_addr_to_str(&tcp_sock->src),
2682                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2683                 break;
2684         case AF_INET6:
2685                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2686                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2687                         ctdb_addr_to_str(&tcp_sock->src),
2688                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2689                 break;
2690         default:
2691                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2692         }
2693
2694
2695         /* tell all nodes about this tcp connection */
2696         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2697                                        CTDB_CONTROL_TCP_ADD,
2698                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2699         if (ret != 0) {
2700                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2701                 return -1;
2702         }
2703
2704         return 0;
2705 }
2706
2707 /*
2708   find a tcp address on a list
2709  */
2710 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2711                                            struct ctdb_tcp_connection *tcp)
2712 {
2713         int i;
2714
2715         if (array == NULL) {
2716                 return NULL;
2717         }
2718
2719         for (i=0;i<array->num;i++) {
2720                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2721                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2722                         return &array->connections[i];
2723                 }
2724         }
2725         return NULL;
2726 }
2727
2728
2729
2730 /*
2731   called by a daemon to inform us of a TCP connection that one of its
2732   clients managing that should tickled with an ACK when IP takeover is
2733   done
2734  */
2735 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2736 {
2737         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2738         struct ctdb_tcp_array *tcparray;
2739         struct ctdb_tcp_connection tcp;
2740         struct ctdb_vnn *vnn;
2741
2742         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2743         if (vnn == NULL) {
2744                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2745                         ctdb_addr_to_str(&p->dst_addr)));
2746
2747                 return -1;
2748         }
2749
2750
2751         tcparray = vnn->tcp_array;
2752
2753         /* If this is the first tickle */
2754         if (tcparray == NULL) {
2755                 tcparray = talloc_size(ctdb->nodes, 
2756                         offsetof(struct ctdb_tcp_array, connections) +
2757                         sizeof(struct ctdb_tcp_connection) * 1);
2758                 CTDB_NO_MEMORY(ctdb, tcparray);
2759                 vnn->tcp_array = tcparray;
2760
2761                 tcparray->num = 0;
2762                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2763                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2764
2765                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2766                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2767                 tcparray->num++;
2768
2769                 if (tcp_update_needed) {
2770                         vnn->tcp_update_needed = true;
2771                 }
2772                 return 0;
2773         }
2774
2775
2776         /* Do we already have this tickle ?*/
2777         tcp.src_addr = p->src_addr;
2778         tcp.dst_addr = p->dst_addr;
2779         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2780                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2781                         ctdb_addr_to_str(&tcp.dst_addr),
2782                         ntohs(tcp.dst_addr.ip.sin_port),
2783                         vnn->pnn));
2784                 return 0;
2785         }
2786
2787         /* A new tickle, we must add it to the array */
2788         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2789                                         struct ctdb_tcp_connection,
2790                                         tcparray->num+1);
2791         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2792
2793         vnn->tcp_array = tcparray;
2794         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2795         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2796         tcparray->num++;
2797                                 
2798         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2799                 ctdb_addr_to_str(&tcp.dst_addr),
2800                 ntohs(tcp.dst_addr.ip.sin_port),
2801                 vnn->pnn));
2802
2803         if (tcp_update_needed) {
2804                 vnn->tcp_update_needed = true;
2805         }
2806
2807         return 0;
2808 }
2809
2810
2811 /*
2812   called by a daemon to inform us of a TCP connection that one of its
2813   clients managing that should tickled with an ACK when IP takeover is
2814   done
2815  */
2816 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2817 {
2818         struct ctdb_tcp_connection *tcpp;
2819         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2820
2821         if (vnn == NULL) {
2822                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2823                         ctdb_addr_to_str(&conn->dst_addr)));
2824                 return;
2825         }
2826
2827         /* if the array is empty we cant remove it
2828            and we dont need to do anything
2829          */
2830         if (vnn->tcp_array == NULL) {
2831                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2832                         ctdb_addr_to_str(&conn->dst_addr),
2833                         ntohs(conn->dst_addr.ip.sin_port)));
2834                 return;
2835         }
2836
2837
2838         /* See if we know this connection
2839            if we dont know this connection  then we dont need to do anything
2840          */
2841         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2842         if (tcpp == NULL) {
2843                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2844                         ctdb_addr_to_str(&conn->dst_addr),
2845                         ntohs(conn->dst_addr.ip.sin_port)));
2846                 return;
2847         }
2848
2849
2850         /* We need to remove this entry from the array.
2851            Instead of allocating a new array and copying data to it
2852            we cheat and just copy the last entry in the existing array
2853            to the entry that is to be removed and just shring the 
2854            ->num field
2855          */
2856         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2857         vnn->tcp_array->num--;
2858
2859         /* If we deleted the last entry we also need to remove the entire array
2860          */
2861         if (vnn->tcp_array->num == 0) {
2862                 talloc_free(vnn->tcp_array);
2863                 vnn->tcp_array = NULL;
2864         }               
2865
2866         vnn->tcp_update_needed = true;
2867
2868         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2869                 ctdb_addr_to_str(&conn->src_addr),
2870                 ntohs(conn->src_addr.ip.sin_port)));
2871 }
2872
2873
2874 /*
2875   called by a daemon to inform us of a TCP connection that one of its
2876   clients used are no longer needed in the tickle database
2877  */
2878 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2879 {
2880         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2881
2882         ctdb_remove_tcp_connection(ctdb, conn);
2883
2884         return 0;
2885 }
2886
2887
/*
  called when a daemon restarts - the intent is to send all tickles
  for all public addresses we are serving to the new node right away.
  This is currently a stub that does nothing and returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2897
2898
2899 /*
2900   called when a client structure goes away - hook to remove
2901   elements from the tcp_list in all daemons
2902  */
2903 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2904 {
2905         while (client->tcp_list) {
2906                 struct ctdb_tcp_list *tcp = client->tcp_list;
2907                 DLIST_REMOVE(client->tcp_list, tcp);
2908                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2909         }
2910 }
2911
2912
2913 /*
2914   release all IPs on shutdown
2915  */
2916 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2917 {
2918         struct ctdb_vnn *vnn;
2919
2920         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2921                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2922                         ctdb_vnn_unassign_iface(ctdb, vnn);
2923                         continue;
2924                 }
2925                 if (!vnn->iface) {
2926                         continue;
2927                 }
2928                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2929                                   ctdb_vnn_iface_string(vnn),
2930                                   ctdb_addr_to_str(&vnn->public_address),
2931                                   vnn->public_netmask_bits);
2932                 release_kill_clients(ctdb, &vnn->public_address);
2933                 ctdb_vnn_unassign_iface(ctdb, vnn);
2934         }
2935 }
2936
2937
2938 /*
2939   get list of public IPs
2940  */
2941 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2942                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2943 {
2944         int i, num, len;
2945         struct ctdb_all_public_ips *ips;
2946         struct ctdb_vnn *vnn;
2947         bool only_available = false;
2948
2949         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2950                 only_available = true;
2951         }
2952
2953         /* count how many public ip structures we have */
2954         num = 0;
2955         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2956                 num++;
2957         }
2958
2959         len = offsetof(struct ctdb_all_public_ips, ips) + 
2960                 num*sizeof(struct ctdb_public_ip);
2961         ips = talloc_zero_size(outdata, len);
2962         CTDB_NO_MEMORY(ctdb, ips);
2963
2964         i = 0;
2965         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2966                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2967                         continue;
2968                 }
2969                 ips->ips[i].pnn  = vnn->pnn;
2970                 ips->ips[i].addr = vnn->public_address;
2971                 i++;
2972         }
2973         ips->num = i;
2974         len = offsetof(struct ctdb_all_public_ips, ips) +
2975                 i*sizeof(struct ctdb_public_ip);
2976
2977         outdata->dsize = len;
2978         outdata->dptr  = (uint8_t *)ips;
2979
2980         return 0;
2981 }
2982
2983
2984 /*
2985   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2986  */
2987 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2988                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2989 {
2990         int i, num, len;
2991         struct ctdb_all_public_ipsv4 *ips;
2992         struct ctdb_vnn *vnn;
2993
2994         /* count how many public ip structures we have */
2995         num = 0;
2996         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2997                 if (vnn->public_address.sa.sa_family != AF_INET) {
2998                         continue;
2999                 }
3000                 num++;
3001         }
3002
3003         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3004                 num*sizeof(struct ctdb_public_ipv4);
3005         ips = talloc_zero_size(outdata, len);
3006         CTDB_NO_MEMORY(ctdb, ips);
3007
3008         outdata->dsize = len;
3009         outdata->dptr  = (uint8_t *)ips;
3010
3011         ips->num = num;
3012         i = 0;
3013         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3014                 if (vnn->public_address.sa.sa_family != AF_INET) {
3015                         continue;
3016                 }
3017                 ips->ips[i].pnn = vnn->pnn;
3018                 ips->ips[i].sin = vnn->public_address.ip;
3019                 i++;
3020         }
3021
3022         return 0;
3023 }
3024
3025 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3026                                         struct ctdb_req_control *c,
3027                                         TDB_DATA indata,
3028                                         TDB_DATA *outdata)
3029 {
3030         int i, num, len;
3031         ctdb_sock_addr *addr;
3032         struct ctdb_control_public_ip_info *info;
3033         struct ctdb_vnn *vnn;
3034
3035         addr = (ctdb_sock_addr *)indata.dptr;
3036
3037         vnn = find_public_ip_vnn(ctdb, addr);
3038         if (vnn == NULL) {
3039                 /* if it is not a public ip   it could be our 'single ip' */
3040                 if (ctdb->single_ip_vnn) {
3041                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3042                                 vnn = ctdb->single_ip_vnn;
3043                         }
3044                 }
3045         }
3046         if (vnn == NULL) {
3047                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3048                                  "'%s'not a public address\n",
3049                                  ctdb_addr_to_str(addr)));
3050                 return -1;
3051         }
3052
3053         /* count how many public ip structures we have */
3054         num = 0;
3055         for (;vnn->ifaces[num];) {
3056                 num++;
3057         }
3058
3059         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3060                 num*sizeof(struct ctdb_control_iface_info);
3061         info = talloc_zero_size(outdata, len);
3062         CTDB_NO_MEMORY(ctdb, info);
3063
3064         info->ip.addr = vnn->public_address;
3065         info->ip.pnn = vnn->pnn;
3066         info->active_idx = 0xFFFFFFFF;
3067
3068         for (i=0; vnn->ifaces[i]; i++) {
3069                 struct ctdb_iface *cur;
3070
3071                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3072                 if (cur == NULL) {
3073                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3074                                            vnn->ifaces[i]));
3075                         return -1;
3076                 }
3077                 if (vnn->iface == cur) {
3078                         info->active_idx = i;
3079                 }
3080                 strcpy(info->ifaces[i].name, cur->name);
3081                 info->ifaces[i].link_state = cur->link_up;
3082                 info->ifaces[i].references = cur->references;
3083         }
3084         info->num = i;
3085         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3086                 i*sizeof(struct ctdb_control_iface_info);
3087
3088         outdata->dsize = len;
3089         outdata->dptr  = (uint8_t *)info;
3090
3091         return 0;
3092 }
3093
3094 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3095                                 struct ctdb_req_control *c,
3096                                 TDB_DATA *outdata)
3097 {
3098         int i, num, len;
3099         struct ctdb_control_get_ifaces *ifaces;
3100         struct ctdb_iface *cur;
3101
3102         /* count how many public ip structures we have */
3103         num = 0;
3104         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3105                 num++;
3106         }
3107
3108         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3109                 num*sizeof(struct ctdb_control_iface_info);
3110         ifaces = talloc_zero_size(outdata, len);
3111         CTDB_NO_MEMORY(ctdb, ifaces);
3112
3113         i = 0;
3114         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3115                 strcpy(ifaces->ifaces[i].name, cur->name);
3116                 ifaces->ifaces[i].link_state = cur->link_up;
3117                 ifaces->ifaces[i].references = cur->references;
3118                 i++;
3119         }
3120         ifaces->num = i;
3121         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3122                 i*sizeof(struct ctdb_control_iface_info);
3123
3124         outdata->dsize = len;
3125         outdata->dptr  = (uint8_t *)ifaces;
3126
3127         return 0;
3128 }
3129
3130 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3131                                     struct ctdb_req_control *c,
3132                                     TDB_DATA indata)
3133 {
3134         struct ctdb_control_iface_info *info;
3135         struct ctdb_iface *iface;
3136         bool link_up = false;
3137
3138         info = (struct ctdb_control_iface_info *)indata.dptr;
3139
3140         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3141                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3142                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3143                                   len, len, info->name));
3144                 return -1;
3145         }
3146
3147         switch (info->link_state) {
3148         case 0:
3149                 link_up = false;
3150                 break;
3151         case 1:
3152                 link_up = true;
3153                 break;
3154         default:
3155                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3156                                   (unsigned int)info->link_state));
3157                 return -1;
3158         }
3159
3160         if (info->references != 0) {
3161                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3162                                   (unsigned int)info->references));
3163                 return -1;
3164         }
3165
3166         iface = ctdb_find_iface(ctdb, info->name);
3167         if (iface == NULL) {
3168                 return -1;
3169         }
3170
3171         if (link_up == iface->link_up) {
3172                 return 0;
3173         }
3174
3175         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3176               ("iface[%s] has changed it's link status %s => %s\n",
3177                iface->name,
3178                iface->link_up?"up":"down",
3179                link_up?"up":"down"));
3180
3181         iface->link_up = link_up;
3182         return 0;
3183 }
3184
3185
3186 /* 
3187    structure containing the listening socket and the list of tcp connections
3188    that the ctdb daemon is to kill
3189 */
3190 struct ctdb_kill_tcp {
3191         struct ctdb_vnn *vnn;
3192         struct ctdb_context *ctdb;
3193         int capture_fd;
3194         struct fd_event *fde;
3195         trbt_tree_t *connections;
3196         void *private_data;
3197 };
3198
3199 /*
3200   a tcp connection that is to be killed
3201  */
3202 struct ctdb_killtcp_con {
3203         ctdb_sock_addr src_addr;
3204         ctdb_sock_addr dst_addr;
3205         int count;
3206         struct ctdb_kill_tcp *killtcp;
3207 };
3208
3209 /* this function is used to create a key to represent this socketpair
3210    in the killtcp tree.
3211    this key is used to insert and lookup matching socketpairs that are
3212    to be tickled and RST
3213 */
3214 #define KILLTCP_KEYLEN  10
3215 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3216 {
3217         static uint32_t key[KILLTCP_KEYLEN];
3218
3219         bzero(key, sizeof(key));
3220
3221         if (src->sa.sa_family != dst->sa.sa_family) {
3222                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3223                 return key;
3224         }
3225         
3226         switch (src->sa.sa_family) {
3227         case AF_INET:
3228                 key[0]  = dst->ip.sin_addr.s_addr;
3229                 key[1]  = src->ip.sin_addr.s_addr;
3230                 key[2]  = dst->ip.sin_port;
3231                 key[3]  = src->ip.sin_port;
3232                 break;
3233         case AF_INET6: {
3234                 uint32_t *dst6_addr32 =
3235                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3236                 uint32_t *src6_addr32 =
3237                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3238                 key[0]  = dst6_addr32[3];
3239                 key[1]  = src6_addr32[3];
3240                 key[2]  = dst6_addr32[2];
3241                 key[3]  = src6_addr32[2];
3242                 key[4]  = dst6_addr32[1];
3243                 key[5]  = src6_addr32[1];
3244                 key[6]  = dst6_addr32[0];
3245                 key[7]  = src6_addr32[0];
3246                 key[8]  = dst->ip6.sin6_port;
3247                 key[9]  = src->ip6.sin6_port;
3248                 break;
3249         }
3250         default:
3251                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3252                 return key;
3253         }
3254
3255         return key;
3256 }
3257
3258 /*
3259   called when we get a read event on the raw socket
3260  */
3261 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3262                                 uint16_t flags, void *private_data)
3263 {
3264         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3265         struct ctdb_killtcp_con *con;
3266         ctdb_sock_addr src, dst;
3267         uint32_t ack_seq, seq;
3268
3269         if (!(flags & EVENT_FD_READ)) {
3270                 return;
3271         }
3272
3273         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3274                                 killtcp->private_data,
3275                                 &src, &dst,
3276                                 &ack_seq, &seq) != 0) {
3277                 /* probably a non-tcp ACK packet */
3278                 return;
3279         }
3280
3281         /* check if we have this guy in our list of connections
3282            to kill
3283         */
3284         con = trbt_lookuparray32(killtcp->connections, 
3285                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3286         if (con == NULL) {
3287                 /* no this was some other packet we can just ignore */
3288                 return;
3289         }
3290
3291         /* This one has been tickled !
3292            now reset him and remove him from the list.
3293          */
3294         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3295                 ntohs(con->dst_addr.ip.sin_port),
3296                 ctdb_addr_to_str(&con->src_addr),
3297                 ntohs(con->src_addr.ip.sin_port)));
3298
3299         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3300         talloc_free(con);
3301 }
3302
3303
3304 /* when traversing the list of all tcp connections to send tickle acks to
3305    (so that we can capture the ack coming back and kill the connection
3306     by a RST)
3307    this callback is called for each connection we are currently trying to kill
3308 */
3309 static int tickle_connection_traverse(void *param, void *data)
3310 {
3311         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3312
3313         /* have tried too many times, just give up */
3314         if (con->count >= 5) {
3315                 /* can't delete in traverse: reparent to delete_cons */
3316                 talloc_steal(param, con);
3317                 return 0;
3318         }
3319
3320         /* othervise, try tickling it again */
3321         con->count++;
3322         ctdb_sys_send_tcp(
3323                 (ctdb_sock_addr *)&con->dst_addr,
3324                 (ctdb_sock_addr *)&con->src_addr,
3325                 0, 0, 0);
3326         return 0;
3327 }
3328
3329
3330 /* 
3331    called every second until all sentenced connections have been reset
3332  */
3333 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3334                                               struct timeval t, void *private_data)
3335 {
3336         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3337         void *delete_cons = talloc_new(NULL);
3338
3339         /* loop over all connections sending tickle ACKs */
3340         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3341
3342         /* now we've finished traverse, it's safe to do deletion. */
3343         talloc_free(delete_cons);
3344
3345         /* If there are no more connections to kill we can remove the
3346            entire killtcp structure
3347          */
3348         if ( (killtcp->connections == NULL) || 
3349              (killtcp->connections->root == NULL) ) {
3350                 talloc_free(killtcp);
3351                 return;
3352         }
3353
3354         /* try tickling them again in a seconds time
3355          */
3356         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3357                         ctdb_tickle_sentenced_connections, killtcp);
3358 }
3359
3360 /*
3361   destroy the killtcp structure
3362  */
3363 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3364 {
3365         struct ctdb_vnn *tmpvnn;
3366
3367         /* verify that this vnn is still active */
3368         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3369                 if (tmpvnn == killtcp->vnn) {
3370                         break;
3371                 }
3372         }
3373
3374         if (tmpvnn == NULL) {
3375                 return 0;
3376         }
3377
3378         if (killtcp->vnn->killtcp != killtcp) {
3379                 return 0;
3380         }
3381
3382         killtcp->vnn->killtcp = NULL;
3383
3384         return 0;
3385 }
3386
3387
/* Insert callback for the killtcp tree: always answer with the new
   connection structure (parm), whatever was stored before (data).

   The previous entry, if there was one, is intentionally not freed
   here; per the original author it is talloc_stolen by the same tree
   node and goes away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	(void)data;	/* old entry is deliberately ignored */

	return replacement;
}
3399
3400 /*
3401   add a tcp socket to the list of connections we want to RST
3402  */
3403 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3404                                        ctdb_sock_addr *s,
3405                                        ctdb_sock_addr *d)
3406 {
3407         ctdb_sock_addr src, dst;
3408         struct ctdb_kill_tcp *killtcp;
3409         struct ctdb_killtcp_con *con;
3410         struct ctdb_vnn *vnn;
3411
3412         ctdb_canonicalize_ip(s, &src);
3413         ctdb_canonicalize_ip(d, &dst);
3414
3415         vnn = find_public_ip_vnn(ctdb, &dst);
3416         if (vnn == NULL) {
3417                 vnn = find_public_ip_vnn(ctdb, &src);
3418         }
3419         if (vnn == NULL) {
3420                 /* if it is not a public ip   it could be our 'single ip' */
3421                 if (ctdb->single_ip_vnn) {
3422                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3423                                 vnn = ctdb->single_ip_vnn;
3424                         }
3425                 }
3426         }
3427         if (vnn == NULL) {
3428                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3429                 return -1;
3430         }
3431
3432         killtcp = vnn->killtcp;
3433         
3434         /* If this is the first connection to kill we must allocate
3435            a new structure
3436          */
3437         if (killtcp == NULL) {
3438                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3439                 CTDB_NO_MEMORY(ctdb, killtcp);
3440
3441                 killtcp->vnn         = vnn;
3442                 killtcp->ctdb        = ctdb;
3443                 killtcp->capture_fd  = -1;
3444                 killtcp->connections = trbt_create(killtcp, 0);
3445
3446                 vnn->killtcp         = killtcp;
3447                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3448         }
3449
3450
3451
3452         /* create a structure that describes this connection we want to
3453            RST and store it in killtcp->connections
3454         */
3455         con = talloc(killtcp, struct ctdb_killtcp_con);
3456         CTDB_NO_MEMORY(ctdb, con);
3457         con->src_addr = src;
3458         con->dst_addr = dst;
3459         con->count    = 0;
3460         con->killtcp  = killtcp;
3461
3462
3463         trbt_insertarray32_callback(killtcp->connections,
3464                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3465                         add_killtcp_callback, con);
3466
3467         /* 
3468            If we dont have a socket to listen on yet we must create it
3469          */
3470         if (killtcp->capture_fd == -1) {
3471                 const char *iface = ctdb_vnn_iface_string(vnn);
3472                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3473                 if (killtcp->capture_fd == -1) {
3474                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3475                                           "socket on iface '%s' for killtcp (%s)\n",
3476                                           iface, strerror(errno)));
3477                         goto failed;
3478                 }
3479         }
3480
3481
3482         if (killtcp->fde == NULL) {
3483                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3484                                             EVENT_FD_READ,
3485                                             capture_tcp_handler, killtcp);
3486                 tevent_fd_set_auto_close(killtcp->fde);
3487
3488                 /* We also need to set up some events to tickle all these connections
3489                    until they are all reset
3490                 */
3491                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3492                                 ctdb_tickle_sentenced_connections, killtcp);
3493         }
3494
3495         /* tickle him once now */
3496         ctdb_sys_send_tcp(
3497                 &con->dst_addr,
3498                 &con->src_addr,
3499                 0, 0, 0);
3500
3501         return 0;
3502
3503 failed:
3504         talloc_free(vnn->killtcp);
3505         vnn->killtcp = NULL;
3506         return -1;
3507 }
3508
3509 /*
3510   kill a TCP connection.
3511  */
3512 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3513 {
3514         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3515
3516         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3517 }
3518
3519 /*
3520   called by a daemon to inform us of the entire list of TCP tickles for
3521   a particular public address.
3522   this control should only be sent by the node that is currently serving
3523   that public address.
3524  */
3525 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3526 {
3527         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3528         struct ctdb_tcp_array *tcparray;
3529         struct ctdb_vnn *vnn;
3530
3531         /* We must at least have tickles.num or else we cant verify the size
3532            of the received data blob
3533          */
3534         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3535                                         tickles.connections)) {
3536                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3537                 return -1;
3538         }
3539
3540         /* verify that the size of data matches what we expect */
3541         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3542                                 tickles.connections)
3543                          + sizeof(struct ctdb_tcp_connection)
3544                                  * list->tickles.num) {
3545                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3546                 return -1;
3547         }       
3548
3549         vnn = find_public_ip_vnn(ctdb, &list->addr);
3550         if (vnn == NULL) {
3551                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3552                         ctdb_addr_to_str(&list->addr)));
3553
3554                 return 1;
3555         }
3556
3557         /* remove any old ticklelist we might have */
3558         talloc_free(vnn->tcp_array);
3559         vnn->tcp_array = NULL;
3560
3561         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3562         CTDB_NO_MEMORY(ctdb, tcparray);
3563
3564         tcparray->num = list->tickles.num;
3565
3566         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3567         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3568
3569         memcpy(tcparray->connections, &list->tickles.connections[0], 
3570                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3571
3572         /* We now have a new fresh tickle list array for this vnn */
3573         vnn->tcp_array = talloc_steal(vnn, tcparray);
3574         
3575         return 0;
3576 }
3577
3578 /*
3579   called to return the full list of tickles for the puclic address associated 
3580   with the provided vnn
3581  */
3582 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3583 {
3584         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3585         struct ctdb_control_tcp_tickle_list *list;
3586         struct ctdb_tcp_array *tcparray;
3587         int num;
3588         struct ctdb_vnn *vnn;
3589
3590         vnn = find_public_ip_vnn(ctdb, addr);
3591         if (vnn == NULL) {
3592                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3593                         ctdb_addr_to_str(addr)));
3594
3595                 return 1;
3596         }
3597
3598         tcparray = vnn->tcp_array;
3599         if (tcparray) {
3600                 num = tcparray->num;
3601         } else {
3602                 num = 0;
3603         }
3604
3605         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3606                                 tickles.connections)
3607                         + sizeof(struct ctdb_tcp_connection) * num;
3608
3609         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3610         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3611         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3612
3613         list->addr = *addr;
3614         list->tickles.num = num;
3615         if (num) {
3616                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3617                         sizeof(struct ctdb_tcp_connection) * num);
3618         }
3619
3620         return 0;
3621 }
3622
3623
3624 /*
3625   set the list of all tcp tickles for a public address
3626  */
3627 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3628                               struct timeval timeout, uint32_t destnode, 
3629                               ctdb_sock_addr *addr,
3630                               struct ctdb_tcp_array *tcparray)
3631 {
3632         int ret, num;
3633         TDB_DATA data;
3634         struct ctdb_control_tcp_tickle_list *list;
3635
3636         if (tcparray) {
3637                 num = tcparray->num;
3638         } else {
3639                 num = 0;
3640         }
3641
3642         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3643                                 tickles.connections) +
3644                         sizeof(struct ctdb_tcp_connection) * num;
3645         data.dptr = talloc_size(ctdb, data.dsize);
3646         CTDB_NO_MEMORY(ctdb, data.dptr);
3647
3648         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3649         list->addr = *addr;
3650         list->tickles.num = num;
3651         if (tcparray) {
3652                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3653         }
3654
3655         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3656                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3657                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3658         if (ret != 0) {
3659                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3660                 return -1;
3661         }
3662
3663         talloc_free(data.dptr);
3664
3665         return ret;
3666 }
3667
3668
3669 /*
3670   perform tickle updates if required
3671  */
3672 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3673                                 struct timed_event *te, 
3674                                 struct timeval t, void *private_data)
3675 {
3676         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3677         int ret;
3678         struct ctdb_vnn *vnn;
3679
3680         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3681                 /* we only send out updates for public addresses that 
3682                    we have taken over
3683                  */
3684                 if (ctdb->pnn != vnn->pnn) {
3685                         continue;
3686                 }
3687                 /* We only send out the updates if we need to */
3688                 if (!vnn->tcp_update_needed) {
3689                         continue;
3690                 }
3691                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3692                                 TAKEOVER_TIMEOUT(),
3693                                 CTDB_BROADCAST_CONNECTED,
3694                                 &vnn->public_address,
3695                                 vnn->tcp_array);
3696                 if (ret != 0) {
3697                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3698                                 ctdb_addr_to_str(&vnn->public_address)));
3699                 }
3700         }
3701
3702         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3703                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3704                              ctdb_update_tcp_tickles, ctdb);
3705 }               
3706         
3707
3708 /*
3709   start periodic update of tcp tickles
3710  */
3711 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3712 {
3713         ctdb->tickle_update_context = talloc_new(ctdb);
3714
3715         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3716                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3717                              ctdb_update_tcp_tickles, ctdb);
3718 }
3719
3720
3721
3722
3723 struct control_gratious_arp {
3724         struct ctdb_context *ctdb;
3725         ctdb_sock_addr addr;
3726         const char *iface;
3727         int count;
3728 };
3729
3730 /*
3731   send a control_gratuitous arp
3732  */
3733 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3734                                   struct timeval t, void *private_data)
3735 {
3736         int ret;
3737         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3738                                                         struct control_gratious_arp);
3739
3740         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3741         if (ret != 0) {
3742                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3743                                  arp->iface, strerror(errno)));
3744         }
3745
3746
3747         arp->count++;
3748         if (arp->count == CTDB_ARP_REPEAT) {
3749                 talloc_free(arp);
3750                 return;
3751         }
3752
3753         event_add_timed(arp->ctdb->ev, arp, 
3754                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3755                         send_gratious_arp, arp);
3756 }
3757
3758
3759 /*
3760   send a gratious arp 
3761  */
3762 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3763 {
3764         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3765         struct control_gratious_arp *arp;
3766
3767         /* verify the size of indata */
3768         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3769                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3770                                  (unsigned)indata.dsize, 
3771                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3772                 return -1;
3773         }
3774         if (indata.dsize != 
3775                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3776                 + gratious_arp->len ) ){
3777
3778                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3779                         "but should be %u bytes\n", 
3780                          (unsigned)indata.dsize, 
3781                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3782                 return -1;
3783         }
3784
3785
3786         arp = talloc(ctdb, struct control_gratious_arp);
3787         CTDB_NO_MEMORY(ctdb, arp);
3788
3789         arp->ctdb  = ctdb;
3790         arp->addr   = gratious_arp->addr;
3791         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3792         CTDB_NO_MEMORY(ctdb, arp->iface);
3793         arp->count = 0;
3794         
3795         event_add_timed(arp->ctdb->ev, arp, 
3796                         timeval_zero(), send_gratious_arp, arp);
3797
3798         return 0;
3799 }
3800
3801 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3802 {
3803         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3804         int ret;
3805
3806         /* verify the size of indata */
3807         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3808                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3809                 return -1;
3810         }
3811         if (indata.dsize != 
3812                 ( offsetof(struct ctdb_control_ip_iface, iface)
3813                 + pub->len ) ){
3814
3815                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3816                         "but should be %u bytes\n", 
3817                          (unsigned)indata.dsize, 
3818                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3819                 return -1;
3820         }
3821
3822         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3823
3824         if (ret != 0) {
3825                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3826                 return -1;
3827         }
3828
3829         return 0;
3830 }
3831
/*
  Completion callback for the releaseip event started by
  del_public_address: frees the talloc context that was passed in as
  private_data.  The event status is not inspected.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *mem_ctx = private_data;

	talloc_free(mem_ctx);
}
3840
3841 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3842 {
3843         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3844         struct ctdb_vnn *vnn;
3845         int ret;
3846
3847         /* verify the size of indata */
3848         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3849                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3850                 return -1;
3851         }
3852         if (indata.dsize != 
3853                 ( offsetof(struct ctdb_control_ip_iface, iface)
3854                 + pub->len ) ){
3855
3856                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3857                         "but should be %u bytes\n", 
3858                          (unsigned)indata.dsize, 
3859                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3860                 return -1;
3861         }
3862
3863         /* walk over all public addresses until we find a match */
3864         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3865                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3866                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3867
3868                         DLIST_REMOVE(ctdb->vnn, vnn);
3869                         talloc_steal(mem_ctx, vnn);
3870                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3871                         if (vnn->pnn != ctdb->pnn) {
3872                                 if (vnn->iface != NULL) {
3873                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3874                                 }
3875                                 talloc_free(mem_ctx);
3876                                 return 0;
3877                         }
3878                         vnn->pnn = -1;
3879
3880                         ret = ctdb_event_script_callback(ctdb, 
3881                                          mem_ctx, delete_ip_callback, mem_ctx,
3882                                          false,
3883                                          CTDB_EVENT_RELEASE_IP,
3884                                          "%s %s %u",
3885                                          ctdb_vnn_iface_string(vnn),
3886                                          ctdb_addr_to_str(&vnn->public_address),
3887                                          vnn->public_netmask_bits);
3888                         if (vnn->iface != NULL) {
3889                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3890                         }
3891                         if (ret != 0) {
3892                                 return -1;
3893                         }
3894                         return 0;
3895                 }
3896         }
3897
3898         return -1;
3899 }
3900
3901
/* state carried from the ipreallocated control to its event callback */
struct ipreallocated_callback_state {
	/* the control request that is answered once the event finishes */
	struct ctdb_req_control *c;
};
3905
3906 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3907                                         int status, void *p)
3908 {
3909         struct ipreallocated_callback_state *state =
3910                 talloc_get_type(p, struct ipreallocated_callback_state);
3911
3912         if (status != 0) {
3913                 DEBUG(DEBUG_ERR,
3914                       (" \"ipreallocated\" event script failed (status %d)\n",
3915                        status));
3916                 if (status == -ETIME) {
3917                         ctdb_ban_self(ctdb);
3918                 }
3919         }
3920
3921         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3922         talloc_free(state);
3923 }
3924
3925 /* A control to run the ipreallocated event */
3926 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3927                                    struct ctdb_req_control *c,
3928                                    bool *async_reply)
3929 {
3930         int ret;
3931         struct ipreallocated_callback_state *state;
3932
3933         state = talloc(ctdb, struct ipreallocated_callback_state);
3934         CTDB_NO_MEMORY(ctdb, state);
3935
3936         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3937
3938         ret = ctdb_event_script_callback(ctdb, state,
3939                                          ctdb_ipreallocated_callback, state,
3940                                          false, CTDB_EVENT_IPREALLOCATED,
3941                                          "%s", "");
3942
3943         if (ret != 0) {
3944                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3945                 talloc_free(state);
3946                 return -1;
3947         }
3948
3949         /* tell the control that we will be reply asynchronously */
3950         state->c    = talloc_steal(state, c);
3951         *async_reply = true;
3952
3953         return 0;
3954 }
3955
3956
3957 /* This function is called from the recovery daemon to verify that a remote
3958    node has the expected ip allocation.
3959    This is verified against ctdb->ip_tree
3960 */
3961 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3962 {
3963         struct ctdb_public_ip_list *tmp_ip; 
3964         int i;
3965
3966         if (ctdb->ip_tree == NULL) {
3967                 /* dont know the expected allocation yet, assume remote node
3968                    is correct. */
3969                 return 0;
3970         }
3971
3972         if (ips == NULL) {
3973                 return 0;
3974         }
3975
3976         for (i=0; i<ips->num; i++) {
3977                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3978                 if (tmp_ip == NULL) {
3979                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3980                         return -1;
3981                 }
3982
3983                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3984                         continue;
3985                 }
3986
3987                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3988                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3989                         return -1;
3990                 }
3991         }
3992
3993         return 0;
3994 }
3995
3996 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3997 {
3998         struct ctdb_public_ip_list *tmp_ip; 
3999
4000         if (ctdb->ip_tree == NULL) {
4001                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4002                 return -1;
4003         }
4004
4005         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4006         if (tmp_ip == NULL) {
4007                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4008                 return -1;
4009         }
4010
4011         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4012         tmp_ip->pnn = ip->pnn;
4013
4014         return 0;
4015 }
4016
4017
/* Bookkeeping for one in-flight "reload public ips" request */
struct ctdb_reloadips_handle {
        /* context the request belongs to */
        struct ctdb_context *ctdb;
        /* pending control; answered from the handle's destructor */
        struct ctdb_req_control *c;
        /* status sent back in the reply */
        int status;
        /* pipe to the child: the child writes its result to fd[1],
           the parent reads it from fd[0] */
        int fd[2];
        /* pid of the forked child doing the reload */
        pid_t child;
        /* read event watching fd[0] */
        struct fd_event *fde;
};
4026
4027 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4028 {
4029         if (h == h->ctdb->reload_ips) {
4030                 h->ctdb->reload_ips = NULL;
4031         }
4032         if (h->c != NULL) {
4033                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4034                 h->c = NULL;
4035         }
4036         ctdb_kill(h->ctdb, h->child, SIGKILL);
4037         return 0;
4038 }
4039
4040 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4041                                 struct timed_event *te,
4042                                 struct timeval t, void *private_data)
4043 {
4044         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4045
4046         talloc_free(h);
4047 }       
4048
4049 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4050                              uint16_t flags, void *private_data)
4051 {
4052         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4053
4054         char res;
4055         int ret;
4056
4057         ret = read(h->fd[0], &res, 1);
4058         if (ret < 1 || res != 0) {
4059                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4060                 res = 1;
4061         }
4062         h->status = res;
4063
4064         talloc_free(h);
4065 }
4066
4067 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4068 {
4069         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4070         struct ctdb_all_public_ips *ips;
4071         struct ctdb_vnn *vnn;
4072         int i, ret;
4073
4074         /* read the ip allocation from the local node */
4075         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4076         if (ret != 0) {
4077                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4078                 talloc_free(mem_ctx);
4079                 return -1;
4080         }
4081
4082         /* re-read the public ips file */
4083         ctdb->vnn = NULL;
4084         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4085                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4086                 talloc_free(mem_ctx);
4087                 return -1;
4088         }               
4089
4090
4091         /* check the previous list of ips and scan for ips that have been
4092            dropped.
4093          */
4094         for (i = 0; i < ips->num; i++) {
4095                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4096                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4097                                 break;
4098                         }
4099                 }
4100
4101                 /* we need to delete this ip, no longer available on this node */
4102                 if (vnn == NULL) {
4103                         struct ctdb_control_ip_iface pub;
4104
4105                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4106                         pub.addr  = ips->ips[i].addr;
4107                         pub.mask  = 0;
4108                         pub.len   = 0;
4109
4110                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4111                         if (ret != 0) {
4112                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4113                                 return -1;
4114                         }
4115                 }
4116         }
4117
4118
4119         /* loop over all new ones and check the ones we need to add */
4120         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4121                 for (i = 0; i < ips->num; i++) {
4122                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4123                                 break;
4124                         }
4125                 }
4126                 if (i == ips->num) {
4127                         struct ctdb_control_ip_iface pub;
4128                         const char *ifaces = NULL;
4129                         int iface = 0;
4130
4131                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4132
4133                         pub.addr  = vnn->public_address;
4134                         pub.mask  = vnn->public_netmask_bits;
4135
4136
4137                         ifaces = vnn->ifaces[0];
4138                         iface = 1;
4139                         while (vnn->ifaces[iface] != NULL) {
4140                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4141                                 iface++;
4142                         }
4143                         pub.len   = strlen(ifaces)+1;
4144                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4145
4146                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4147                         if (ret != 0) {
4148                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4149                                 return -1;
4150                         }
4151                 }
4152         }
4153
4154         return 0;
4155 }
4156
4157 /* This control is sent to force the node to re-read the public addresses file
4158    and drop any addresses we should nnot longer host, and add new addresses
4159    that we are now able to host
4160 */
4161 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4162 {
4163         struct ctdb_reloadips_handle *h;
4164         pid_t parent = getpid();
4165
4166         if (ctdb->reload_ips != NULL) {
4167                 talloc_free(ctdb->reload_ips);
4168                 ctdb->reload_ips = NULL;
4169         }
4170
4171         h = talloc(ctdb, struct ctdb_reloadips_handle);
4172         CTDB_NO_MEMORY(ctdb, h);
4173         h->ctdb     = ctdb;
4174         h->c        = NULL;
4175         h->status   = -1;
4176         
4177         if (pipe(h->fd) == -1) {
4178                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4179                 talloc_free(h);
4180                 return -1;
4181         }
4182
4183         h->child = ctdb_fork(ctdb);
4184         if (h->child == (pid_t)-1) {
4185                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4186                 close(h->fd[0]);
4187                 close(h->fd[1]);
4188                 talloc_free(h);
4189                 return -1;
4190         }
4191
4192         /* child process */
4193         if (h->child == 0) {
4194                 signed char res = 0;
4195
4196                 close(h->fd[0]);
4197                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4198
4199                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4200                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4201                         res = -1;
4202                 } else {
4203                         res = ctdb_reloadips_child(ctdb);
4204                         if (res != 0) {
4205                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4206                         }
4207                 }
4208
4209                 write(h->fd[1], &res, 1);
4210                 /* make sure we die when our parent dies */
4211                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4212                         sleep(5);
4213                 }
4214                 _exit(0);
4215         }
4216
4217         h->c             = talloc_steal(h, c);
4218
4219         close(h->fd[1]);
4220         set_close_on_exec(h->fd[0]);
4221
4222         talloc_set_destructor(h, ctdb_reloadips_destructor);
4223
4224
4225         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4226                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4227                         (void *)h);
4228         tevent_fd_set_auto_close(h->fde);
4229
4230         event_add_timed(ctdb->ev, h,
4231                         timeval_current_ofs(120, 0),
4232                         ctdb_reloadips_timeout_event, h);
4233
4234         /* we reply later */
4235         *async_reply = true;
4236         return 0;
4237 }