recoverd: Remove an orphaned comment
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout value built from the takeover_timeout tunable.  The macro
 * expands to an expression that uses a variable named "ctdb", so one
 * must be in scope wherever it is used. */
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
32
/* First argument handed to timeval_current_ofs() when
 * ctdb_control_send_arp() re-arms itself (presumably whole seconds -
 * TODO confirm against timeval_current_ofs()). */
33 #define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp() stops re-arming and frees its state once it
 * has run this many times. */
34 #define CTDB_ARP_REPEAT   3
35
36 /* Flags used in IP allocation algorithms.  Nothing in this part of
 * the file reads them, so the per-field notes are assumptions drawn
 * from the names only. */
37 struct ctdb_ipflags {
38         bool noiptakeover; /* presumably: node may not take over IPs - TODO confirm */
39         bool noiphost;     /* presumably: node may not host IPs - TODO confirm */
40 };
41
/* One network interface known to the daemon; entries are kept in the
 * doubly linked list ctdb->ifaces. */
42 struct ctdb_iface {
43         struct ctdb_iface *prev, *next; /* list linkage (DLIST_ADD / DLIST_REMOVE) */
44         const char *name;               /* copy made with talloc_strdup(), parented to this entry */
45         bool link_up;                   /* ctdb_vnn_best_iface() only picks interfaces with this set */
46         uint32_t references;            /* incremented when a vnn is assigned to the interface, decremented on unassign */
47 };
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
/* State carried between runs of ctdb_control_send_arp(); allocated by
 * ctdb_announce_vnn_iface() underneath vnn->takeover_ctx. */
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;       /* used to reach the event context (ctdb->ev) */
285         uint32_t count;                  /* runs completed so far; state is freed at CTDB_ARP_REPEAT */
286         ctdb_sock_addr addr;             /* copy of vnn->public_address passed to ctdb_sys_send_arp() */
287         struct ctdb_tcp_array *tcparray; /* connections to send tickle acks for; NULL if the vnn had none */
288         struct ctdb_vnn *vnn;            /* vnn being announced; supplies the interface name */
289 };
290
291
292 /*
293   lists of tcp endpoints: one tcp connection per element of a
      doubly linked list (not used in the part of the file shown here)
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;      /* list linkage */
297         struct ctdb_tcp_connection connection;  /* the endpoint pair for this entry */
298 };
299
300 /*
301   list of clients to kill on IP release; release_kill_clients()
      walks ctdb->client_ip_list and matches entries on addr
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next; /* list linkage */
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;                /* address compared against the IP being released */
307         uint32_t client_id;                 /* id resolved to a struct ctdb_client via ctdb_reqid_find() */
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
/* Private data for release_ip_callback(), which fetches it with
 * talloc_get_type().  Field notes are assumptions where the visible
 * code does not show the use. */
400 struct takeover_callback_state {
401         struct ctdb_req_control *c; /* presumably the control request to reply to - TODO confirm */
402         ctdb_sock_addr *addr;       /* presumably the address being released - TODO confirm */
403         struct ctdb_vnn *vnn;
404 };
405
/* Private data handed from ctdb_do_takeip() to
 * ctdb_do_takeip_callback(). */
406 struct ctdb_do_takeip_state {
407         struct ctdb_req_control *c; /* control request answered from the callback */
408         struct ctdb_vnn *vnn;       /* vnn being taken over; update_in_flight is cleared on it by the destructor */
409 };
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          false,
514                                          CTDB_EVENT_TAKE_IP,
515                                          "%s %s %u",
516                                          ctdb_vnn_iface_string(vnn),
517                                          ctdb_addr_to_str(&vnn->public_address),
518                                          vnn->public_netmask_bits);
519
520         if (ret != 0) {
521                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
522                         ctdb_addr_to_str(&vnn->public_address),
523                         ctdb_vnn_iface_string(vnn)));
524                 talloc_free(state);
525                 return -1;
526         }
527
528         return 0;
529 }
530
/* Private data handed from ctdb_do_updateip() to
 * ctdb_do_updateip_callback(). */
531 struct ctdb_do_updateip_state {
532         struct ctdb_req_control *c; /* control request answered from the callback */
533         struct ctdb_iface *old;     /* interface the address is moving away from; restored if the event fails */
534         struct ctdb_vnn *vnn;       /* vnn being moved; update_in_flight is cleared on it by the destructor */
535 };
536
537 /*
538   called when updateip event finishes
539  */
540 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541                                       void *private_data)
542 {
543         struct ctdb_do_updateip_state *state =
544                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545         int32_t ret;
546
547         if (status != 0) {
548                 if (status == -ETIME) {
549                         ctdb_ban_self(ctdb);
550                 }
551                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552                         ctdb_addr_to_str(&state->vnn->public_address),
553                         state->old->name,
554                         ctdb_vnn_iface_string(state->vnn)));
555
556                 /*
557                  * All we can do is reset the old interface
558                  * and let the next run fix it
559                  */
560                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561                 state->vnn->iface = state->old;
562                 state->vnn->iface->references++;
563
564                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565                 talloc_free(state);
566                 return;
567         }
568
569         if (ctdb->do_checkpublicip) {
570
571         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572         if (ret != 0) {
573                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574                 talloc_free(state);
575                 return;
576         }
577
578         }
579
580         /* the control succeeded */
581         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582         talloc_free(state);
583         return;
584 }
585
586 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
587 {
588         state->vnn->update_in_flight = false;
589         return 0;
590 }
591
592 /*
593   update (move) an ip address
594  */
595 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596                                 struct ctdb_req_control *c,
597                                 struct ctdb_vnn *vnn)
598 {
599         int ret;
600         struct ctdb_do_updateip_state *state;
601         struct ctdb_iface *old = vnn->iface;
602         const char *new_name;
603
604         if (vnn->update_in_flight) {
605                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606                                     "update for this IP already in flight\n",
607                                     ctdb_addr_to_str(&vnn->public_address),
608                                     vnn->public_netmask_bits));
609                 return -1;
610         }
611
612         ctdb_vnn_unassign_iface(ctdb, vnn);
613         ret = ctdb_vnn_assign_iface(ctdb, vnn);
614         if (ret != 0) {
615                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616                                  "assin a usable interface (old iface '%s')\n",
617                                  ctdb_addr_to_str(&vnn->public_address),
618                                  vnn->public_netmask_bits,
619                                  old->name));
620                 return -1;
621         }
622
623         new_name = ctdb_vnn_iface_string(vnn);
624         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625                 /* A benign update from one interface onto itself.
626                  * no need to run the eventscripts in this case, just return
627                  * success.
628                  */
629                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630                 return 0;
631         }
632
633         state = talloc(vnn, struct ctdb_do_updateip_state);
634         CTDB_NO_MEMORY(ctdb, state);
635
636         state->c = talloc_steal(ctdb, c);
637         state->old = old;
638         state->vnn = vnn;
639
640         vnn->update_in_flight = true;
641         talloc_set_destructor(state, ctdb_updateip_destructor);
642
643         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644                             "interface %s to %s\n",
645                             ctdb_addr_to_str(&vnn->public_address),
646                             vnn->public_netmask_bits,
647                             old->name,
648                             new_name));
649
650         ret = ctdb_event_script_callback(ctdb,
651                                          state,
652                                          ctdb_do_updateip_callback,
653                                          state,
654                                          false,
655                                          CTDB_EVENT_UPDATE_IP,
656                                          "%s %s %s %u",
657                                          state->old->name,
658                                          new_name,
659                                          ctdb_addr_to_str(&vnn->public_address),
660                                          vnn->public_netmask_bits);
661         if (ret != 0) {
662                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
663                                  ctdb_addr_to_str(&vnn->public_address),
664                                  old->name, new_name));
665                 talloc_free(state);
666                 return -1;
667         }
668
669         return 0;
670 }
671
672 /*
673   Find the vnn of the node that has a public ip address
674   returns -1 if the address is not known as a public address
675  */
676 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
677 {
678         struct ctdb_vnn *vnn;
679
680         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681                 if (ctdb_same_ip(&vnn->public_address, addr)) {
682                         return vnn;
683                 }
684         }
685
686         return NULL;
687 }
688
689 /*
690   take over an ip address
691  */
692 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693                                  struct ctdb_req_control *c,
694                                  TDB_DATA indata,
695                                  bool *async_reply)
696 {
697         int ret;
698         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699         struct ctdb_vnn *vnn;
700         bool have_ip = false;
701         bool do_updateip = false;
702         bool do_takeip = false;
703         struct ctdb_iface *best_iface = NULL;
704
705         if (pip->pnn != ctdb->pnn) {
706                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707                                  "with pnn %d, but we're node %d\n",
708                                  ctdb_addr_to_str(&pip->addr),
709                                  pip->pnn, ctdb->pnn));
710                 return -1;
711         }
712
713         /* update out vnn list */
714         vnn = find_public_ip_vnn(ctdb, &pip->addr);
715         if (vnn == NULL) {
716                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717                         ctdb_addr_to_str(&pip->addr)));
718                 return 0;
719         }
720
721         if (ctdb->do_checkpublicip) {
722                 have_ip = ctdb_sys_have_ip(&pip->addr);
723         }
724         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725         if (best_iface == NULL) {
726                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727                                  "a usable interface (old %s, have_ip %d)\n",
728                                  ctdb_addr_to_str(&vnn->public_address),
729                                  vnn->public_netmask_bits,
730                                  ctdb_vnn_iface_string(vnn),
731                                  have_ip));
732                 return -1;
733         }
734
735         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737                 have_ip = false;
738         }
739
740
741         if (vnn->iface == NULL && have_ip) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744                                  ctdb_addr_to_str(&vnn->public_address)));
745                 return 0;
746         }
747
748         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750                                   "and we have it on iface[%s], but it was assigned to node %d"
751                                   "and we are node %d, banning ourself\n",
752                                  ctdb_addr_to_str(&vnn->public_address),
753                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754                 ctdb_ban_self(ctdb);
755                 return -1;
756         }
757
758         if (vnn->pnn == -1 && have_ip) {
759                 vnn->pnn = ctdb->pnn;
760                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761                                   "and we already have it on iface[%s], update local daemon\n",
762                                  ctdb_addr_to_str(&vnn->public_address),
763                                   ctdb_vnn_iface_string(vnn)));
764                 return 0;
765         }
766
767         if (vnn->iface) {
768                 if (vnn->iface != best_iface) {
769                         if (!vnn->iface->link_up) {
770                                 do_updateip = true;
771                         } else if (vnn->iface->references > (best_iface->references + 1)) {
772                                 /* only move when the rebalance gains something */
773                                         do_updateip = true;
774                         }
775                 }
776         }
777
778         if (!have_ip) {
779                 if (do_updateip) {
780                         ctdb_vnn_unassign_iface(ctdb, vnn);
781                         do_updateip = false;
782                 }
783                 do_takeip = true;
784         }
785
786         if (do_takeip) {
787                 ret = ctdb_do_takeip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else if (do_updateip) {
792                 ret = ctdb_do_updateip(ctdb, c, vnn);
793                 if (ret != 0) {
794                         return -1;
795                 }
796         } else {
797                 /*
798                  * The interface is up and the kernel known the ip
799                  * => do nothing
800                  */
801                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802                         ctdb_addr_to_str(&pip->addr),
803                         vnn->public_netmask_bits,
804                         ctdb_vnn_iface_string(vnn)));
805                 return 0;
806         }
807
808         /* tell ctdb_control.c that we will be replying asynchronously */
809         *async_reply = true;
810
811         return 0;
812 }
813
814 /*
815   takeover an ip address old v4 style
816  */
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
818                                 struct ctdb_req_control *c,
819                                 TDB_DATA indata, 
820                                 bool *async_reply)
821 {
822         TDB_DATA data;
823         
824         data.dsize = sizeof(struct ctdb_public_ip);
825         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826         CTDB_NO_MEMORY(ctdb, data.dptr);
827         
828         memcpy(data.dptr, indata.dptr, indata.dsize);
829         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
830 }
831
832 /*
833   kill any clients that are registered with a IP that is being released
834  */
835 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
836 {
837         struct ctdb_client_ip *ip;
838
839         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
840                 ctdb_addr_to_str(addr)));
841
842         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
843                 ctdb_sock_addr tmp_addr;
844
845                 tmp_addr = ip->addr;
846                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
847                         ip->client_id,
848                         ctdb_addr_to_str(&ip->addr)));
849
850                 if (ctdb_same_ip(&tmp_addr, addr)) {
851                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
852                                                                      ip->client_id, 
853                                                                      struct ctdb_client);
854                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
855                                 ip->client_id,
856                                 ctdb_addr_to_str(&ip->addr),
857                                 client->pid));
858
859                         if (client->pid != 0) {
860                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
861                                         (unsigned)client->pid,
862                                         ctdb_addr_to_str(addr),
863                                         ip->client_id));
864                                 kill(client->pid, SIGKILL);
865                         }
866                 }
867         }
868 }
869
870 /*
871   called when releaseip event finishes
872  */
873 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
874                                 void *private_data)
875 {
876         struct takeover_callback_state *state = 
877                 talloc_get_type(private_data, struct takeover_callback_state);
878         TDB_DATA data;
879
880         if (status == -ETIME) {
881                 ctdb_ban_self(ctdb);
882         }
883
884         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
885                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
886                                   ctdb_addr_to_str(state->addr)));
887                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
888                 talloc_free(state);
889                 return;
890         }
891
892         /* send a message to all clients of this node telling them
893            that the cluster has been reconfigured and they should
894            release any sockets on this IP */
895         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
896         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
897         data.dsize = strlen((char *)data.dptr)+1;
898
899         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
900
901         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
902
903         /* kill clients that have registered with this IP */
904         release_kill_clients(ctdb, state->addr);
905
906         ctdb_vnn_unassign_iface(ctdb, state->vnn);
907
908         /* the control succeeded */
909         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
910         talloc_free(state);
911 }
912
913 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
914 {
915         state->vnn->update_in_flight = false;
916         return 0;
917 }
918
919 /*
920   release an ip address
921  */
922 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
923                                 struct ctdb_req_control *c,
924                                 TDB_DATA indata, 
925                                 bool *async_reply)
926 {
927         int ret;
928         struct takeover_callback_state *state;
929         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
930         struct ctdb_vnn *vnn;
931         char *iface;
932
933         /* update our vnn list */
934         vnn = find_public_ip_vnn(ctdb, &pip->addr);
935         if (vnn == NULL) {
936                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
937                         ctdb_addr_to_str(&pip->addr)));
938                 return 0;
939         }
940         vnn->pnn = pip->pnn;
941
942         /* stop any previous arps */
943         talloc_free(vnn->takeover_ctx);
944         vnn->takeover_ctx = NULL;
945
946         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
947          * lazy multicast to drop an IP from any node that isn't the
948          * intended new node.  The following causes makes ctdbd ignore
949          * a release for any address it doesn't host.
950          */
951         if (ctdb->do_checkpublicip) {
952                 if (!ctdb_sys_have_ip(&pip->addr)) {
953                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
954                                 ctdb_addr_to_str(&pip->addr),
955                                 vnn->public_netmask_bits,
956                                 ctdb_vnn_iface_string(vnn)));
957                         ctdb_vnn_unassign_iface(ctdb, vnn);
958                         return 0;
959                 }
960         } else {
961                 if (vnn->iface == NULL) {
962                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
963                                            ctdb_addr_to_str(&pip->addr),
964                                            vnn->public_netmask_bits));
965                         return 0;
966                 }
967         }
968
969         /* There is a potential race between take_ip and us because we
970          * update the VNN via a callback that run when the
971          * eventscripts have been run.  Avoid the race by allowing one
972          * update to be in flight at a time.
973          */
974         if (vnn->update_in_flight) {
975                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
976                                     "update for this IP already in flight\n",
977                                     ctdb_addr_to_str(&vnn->public_address),
978                                     vnn->public_netmask_bits));
979                 return -1;
980         }
981
982         if (ctdb->do_checkpublicip) {
983                 iface = ctdb_sys_find_ifname(&pip->addr);
984                 if (iface == NULL) {
985                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
986                         return 0;
987                 }
988                 if (vnn->iface == NULL) {
989                         DEBUG(DEBUG_WARNING,
990                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
991                                ctdb_addr_to_str(&pip->addr),
992                                iface));
993                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
994                         DEBUG(DEBUG_WARNING,
995                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
996                                ctdb_addr_to_str(&pip->addr),
997                                iface,
998                                ctdb_vnn_iface_string(vnn)));
999                         /* Should we fix vnn->iface?  If we do, what
1000                          * happens to reference counts?
1001                          */
1002                 }
1003         } else {
1004                 iface = strdup(ctdb_vnn_iface_string(vnn));
1005         }
1006
1007         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1008                 ctdb_addr_to_str(&pip->addr),
1009                 vnn->public_netmask_bits,
1010                 iface,
1011                 pip->pnn));
1012
1013         state = talloc(ctdb, struct takeover_callback_state);
1014         CTDB_NO_MEMORY(ctdb, state);
1015
1016         state->c = talloc_steal(state, c);
1017         state->addr = talloc(state, ctdb_sock_addr);       
1018         CTDB_NO_MEMORY(ctdb, state->addr);
1019         *state->addr = pip->addr;
1020         state->vnn   = vnn;
1021
1022         vnn->update_in_flight = true;
1023         talloc_set_destructor(state, ctdb_releaseip_destructor);
1024
1025         ret = ctdb_event_script_callback(ctdb, 
1026                                          state, release_ip_callback, state,
1027                                          false,
1028                                          CTDB_EVENT_RELEASE_IP,
1029                                          "%s %s %u",
1030                                          iface,
1031                                          ctdb_addr_to_str(&pip->addr),
1032                                          vnn->public_netmask_bits);
1033         free(iface);
1034         if (ret != 0) {
1035                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1036                         ctdb_addr_to_str(&pip->addr),
1037                         ctdb_vnn_iface_string(vnn)));
1038                 talloc_free(state);
1039                 return -1;
1040         }
1041
1042         /* tell the control that we will be reply asynchronously */
1043         *async_reply = true;
1044         return 0;
1045 }
1046
1047 /*
1048   release an ip address old v4 style
1049  */
1050 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1051                                 struct ctdb_req_control *c,
1052                                 TDB_DATA indata, 
1053                                 bool *async_reply)
1054 {
1055         TDB_DATA data;
1056         
1057         data.dsize = sizeof(struct ctdb_public_ip);
1058         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1059         CTDB_NO_MEMORY(ctdb, data.dptr);
1060         
1061         memcpy(data.dptr, indata.dptr, indata.dsize);
1062         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1063 }
1064
1065
1066 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1067                                    ctdb_sock_addr *addr,
1068                                    unsigned mask, const char *ifaces,
1069                                    bool check_address)
1070 {
1071         struct ctdb_vnn      *vnn;
1072         uint32_t num = 0;
1073         char *tmp;
1074         const char *iface;
1075         int i;
1076         int ret;
1077
1078         tmp = strdup(ifaces);
1079         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1080                 if (!ctdb_sys_check_iface_exists(iface)) {
1081                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1082                         free(tmp);
1083                         return -1;
1084                 }
1085         }
1086         free(tmp);
1087
1088         /* Verify that we dont have an entry for this ip yet */
1089         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1090                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1091                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1092                                 ctdb_addr_to_str(addr)));
1093                         return -1;
1094                 }               
1095         }
1096
1097         /* create a new vnn structure for this ip address */
1098         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1099         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1100         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1101         tmp = talloc_strdup(vnn, ifaces);
1102         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1103         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1104                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1105                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1106                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1107                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1108                 num++;
1109         }
1110         talloc_free(tmp);
1111         vnn->ifaces[num] = NULL;
1112         vnn->public_address      = *addr;
1113         vnn->public_netmask_bits = mask;
1114         vnn->pnn                 = -1;
1115         if (check_address) {
1116                 if (ctdb_sys_have_ip(addr)) {
1117                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1118                         vnn->pnn = ctdb->pnn;
1119                 }
1120         }
1121
1122         for (i=0; vnn->ifaces[i]; i++) {
1123                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1124                 if (ret != 0) {
1125                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1126                                            "for public_address[%s]\n",
1127                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1128                         talloc_free(vnn);
1129                         return -1;
1130                 }
1131         }
1132
1133         DLIST_ADD(ctdb->vnn, vnn);
1134
1135         return 0;
1136 }
1137
1138 /*
1139   setup the event script directory
1140 */
1141 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1142 {
1143         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1144         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1145         return 0;
1146 }
1147
1148 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1149                                   struct timeval t, void *private_data)
1150 {
1151         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1152                                                         struct ctdb_context);
1153         struct ctdb_vnn *vnn;
1154
1155         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1156                 int i;
1157
1158                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1159                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1160                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1161                                         vnn->ifaces[i],
1162                                         ctdb_addr_to_str(&vnn->public_address)));
1163                         }
1164                 }
1165         }
1166
1167         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1168                 timeval_current_ofs(30, 0), 
1169                 ctdb_check_interfaces_event, ctdb);
1170 }
1171
1172
1173 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1174 {
1175         if (ctdb->check_public_ifaces_ctx != NULL) {
1176                 talloc_free(ctdb->check_public_ifaces_ctx);
1177                 ctdb->check_public_ifaces_ctx = NULL;
1178         }
1179
1180         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1181         if (ctdb->check_public_ifaces_ctx == NULL) {
1182                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1183         }
1184
1185         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1186                 timeval_current_ofs(30, 0), 
1187                 ctdb_check_interfaces_event, ctdb);
1188
1189         return 0;
1190 }
1191
1192
1193 /*
1194   setup the public address lists from a file
1195 */
1196 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1197 {
1198         char **lines;
1199         int nlines;
1200         int i;
1201
1202         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1203         if (lines == NULL) {
1204                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1205                 return -1;
1206         }
1207         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1208                 nlines--;
1209         }
1210
1211         for (i=0;i<nlines;i++) {
1212                 unsigned mask;
1213                 ctdb_sock_addr addr;
1214                 const char *addrstr;
1215                 const char *ifaces;
1216                 char *tok, *line;
1217
1218                 line = lines[i];
1219                 while ((*line == ' ') || (*line == '\t')) {
1220                         line++;
1221                 }
1222                 if (*line == '#') {
1223                         continue;
1224                 }
1225                 if (strcmp(line, "") == 0) {
1226                         continue;
1227                 }
1228                 tok = strtok(line, " \t");
1229                 addrstr = tok;
1230                 tok = strtok(NULL, " \t");
1231                 if (tok == NULL) {
1232                         if (NULL == ctdb->default_public_interface) {
1233                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1234                                          i+1));
1235                                 talloc_free(lines);
1236                                 return -1;
1237                         }
1238                         ifaces = ctdb->default_public_interface;
1239                 } else {
1240                         ifaces = tok;
1241                 }
1242
1243                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1244                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1245                         talloc_free(lines);
1246                         return -1;
1247                 }
1248                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1249                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1250                         talloc_free(lines);
1251                         return -1;
1252                 }
1253         }
1254
1255
1256         talloc_free(lines);
1257         return 0;
1258 }
1259
1260 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1261                               const char *iface,
1262                               const char *ip)
1263 {
1264         struct ctdb_vnn *svnn;
1265         struct ctdb_iface *cur = NULL;
1266         bool ok;
1267         int ret;
1268
1269         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1270         CTDB_NO_MEMORY(ctdb, svnn);
1271
1272         svnn->ifaces = talloc_array(svnn, const char *, 2);
1273         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1274         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1275         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1276         svnn->ifaces[1] = NULL;
1277
1278         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1279         if (!ok) {
1280                 talloc_free(svnn);
1281                 return -1;
1282         }
1283
1284         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1285         if (ret != 0) {
1286                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1287                                    "for single_ip[%s]\n",
1288                                    svnn->ifaces[0],
1289                                    ctdb_addr_to_str(&svnn->public_address)));
1290                 talloc_free(svnn);
1291                 return -1;
1292         }
1293
1294         /* assume the single public ip interface is initially "good" */
1295         cur = ctdb_find_iface(ctdb, iface);
1296         if (cur == NULL) {
1297                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1298                 return -1;
1299         }
1300         cur->link_up = true;
1301
1302         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1303         if (ret != 0) {
1304                 talloc_free(svnn);
1305                 return -1;
1306         }
1307
1308         ctdb->single_ip_vnn = svnn;
1309         return 0;
1310 }
1311
1312 struct ctdb_public_ip_list {
1313         struct ctdb_public_ip_list *next;
1314         uint32_t pnn;
1315         ctdb_sock_addr addr;
1316 };
1317
1318 /* Given a physical node, return the number of
1319    public addresses that is currently assigned to this node.
1320 */
1321 static int node_ip_coverage(struct ctdb_context *ctdb, 
1322         int32_t pnn,
1323         struct ctdb_public_ip_list *ips)
1324 {
1325         int num=0;
1326
1327         for (;ips;ips=ips->next) {
1328                 if (ips->pnn == pnn) {
1329                         num++;
1330                 }
1331         }
1332         return num;
1333 }
1334
1335
1336 /* Can the given node host the given IP: is the public IP known to the
1337  * node and is NOIPHOST unset?
1338 */
1339 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1340                              struct ctdb_ipflags ipflags,
1341                              struct ctdb_public_ip_list *ip)
1342 {
1343         struct ctdb_all_public_ips *public_ips;
1344         int i;
1345
1346         if (ipflags.noiphost) {
1347                 return false;
1348         }
1349
1350         public_ips = ctdb->nodes[pnn]->available_public_ips;
1351
1352         if (public_ips == NULL) {
1353                 return false;
1354         }
1355
1356         for (i=0; i<public_ips->num; i++) {
1357                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1358                         /* yes, this node can serve this public ip */
1359                         return true;
1360                 }
1361         }
1362
1363         return false;
1364 }
1365
1366 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1367                                  struct ctdb_ipflags ipflags,
1368                                  struct ctdb_public_ip_list *ip)
1369 {
1370         if (ipflags.noiptakeover) {
1371                 return false;
1372         }
1373
1374         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1375 }
1376
1377 /* search the node lists list for a node to takeover this ip.
1378    pick the node that currently are serving the least number of ips
1379    so that the ips get spread out evenly.
1380 */
1381 static int find_takeover_node(struct ctdb_context *ctdb, 
1382                 struct ctdb_ipflags *ipflags,
1383                 struct ctdb_public_ip_list *ip,
1384                 struct ctdb_public_ip_list *all_ips)
1385 {
1386         int pnn, min=0, num;
1387         int i, numnodes;
1388
1389         numnodes = talloc_array_length(ipflags);
1390         pnn    = -1;
1391         for (i=0; i<numnodes; i++) {
1392                 /* verify that this node can serve this ip */
1393                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1394                         /* no it couldnt   so skip to the next node */
1395                         continue;
1396                 }
1397
1398                 num = node_ip_coverage(ctdb, i, all_ips);
1399                 /* was this the first node we checked ? */
1400                 if (pnn == -1) {
1401                         pnn = i;
1402                         min  = num;
1403                 } else {
1404                         if (num < min) {
1405                                 pnn = i;
1406                                 min  = num;
1407                         }
1408                 }
1409         }       
1410         if (pnn == -1) {
1411                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1412                         ctdb_addr_to_str(&ip->addr)));
1413
1414                 return -1;
1415         }
1416
1417         ip->pnn = pnn;
1418         return 0;
1419 }
1420
1421 #define IP_KEYLEN       4
1422 static uint32_t *ip_key(ctdb_sock_addr *ip)
1423 {
1424         static uint32_t key[IP_KEYLEN];
1425
1426         bzero(key, sizeof(key));
1427
1428         switch (ip->sa.sa_family) {
1429         case AF_INET:
1430                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1431                 break;
1432         case AF_INET6: {
1433                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1434                 key[0]  = htonl(s6_a32[0]);
1435                 key[1]  = htonl(s6_a32[1]);
1436                 key[2]  = htonl(s6_a32[2]);
1437                 key[3]  = htonl(s6_a32[3]);
1438                 break;
1439         }
1440         default:
1441                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1442                 return key;
1443         }
1444
1445         return key;
1446 }
1447
1448 static void *add_ip_callback(void *parm, void *data)
1449 {
1450         struct ctdb_public_ip_list *this_ip = parm; 
1451         struct ctdb_public_ip_list *prev_ip = data; 
1452
1453         if (prev_ip == NULL) {
1454                 return parm;
1455         }
1456         if (this_ip->pnn == -1) {
1457                 this_ip->pnn = prev_ip->pnn;
1458         }
1459
1460         return parm;
1461 }
1462
1463 static int getips_count_callback(void *param, void *data)
1464 {
1465         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1466         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1467
1468         new_ip->next = *ip_list;
1469         *ip_list     = new_ip;
1470         return 0;
1471 }
1472
1473 static struct ctdb_public_ip_list *
1474 create_merged_ip_list(struct ctdb_context *ctdb)
1475 {
1476         int i, j;
1477         struct ctdb_public_ip_list *ip_list;
1478         struct ctdb_all_public_ips *public_ips;
1479
1480         if (ctdb->ip_tree != NULL) {
1481                 talloc_free(ctdb->ip_tree);
1482                 ctdb->ip_tree = NULL;
1483         }
1484         ctdb->ip_tree = trbt_create(ctdb, 0);
1485
1486         for (i=0;i<ctdb->num_nodes;i++) {
1487                 public_ips = ctdb->nodes[i]->known_public_ips;
1488
1489                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1490                         continue;
1491                 }
1492
1493                 /* there were no public ips for this node */
1494                 if (public_ips == NULL) {
1495                         continue;
1496                 }               
1497
1498                 for (j=0;j<public_ips->num;j++) {
1499                         struct ctdb_public_ip_list *tmp_ip; 
1500
1501                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1502                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503                         /* Do not use information about IP addresses hosted
1504                          * on other nodes, it may not be accurate */
1505                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1507                         } else {
1508                                 tmp_ip->pnn = -1;
1509                         }
1510                         tmp_ip->addr = public_ips->ips[j].addr;
1511                         tmp_ip->next = NULL;
1512
1513                         trbt_insertarray32_callback(ctdb->ip_tree,
1514                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1515                                 add_ip_callback,
1516                                 tmp_ip);
1517                 }
1518         }
1519
1520         ip_list = NULL;
1521         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1522
1523         return ip_list;
1524 }
1525
1526 /* 
1527  * This is the length of the longtest common prefix between the IPs.
1528  * It is calculated by XOR-ing the 2 IPs together and counting the
1529  * number of leading zeroes.  The implementation means that all
1530  * addresses end up being 128 bits long.
1531  *
1532  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534  * lots of nodes and IP addresses?
1535  */
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1537 {
1538         uint32_t ip1_k[IP_KEYLEN];
1539         uint32_t *t;
1540         int i;
1541         uint32_t x;
1542
1543         uint32_t distance = 0;
1544
1545         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1546         t = ip_key(ip2);
1547         for (i=0; i<IP_KEYLEN; i++) {
1548                 x = ip1_k[i] ^ t[i];
1549                 if (x == 0) {
1550                         distance += 32;
1551                 } else {
1552                         /* Count number of leading zeroes. 
1553                          * FIXME? This could be optimised...
1554                          */
1555                         while ((x & (1 << 31)) == 0) {
1556                                 x <<= 1;
1557                                 distance += 1;
1558                         }
1559                 }
1560         }
1561
1562         return distance;
1563 }
1564
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566    given node.  The ips argument is generally the all_ips variable
1567    used in the main part of the algorithm.
1568  */
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570                                   struct ctdb_public_ip_list *ips,
1571                                   int pnn)
1572 {
1573         struct ctdb_public_ip_list *t;
1574         uint32_t d;
1575
1576         uint32_t sum = 0;
1577
1578         for (t=ips; t != NULL; t=t->next) {
1579                 if (t->pnn != pnn) {
1580                         continue;
1581                 }
1582
1583                 /* Optimisation: We never calculate the distance
1584                  * between an address and itself.  This allows us to
1585                  * calculate the effect of removing an address from a
1586                  * node by simply calculating the distance between
1587                  * that address and all of the exitsing addresses.
1588                  * Moreover, we assume that we're only ever dealing
1589                  * with addresses from all_ips so we can identify an
1590                  * address via a pointer rather than doing a more
1591                  * expensive address comparison. */
1592                 if (&(t->addr) == ip) {
1593                         continue;
1594                 }
1595
1596                 d = ip_distance(ip, &(t->addr));
1597                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1598         }
1599
1600         return sum;
1601 }
1602
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
1604    to the given node.
1605  */
1606 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1607 {
1608         struct ctdb_public_ip_list *t;
1609
1610         uint32_t imbalance = 0;
1611
1612         for (t=all_ips; t!=NULL; t=t->next) {
1613                 if (t->pnn != pnn) {
1614                         continue;
1615                 }
1616                 /* Pass the rest of the IPs rather than the whole
1617                    all_ips input list.
1618                 */
1619                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1620         }
1621
1622         return imbalance;
1623 }
1624
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626  * finding the best node for each.
1627  */
1628 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1629                                       struct ctdb_ipflags *ipflags,
1630                                       struct ctdb_public_ip_list *all_ips)
1631 {
1632         struct ctdb_public_ip_list *tmp_ip;
1633
1634         /* loop over all ip's and find a physical node to cover for 
1635            each unassigned ip.
1636         */
1637         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638                 if (tmp_ip->pnn == -1) {
1639                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1640                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1641                                         ctdb_addr_to_str(&tmp_ip->addr)));
1642                         }
1643                 }
1644         }
1645 }
1646
1647 /* Basic non-deterministic rebalancing algorithm.
1648  */
1649 static void basic_failback(struct ctdb_context *ctdb,
1650                            struct ctdb_ipflags *ipflags,
1651                            struct ctdb_public_ip_list *all_ips,
1652                            int num_ips)
1653 {
1654         int i, numnodes;
1655         int maxnode, maxnum, minnode, minnum, num, retries;
1656         struct ctdb_public_ip_list *tmp_ip;
1657
1658         numnodes = talloc_array_length(ipflags);
1659         retries = 0;
1660
1661 try_again:
1662         maxnum=0;
1663         minnum=0;
1664
1665         /* for each ip address, loop over all nodes that can serve
1666            this ip and make sure that the difference between the node
1667            serving the most and the node serving the least ip's are
1668            not greater than 1.
1669         */
1670         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1671                 if (tmp_ip->pnn == -1) {
1672                         continue;
1673                 }
1674
1675                 /* Get the highest and lowest number of ips's served by any 
1676                    valid node which can serve this ip.
1677                 */
1678                 maxnode = -1;
1679                 minnode = -1;
1680                 for (i=0; i<numnodes; i++) {
1681                         /* only check nodes that can actually serve this ip */
1682                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1683                                 /* no it couldnt   so skip to the next node */
1684                                 continue;
1685                         }
1686
1687                         num = node_ip_coverage(ctdb, i, all_ips);
1688                         if (maxnode == -1) {
1689                                 maxnode = i;
1690                                 maxnum  = num;
1691                         } else {
1692                                 if (num > maxnum) {
1693                                         maxnode = i;
1694                                         maxnum  = num;
1695                                 }
1696                         }
1697                         if (minnode == -1) {
1698                                 minnode = i;
1699                                 minnum  = num;
1700                         } else {
1701                                 if (num < minnum) {
1702                                         minnode = i;
1703                                         minnum  = num;
1704                                 }
1705                         }
1706                 }
1707                 if (maxnode == -1) {
1708                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1709                                 ctdb_addr_to_str(&tmp_ip->addr)));
1710
1711                         continue;
1712                 }
1713
1714                 /* if the spread between the smallest and largest coverage by
1715                    a node is >=2 we steal one of the ips from the node with
1716                    most coverage to even things out a bit.
1717                    try to do this a limited number of times since we dont
1718                    want to spend too much time balancing the ip coverage.
1719                 */
1720                 if ( (maxnum > minnum+1)
1721                      && (retries < (num_ips + 5)) ){
1722                         struct ctdb_public_ip_list *tmp;
1723
1724                         /* Reassign one of maxnode's VNNs */
1725                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1726                                 if (tmp->pnn == maxnode) {
1727                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1728                                         retries++;
1729                                         goto try_again;;
1730                                 }
1731                         }
1732                 }
1733         }
1734 }
1735
1736 struct ctdb_rebalancenodes {
1737         struct ctdb_rebalancenodes *next;
1738         uint32_t pnn;
1739 };
1740 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1741
1742
1743 /* set this flag to force the node to be rebalanced even if it just didnt
1744    become healthy again.
1745 */
1746 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1747 {
1748         struct ctdb_rebalancenodes *rebalance;
1749
1750         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1751                 if (rebalance->pnn == pnn) {
1752                         return;
1753                 }
1754         }
1755
1756         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1757         rebalance->pnn = pnn;
1758         rebalance->next = force_rebalance_list;
1759         force_rebalance_list = rebalance;
1760 }
1761
1762 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1763  * that we can unit test it.
1764  */
1765 static void lcp2_init(struct ctdb_context *tmp_ctx,
1766                       struct ctdb_ipflags *ipflags,
1767                       struct ctdb_public_ip_list *all_ips,
1768                       uint32_t **lcp2_imbalances,
1769                       bool **rebalance_candidates)
1770 {
1771         int i, numnodes;
1772         struct ctdb_public_ip_list *tmp_ip;
1773
1774         numnodes = talloc_array_length(ipflags);
1775
1776         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1777         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1778         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1779         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1780
1781         for (i=0; i<numnodes; i++) {
1782                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1783                 /* First step: assume all nodes are candidates */
1784                 (*rebalance_candidates)[i] = true;
1785         }
1786
1787         /* 2nd step: if a node has IPs assigned then it must have been
1788          * healthy before, so we remove it from consideration.  This
1789          * is overkill but is all we have because we don't maintain
1790          * state between takeover runs.  An alternative would be to
1791          * keep state and invalidate it every time the recovery master
1792          * changes.
1793          */
1794         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1795                 if (tmp_ip->pnn != -1) {
1796                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1797                 }
1798         }
1799
1800         /* 3rd step: if a node is forced to re-balance then
1801            we allow failback onto the node */
1802         while (force_rebalance_list != NULL) {
1803                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1804
1805                 if (force_rebalance_list->pnn <= numnodes) {
1806                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1807                 }
1808
1809                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1810                 talloc_free(force_rebalance_list);
1811                 force_rebalance_list = next;
1812         }
1813 }
1814
1815 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1816  * the IP/node combination that will cost the least.
1817  */
1818 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1819                                      struct ctdb_ipflags *ipflags,
1820                                      struct ctdb_public_ip_list *all_ips,
1821                                      uint32_t *lcp2_imbalances)
1822 {
1823         struct ctdb_public_ip_list *tmp_ip;
1824         int dstnode, numnodes;
1825
1826         int minnode;
1827         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1828         struct ctdb_public_ip_list *minip;
1829
1830         bool should_loop = true;
1831         bool have_unassigned = true;
1832
1833         numnodes = talloc_array_length(ipflags);
1834
1835         while (have_unassigned && should_loop) {
1836                 should_loop = false;
1837
1838                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1839                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1840
1841                 minnode = -1;
1842                 mindsum = 0;
1843                 minip = NULL;
1844
1845                 /* loop over each unassigned ip. */
1846                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1847                         if (tmp_ip->pnn != -1) {
1848                                 continue;
1849                         }
1850
1851                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1852                                 /* only check nodes that can actually takeover this ip */
1853                                 if (!can_node_takeover_ip(ctdb, dstnode,
1854                                                           ipflags[dstnode],
1855                                                           tmp_ip)) {
1856                                         /* no it couldnt   so skip to the next node */
1857                                         continue;
1858                                 }
1859
1860                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1861                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1862                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1863                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1864                                                    dstnode,
1865                                                    dstimbl - lcp2_imbalances[dstnode]));
1866
1867
1868                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1869                                         minnode = dstnode;
1870                                         minimbl = dstimbl;
1871                                         mindsum = dstdsum;
1872                                         minip = tmp_ip;
1873                                         should_loop = true;
1874                                 }
1875                         }
1876                 }
1877
1878                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1879
1880                 /* If we found one then assign it to the given node. */
1881                 if (minnode != -1) {
1882                         minip->pnn = minnode;
1883                         lcp2_imbalances[minnode] = minimbl;
1884                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1885                                           ctdb_addr_to_str(&(minip->addr)),
1886                                           minnode,
1887                                           mindsum));
1888                 }
1889
1890                 /* There might be a better way but at least this is clear. */
1891                 have_unassigned = false;
1892                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1893                         if (tmp_ip->pnn == -1) {
1894                                 have_unassigned = true;
1895                         }
1896                 }
1897         }
1898
1899         /* We know if we have an unassigned addresses so we might as
1900          * well optimise.
1901          */
1902         if (have_unassigned) {
1903                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1904                         if (tmp_ip->pnn == -1) {
1905                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1906                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1907                         }
1908                 }
1909         }
1910 }
1911
1912 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1913  * to move IPs from, determines the best IP/destination node
1914  * combination to move from the source node.
1915  */
1916 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1917                                     struct ctdb_ipflags *ipflags,
1918                                     struct ctdb_public_ip_list *all_ips,
1919                                     int srcnode,
1920                                     uint32_t candimbl,
1921                                     uint32_t *lcp2_imbalances,
1922                                     bool *rebalance_candidates)
1923 {
1924         int dstnode, mindstnode, numnodes;
1925         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1926         uint32_t minsrcimbl, mindstimbl;
1927         struct ctdb_public_ip_list *minip;
1928         struct ctdb_public_ip_list *tmp_ip;
1929
1930         /* Find an IP and destination node that best reduces imbalance. */
1931         srcimbl = 0;
1932         minip = NULL;
1933         minsrcimbl = 0;
1934         mindstnode = -1;
1935         mindstimbl = 0;
1936
1937         numnodes = talloc_array_length(ipflags);
1938
1939         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1940         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1941
1942         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1943                 /* Only consider addresses on srcnode. */
1944                 if (tmp_ip->pnn != srcnode) {
1945                         continue;
1946                 }
1947
1948                 /* What is this IP address costing the source node? */
1949                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1950                 srcimbl = candimbl - srcdsum;
1951
1952                 /* Consider this IP address would cost each potential
1953                  * destination node.  Destination nodes are limited to
1954                  * those that are newly healthy, since we don't want
1955                  * to do gratuitous failover of IPs just to make minor
1956                  * balance improvements.
1957                  */
1958                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1959                         if (!rebalance_candidates[dstnode]) {
1960                                 continue;
1961                         }
1962
1963                         /* only check nodes that can actually takeover this ip */
1964                         if (!can_node_takeover_ip(ctdb, dstnode,
1965                                                   ipflags[dstnode], tmp_ip)) {
1966                                 /* no it couldnt   so skip to the next node */
1967                                 continue;
1968                         }
1969
1970                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1971                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1972                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1973                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1974                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1975                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1976
1977                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1978                             ((mindstnode == -1) ||                              \
1979                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1980
1981                                 minip = tmp_ip;
1982                                 minsrcimbl = srcimbl;
1983                                 mindstnode = dstnode;
1984                                 mindstimbl = dstimbl;
1985                         }
1986                 }
1987         }
1988         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1989
1990         if (mindstnode != -1) {
1991                 /* We found a move that makes things better... */
1992                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1993                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1994                                   ctdb_addr_to_str(&(minip->addr)),
1995                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1996
1997
1998                 lcp2_imbalances[srcnode] = srcimbl;
1999                 lcp2_imbalances[mindstnode] = mindstimbl;
2000                 minip->pnn = mindstnode;
2001
2002                 return true;
2003         }
2004
2005         return false;
2006         
2007 }
2008
/* A node number paired with that node's LCP2 imbalance, so that nodes
 * can be sorted by imbalance.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders by
 * imbalance, largest first: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;
        uint32_t x = lhs->imbalance;
        uint32_t y = rhs->imbalance;

        /* (x < y) - (x > y) is 1, 0 or -1: descending order. */
        return (int)(x < y) - (int)(x > y);
}
2027
2028 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2029  * node with the highest LCP2 imbalance, and then determines the best
2030  * IP/destination node combination to move from the source node.
2031  */
2032 static void lcp2_failback(struct ctdb_context *ctdb,
2033                           struct ctdb_ipflags *ipflags,
2034                           struct ctdb_public_ip_list *all_ips,
2035                           uint32_t *lcp2_imbalances,
2036                           bool *rebalance_candidates)
2037 {
2038         int i, num_rebalance_candidates, numnodes;
2039         struct lcp2_imbalance_pnn * lips;
2040         bool again;
2041
2042         numnodes = talloc_array_length(ipflags);
2043
2044 try_again:
2045
2046         /* It is only worth continuing if we have suitable target
2047          * nodes to transfer IPs to.  This check is much cheaper than
2048          * continuing on...
2049          */
2050         num_rebalance_candidates = 0;
2051         for (i=0; i<numnodes; i++) {
2052                 if (rebalance_candidates[i]) {
2053                         num_rebalance_candidates++;
2054                 }
2055         }
2056         if (num_rebalance_candidates == 0) {
2057                 return;
2058         }
2059
2060         /* Put the imbalances and nodes into an array, sort them and
2061          * iterate through candidates.  Usually the 1st one will be
2062          * used, so this doesn't cost much...
2063          */
2064         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2065         for (i=0; i<numnodes; i++) {
2066                 lips[i].imbalance = lcp2_imbalances[i];
2067                 lips[i].pnn = i;
2068         }
2069         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2070               lcp2_cmp_imbalance_pnn);
2071
2072         again = false;
2073         for (i=0; i<numnodes; i++) {
2074                 /* This means that all nodes had 0 or 1 addresses, so
2075                  * can't be imbalanced.
2076                  */
2077                 if (lips[i].imbalance == 0) {
2078                         break;
2079                 }
2080
2081                 if (lcp2_failback_candidate(ctdb,
2082                                             ipflags,
2083                                             all_ips,
2084                                             lips[i].pnn,
2085                                             lips[i].imbalance,
2086                                             lcp2_imbalances,
2087                                             rebalance_candidates)) {
2088                         again = true;
2089                         break;
2090                 }
2091         }
2092
2093         talloc_free(lips);
2094         if (again) {
2095                 goto try_again;
2096         }
2097 }
2098
2099 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2100                                     struct ctdb_ipflags *ipflags,
2101                                     struct ctdb_public_ip_list *all_ips)
2102 {
2103         struct ctdb_public_ip_list *tmp_ip;
2104
2105         /* verify that the assigned nodes can serve that public ip
2106            and set it to -1 if not
2107         */
2108         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2109                 if (tmp_ip->pnn == -1) {
2110                         continue;
2111                 }
2112                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2113                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2114                         /* this node can not serve this ip. */
2115                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2116                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2117                                            tmp_ip->pnn));
2118                         tmp_ip->pnn = -1;
2119                 }
2120         }
2121 }
2122
2123 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2124                                        struct ctdb_ipflags *ipflags,
2125                                        struct ctdb_public_ip_list *all_ips)
2126 {
2127         struct ctdb_public_ip_list *tmp_ip;
2128         int i, numnodes;
2129
2130         numnodes = talloc_array_length(ipflags);
2131
2132         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2133        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2134         *  always be allocated the same way for a specific set of
2135         *  available/unavailable nodes.
2136         */
2137
2138         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2139                 tmp_ip->pnn = i % numnodes;
2140         }
2141
2142         /* IP failback doesn't make sense with deterministic
2143          * IPs, since the modulo step above implicitly fails
2144          * back IPs to their "home" node.
2145          */
2146         if (1 == ctdb->tunable.no_ip_failback) {
2147                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2148         }
2149
2150         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2151
2152         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2153
2154         /* No failback here! */
2155 }
2156
2157 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2158                                           struct ctdb_ipflags *ipflags,
2159                                           struct ctdb_public_ip_list *all_ips)
2160 {
2161         /* This should be pushed down into basic_failback. */
2162         struct ctdb_public_ip_list *tmp_ip;
2163         int num_ips = 0;
2164         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2165                 num_ips++;
2166         }
2167
2168         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2169
2170         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2171
2172         /* If we don't want IPs to fail back then don't rebalance IPs. */
2173         if (1 == ctdb->tunable.no_ip_failback) {
2174                 return;
2175         }
2176
2177         /* Now, try to make sure the ip adresses are evenly distributed
2178            across the nodes.
2179         */
2180         basic_failback(ctdb, ipflags, all_ips, num_ips);
2181 }
2182
2183 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2184                           struct ctdb_ipflags *ipflags,
2185                           struct ctdb_public_ip_list *all_ips)
2186 {
2187         uint32_t *lcp2_imbalances;
2188         bool *rebalance_candidates;
2189
2190         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2191
2192         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2193
2194         lcp2_init(tmp_ctx, ipflags, all_ips,
2195                   &lcp2_imbalances, &rebalance_candidates);
2196
2197         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2198
2199         /* If we don't want IPs to fail back then don't rebalance IPs. */
2200         if (1 == ctdb->tunable.no_ip_failback) {
2201                 goto finished;
2202         }
2203
2204         /* Now, try to make sure the ip adresses are evenly distributed
2205            across the nodes.
2206         */
2207         lcp2_failback(ctdb, ipflags, all_ips,
2208                       lcp2_imbalances, rebalance_candidates);
2209
2210 finished:
2211         talloc_free(tmp_ctx);
2212 }
2213
2214 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2215 {
2216         int i, num_healthy;
2217
2218         /* Count how many completely healthy nodes we have */
2219         num_healthy = 0;
2220         for (i=0;i<nodemap->num;i++) {
2221                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2222                         num_healthy++;
2223                 }
2224         }
2225
2226         return num_healthy == 0;
2227 }
2228
2229 /* The calculation part of the IP allocation algorithm. */
2230 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2231                                    struct ctdb_ipflags *ipflags,
2232                                    struct ctdb_public_ip_list **all_ips_p)
2233 {
2234         /* since nodes only know about those public addresses that
2235            can be served by that particular node, no single node has
2236            a full list of all public addresses that exist in the cluster.
2237            Walk over all node structures and create a merged list of
2238            all public addresses that exist in the cluster.
2239
2240            keep the tree of ips around as ctdb->ip_tree
2241         */
2242         *all_ips_p = create_merged_ip_list(ctdb);
2243
2244         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2245                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p);
2246         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2247                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2248         } else {
2249                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2250         }
2251
2252         /* at this point ->pnn is the node which will own each IP
2253            or -1 if there is no node that can cover this ip
2254         */
2255
2256         return;
2257 }
2258
2259 struct get_tunable_callback_data {
2260         const char *tunable;
2261         uint32_t *out;
2262         bool fatal;
2263 };
2264
2265 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2266                                  int32_t res, TDB_DATA outdata,
2267                                  void *callback)
2268 {
2269         struct get_tunable_callback_data *cd =
2270                 (struct get_tunable_callback_data *)callback;
2271         int size;
2272
2273         if (res != 0) {
2274                 /* Already handled in fail callback */
2275                 return;
2276         }
2277
2278         if (outdata.dsize != sizeof(uint32_t)) {
2279                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2280                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2281                                  (int)outdata.dsize));
2282                 cd->fatal = true;
2283                 return;
2284         }
2285
2286         size = talloc_array_length(cd->out);
2287         if (pnn >= size) {
2288                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2289                                  cd->tunable, pnn, size));
2290                 return;
2291         }
2292
2293                 
2294         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2295 }
2296
2297 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2298                                        int32_t res, TDB_DATA outdata,
2299                                        void *callback)
2300 {
2301         struct get_tunable_callback_data *cd =
2302                 (struct get_tunable_callback_data *)callback;
2303
2304         switch (res) {
2305         case -ETIME:
2306                 DEBUG(DEBUG_ERR,
2307                       ("Timed out getting tunable \"%s\" from node %d\n",
2308                        cd->tunable, pnn));
2309                 cd->fatal = true;
2310                 break;
2311         case -EINVAL:
2312         case -1:
2313                 DEBUG(DEBUG_WARNING,
2314                       ("Tunable \"%s\" not implemented on node %d\n",
2315                        cd->tunable, pnn));
2316                 break;
2317         default:
2318                 DEBUG(DEBUG_ERR,
2319                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2320                        cd->tunable, pnn));
2321                 cd->fatal = true;
2322         }
2323 }
2324
2325 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2326                                         TALLOC_CTX *tmp_ctx,
2327                                         struct ctdb_node_map *nodemap,
2328                                         const char *tunable,
2329                                         uint32_t default_value)
2330 {
2331         TDB_DATA data;
2332         struct ctdb_control_get_tunable *t;
2333         uint32_t *nodes;
2334         uint32_t *tvals;
2335         struct get_tunable_callback_data callback_data;
2336         int i;
2337
2338         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2339         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2340         for (i=0; i<nodemap->num; i++) {
2341                 tvals[i] = default_value;
2342         }
2343                 
2344         callback_data.out = tvals;
2345         callback_data.tunable = tunable;
2346         callback_data.fatal = false;
2347
2348         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2349         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2350         t = (struct ctdb_control_get_tunable *)data.dptr;
2351         t->length = strlen(tunable)+1;
2352         memcpy(t->name, tunable, t->length);
2353         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2354         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2355                                       nodes, 0, TAKEOVER_TIMEOUT(),
2356                                       false, data,
2357                                       get_tunable_callback,
2358                                       get_tunable_fail_callback,
2359                                       &callback_data) != 0) {
2360                 if (callback_data.fatal) {
2361                         talloc_free(tvals);
2362                         tvals = NULL;
2363                 }
2364         }
2365         talloc_free(nodes);
2366         talloc_free(data.dptr);
2367
2368         return tvals;
2369 }
2370
/* State shared between get_runstate_from_nodes() and its callbacks */
struct get_runstate_callback_data {
        /* Per-node runstate array, indexed by PNN in the callback */
        enum ctdb_runstate *out;
        /* Set by a callback when the failure must abort the query */
        bool fatal;
};
2375
2376 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2377                                   int32_t res, TDB_DATA outdata,
2378                                   void *callback_data)
2379 {
2380         struct get_runstate_callback_data *cd =
2381                 (struct get_runstate_callback_data *)callback_data;
2382         int size;
2383
2384         if (res != 0) {
2385                 /* Already handled in fail callback */
2386                 return;
2387         }
2388
2389         if (outdata.dsize != sizeof(uint32_t)) {
2390                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2391                                  pnn, (int)sizeof(uint32_t),
2392                                  (int)outdata.dsize));
2393                 cd->fatal = true;
2394                 return;
2395         }
2396
2397         size = talloc_array_length(cd->out);
2398         if (pnn >= size) {
2399                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2400                                  pnn, size));
2401                 return;
2402         }
2403
2404         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2405 }
2406
2407 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2408                                        int32_t res, TDB_DATA outdata,
2409                                        void *callback)
2410 {
2411         struct get_runstate_callback_data *cd =
2412                 (struct get_runstate_callback_data *)callback;
2413
2414         switch (res) {
2415         case -ETIME:
2416                 DEBUG(DEBUG_ERR,
2417                       ("Timed out getting runstate from node %d\n", pnn));
2418                 cd->fatal = true;
2419                 break;
2420         default:
2421                 DEBUG(DEBUG_WARNING,
2422                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2423                        pnn));
2424         }
2425 }
2426
2427 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2428                                                     TALLOC_CTX *tmp_ctx,
2429                                                     struct ctdb_node_map *nodemap,
2430                                                     enum ctdb_runstate default_value)
2431 {
2432         uint32_t *nodes;
2433         enum ctdb_runstate *rs;
2434         struct get_runstate_callback_data callback_data;
2435         int i;
2436
2437         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2438         CTDB_NO_MEMORY_NULL(ctdb, rs);
2439         for (i=0; i<nodemap->num; i++) {
2440                 rs[i] = default_value;
2441         }
2442
2443         callback_data.out = rs;
2444         callback_data.fatal = false;
2445
2446         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2447         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2448                                       nodes, 0, TAKEOVER_TIMEOUT(),
2449                                       true, tdb_null,
2450                                       get_runstate_callback,
2451                                       get_runstate_fail_callback,
2452                                       &callback_data) != 0) {
2453                 if (callback_data.fatal) {
2454                         free(rs);
2455                         rs = NULL;
2456                 }
2457         }
2458         talloc_free(nodes);
2459
2460         return rs;
2461 }
2462
2463 /* Set internal flags for IP allocation:
2464  *   Clear ip flags
2465  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2466  *   Set NOIPHOST ip flag for each INACTIVE node
2467  *   if all nodes are disabled:
2468  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2469  *   else
2470  *     Set NOIPHOST ip flags for disabled nodes
2471  */
2472 static struct ctdb_ipflags *
2473 set_ipflags_internal(struct ctdb_context *ctdb,
2474                      TALLOC_CTX *tmp_ctx,
2475                      struct ctdb_node_map *nodemap,
2476                      uint32_t *tval_noiptakeover,
2477                      uint32_t *tval_noiphostonalldisabled,
2478                      enum ctdb_runstate *runstate)
2479 {
2480         int i;
2481         struct ctdb_ipflags *ipflags;
2482
2483         /* Clear IP flags - implicit due to talloc_zero */
2484         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2485         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2486
2487         for (i=0;i<nodemap->num;i++) {
2488                 /* Can not take IPs on node with NoIPTakeover set */
2489                 if (tval_noiptakeover[i] != 0) {
2490                         ipflags[i].noiptakeover = true;
2491                 }
2492
2493                 /* Can not host IPs on node not in RUNNING state */
2494                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2495                         ipflags[i].noiphost = true;
2496                         continue;
2497                 }
2498                 /* Can not host IPs on INACTIVE node */
2499                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2500                         ipflags[i].noiphost = true;
2501                 }
2502         }
2503
2504         if (all_nodes_are_disabled(nodemap)) {
2505                 /* If all nodes are disabled, can not host IPs on node
2506                  * with NoIPHostOnAllDisabled set
2507                  */
2508                 for (i=0;i<nodemap->num;i++) {
2509                         if (tval_noiphostonalldisabled[i] != 0) {
2510                                 ipflags[i].noiphost = true;
2511                         }
2512                 }
2513         } else {
2514                 /* If some nodes are not disabled, then can not host
2515                  * IPs on DISABLED node
2516                  */
2517                 for (i=0;i<nodemap->num;i++) {
2518                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2519                                 ipflags[i].noiphost = true;
2520                         }
2521                 }
2522         }
2523
2524         return ipflags;
2525 }
2526
2527 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2528                                         TALLOC_CTX *tmp_ctx,
2529                                         struct ctdb_node_map *nodemap)
2530 {
2531         uint32_t *tval_noiptakeover;
2532         uint32_t *tval_noiphostonalldisabled;
2533         struct ctdb_ipflags *ipflags;
2534         enum ctdb_runstate *runstate;
2535
2536
2537         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2538                                                    "NoIPTakeover", 0);
2539         if (tval_noiptakeover == NULL) {
2540                 return NULL;
2541         }
2542
2543         tval_noiphostonalldisabled =
2544                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2545                                        "NoIPHostOnAllDisabled", 0);
2546         if (tval_noiphostonalldisabled == NULL) {
2547                 /* Caller frees tmp_ctx */
2548                 return NULL;
2549         }
2550
2551         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2552          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2553          * reasonable behaviour on a mixed cluster during upgrade.
2554          */
2555         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2556                                            CTDB_RUNSTATE_RUNNING);
2557         if (runstate == NULL) {
2558                 /* Caller frees tmp_ctx */
2559                 return NULL;
2560         }
2561
2562         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2563                                        tval_noiptakeover,
2564                                        tval_noiphostonalldisabled,
2565                                        runstate);
2566
2567         talloc_free(tval_noiptakeover);
2568         talloc_free(tval_noiphostonalldisabled);
2569         talloc_free(runstate);
2570
2571         return ipflags;
2572 }
2573
/* State passed to iprealloc_fail_callback() */
struct iprealloc_callback_data {
        /* Per-node flag, indexed by PNN: true if the node should be
         * retried */
        bool *retry_nodes;
        /* Number of times a retry was flagged */
        int retry_count;
        /* Caller-supplied failure callback, used for timeouts */
        client_async_callback fail_callback;
        /* Opaque argument handed to fail_callback */
        void *fail_callback_data;
        /* Node map used to look up the flags of the failing node */
        struct ctdb_node_map *nodemap;
};
2581
2582 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2583                                         int32_t res, TDB_DATA outdata,
2584                                         void *callback)
2585 {
2586         int numnodes;
2587         struct iprealloc_callback_data *cd =
2588                 (struct iprealloc_callback_data *)callback;
2589
2590         switch (res) {
2591         case -ETIME:
2592                 /* If the control timed out then that's a real error,
2593                  * so call the real fail callback
2594                  */
2595                 cd->fail_callback(ctdb, pnn, res, outdata,
2596                                   cd->fail_callback_data);
2597                 break;
2598         default:
2599                 /* If not a timeout then either the ipreallocated
2600                  * eventscript (or some setup) failed.  This might
2601                  * have failed because the IPREALLOCATED control isn't
2602                  * implemented - right now there is no way of knowing
2603                  * because the error codes are all folded down to -1.
2604                  * Consider retrying using EVENTSCRIPT control...
2605                  */
2606
2607                 numnodes = talloc_array_length(cd->retry_nodes);
2608                 if (pnn > numnodes) {
2609                         DEBUG(DEBUG_ERR,
2610                               ("ipreallocated failure from node %d, but only %d nodes in nodemap\n",
2611                                pnn, numnodes));
2612                         return;
2613                 }
2614
2615                 /* Can't run the "ipreallocated" event on a STOPPED node */
2616                 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2617                         DEBUG(DEBUG_ERR,
2618                               ("ipreallocated failure from node %d, but node is stopped - not flagging a retry\n",
2619                                pnn));
2620                         return;
2621                 }
2622
2623                 DEBUG(DEBUG_WARNING,
2624                       ("ipreallocated failure from node %d, flagging retry\n",
2625                        pnn));
2626                 cd->retry_nodes[pnn] = true;
2627                 cd->retry_count++;
2628         }
2629 }
2630
/* State passed to takeover_run_fail_callback() */
struct takeover_callback_data {
        /* Per-node flag, indexed like nodemap->nodes: true once a
         * failure has been reported for that node */
        bool *node_failed;
        /* Caller-supplied failure callback */
        client_async_callback fail_callback;
        /* Opaque argument handed to fail_callback */
        void *fail_callback_data;
        /* Node map used to translate a PNN into an index */
        struct ctdb_node_map *nodemap;
};
2637
2638 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2639                                        uint32_t node_pnn, int32_t res,
2640                                        TDB_DATA outdata, void *callback_data)
2641 {
2642         struct takeover_callback_data *cd =
2643                 talloc_get_type_abort(callback_data,
2644                                       struct takeover_callback_data);
2645         int i;
2646
2647         for (i = 0; i < cd->nodemap->num; i++) {
2648                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2649                         break;
2650                 }
2651         }
2652
2653         if (i == cd->nodemap->num) {
2654                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2655                 return;
2656         }
2657
2658         if (!cd->node_failed[i]) {
2659                 cd->node_failed[i] = true;
2660                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2661                                   cd->fail_callback_data);
2662         }
2663 }
2664
/*
  make any IP alias changes for public addresses that are necessary

  The run has three phases:
    1. tell every connected node to release the public IPs it should
       not host
    2. tell each node to take over the IPs assigned to it
    3. tell every connected node to process the "ipreallocated" event

  Phases 1 and 2 are skipped when the disable_ip_failover tunable is
  set.

  fail_callback/callback_data are invoked for nodes whose controls
  fail.

  Returns -1 if the IP flags can not be computed or if a release or
  takeover control can not be sent or fails; returns 0 otherwise.  A
  failure in phase 3 is only logged and does not change the return
  value.
 */
int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
                      client_async_callback fail_callback, void *callback_data)
{
        int i, j;
        struct ctdb_public_ip ip;
        struct ctdb_public_ipv4 ipv4;
        uint32_t *nodes;
        struct ctdb_public_ip_list *all_ips, *tmp_ip;
        TDB_DATA data;
        struct timeval timeout;
        struct client_async_data *async_data;
        struct ctdb_client_control_state *state;
        /* Scratch context: everything allocated below hangs off it and
         * is released on every exit path */
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        uint32_t disable_timeout;
        struct ctdb_ipflags *ipflags;
        struct takeover_callback_data *takeover_data;
        struct iprealloc_callback_data iprealloc_data;
        bool *retry_data;

        /*
         * ip failover is completely disabled, just send out the
         * ipreallocated event.
         */
        if (ctdb->tunable.disable_ip_failover != 0) {
                goto ipreallocated;
        }

        ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
        if (ipflags == NULL) {
                DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

        ZERO_STRUCT(ip);

        /* Do the IP reassignment calculations */
        ctdb_takeover_run_core(ctdb, ipflags, &all_ips);

        /* The recovery daemon does regular sanity checks of the IPs.
         * However, sometimes it is overzealous and thinks changes are
         * required when they're already underway.  This stops the
         * checks for a while before we start moving IPs.
         */
        disable_timeout = ctdb->tunable.takeover_timeout;
        data.dptr  = (uint8_t*)&disable_timeout;
        data.dsize = sizeof(disable_timeout);
        if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
                                     CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
                /* Best effort: failing to pause the checks is not fatal */
                DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
        }

        /* Now tell all nodes to release any public IPs they should
         * not host.  This will be a NOOP on nodes that don't
         * currently hold the given IP.
         */
        takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
        CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);

        takeover_data->node_failed = talloc_zero_array(tmp_ctx,
                                                       bool, nodemap->num);
        CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
        takeover_data->fail_callback = fail_callback;
        takeover_data->fail_callback_data = callback_data;
        takeover_data->nodemap = nodemap;

        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        /* Each node gets one release control per IP, so route failures
         * through takeover_run_fail_callback, which reports a node to
         * fail_callback only once.
         */
        async_data->fail_callback = takeover_run_fail_callback;
        async_data->callback_data = takeover_data;

        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */
                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }

                for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                        if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
                                /* This node should be serving this
                                   vnn so dont tell it to release the ip
                                */
                                continue;
                        }
                        /* IPv4 addresses use the old IPv4-only control
                         * and structure, everything else uses the
                         * generic one
                         */
                        if (tmp_ip->addr.sa.sa_family == AF_INET) {
                                ipv4.pnn = tmp_ip->pnn;
                                ipv4.sin = tmp_ip->addr.ip;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ipv4);
                                data.dptr  = (uint8_t *)&ipv4;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IPv4, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        } else {
                                ip.pnn  = tmp_ip->pnn;
                                ip.addr = tmp_ip->addr;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ip);
                                data.dptr  = (uint8_t *)&ip;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IP, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        }

                        if (state == NULL) {
                                DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
                                talloc_free(tmp_ctx);
                                return -1;
                        }

                        ctdb_client_async_add(async_data, state);
                }
        }
        /* Wait for all outstanding release controls */
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
        talloc_free(async_data);


        /* tell all nodes to get their own IPs */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        /* In this phase failures go directly to the caller's callback */
        async_data->fail_callback = fail_callback;
        async_data->callback_data = callback_data;

        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        /* this IP won't be taken over */
                        continue;
                }

                if (tmp_ip->addr.sa.sa_family == AF_INET) {
                        ipv4.pnn = tmp_ip->pnn;
                        ipv4.sin = tmp_ip->addr.ip;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ipv4);
                        data.dptr  = (uint8_t *)&ipv4;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
                                        data, async_data,
                                        &timeout, NULL);
                } else {
                        ip.pnn  = tmp_ip->pnn;
                        ip.addr = tmp_ip->addr;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ip);
                        data.dptr  = (uint8_t *)&ip;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IP, 0,
                                        data, async_data,
                                        &timeout, NULL);
                }
                if (state == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
                        talloc_free(tmp_ctx);
                        return -1;
                }

                ctdb_client_async_add(async_data, state);
        }
        /* Wait for all outstanding takeover controls */
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

ipreallocated:
        /*
         * Tell all nodes to run eventscripts to process the
         * "ipreallocated" event.  This can do a lot of things,
         * including restarting services to reconfigure them if public
         * IPs have moved.  Once upon a time this event only used to
         * update natgw.
         */
        retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
        CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
        iprealloc_data.retry_nodes = retry_data;
        iprealloc_data.retry_count = 0;
        iprealloc_data.fail_callback = fail_callback;
        iprealloc_data.fail_callback_data = callback_data;
        iprealloc_data.nodemap = nodemap;

        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
                                      nodes, 0, TAKEOVER_TIMEOUT(),
                                      false, tdb_null,
                                      NULL, iprealloc_fail_callback,
                                      &iprealloc_data) != 0) {

                /* If the control failed then we should retry to any
                 * nodes flagged by iprealloc_fail_callback using the
                 * EVENTSCRIPT control.  This is a best-effort at
                 * backward compatibility when running a mixed cluster
                 * where some nodes have not yet been upgraded to
                 * support the IPREALLOCATED control.
                 */
                DEBUG(DEBUG_WARNING,
                      ("Retry ipreallocated to some nodes using eventscript control\n"));

                /* The previous node list stays on tmp_ctx and is
                 * released with it at the end.
                 */
                nodes = talloc_array(tmp_ctx, uint32_t,
                                     iprealloc_data.retry_count);
                CTDB_NO_MEMORY_FATAL(ctdb, nodes);

                /* NOTE(review): the retry flags are indexed by PNN in
                 * iprealloc_fail_callback, so the index is used as the
                 * destination PNN here - assumes PNN == nodemap index.
                 */
                j = 0;
                for (i=0; i<nodemap->num; i++) {
                        if (iprealloc_data.retry_nodes[i]) {
                                nodes[j] = i;
                                j++;
                        }
                }

                /* The payload is the event name including its NUL */
                data.dptr  = discard_const("ipreallocated");
                data.dsize = strlen((char *)data.dptr) + 1;
                if (ctdb_client_async_control(ctdb,
                                              CTDB_CONTROL_RUN_EVENTSCRIPTS,
                                              nodes, 0, TAKEOVER_TIMEOUT(),
                                              false, data,
                                              NULL, fail_callback,
                                              callback_data) != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
                }
        }

        talloc_free(tmp_ctx);
        return 0;
}
2904
2905
2906 /*
2907   destroy a ctdb_client_ip structure
2908  */
2909 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2910 {
2911         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2912                 ctdb_addr_to_str(&ip->addr),
2913                 ntohs(ip->addr.ip.sin_port),
2914                 ip->client_id));
2915
2916         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2917         return 0;
2918 }
2919
/*
  called by a client to inform us of a TCP connection that it is managing
  that should be tickled with an ACK when IP takeover is done
  we handle both the old ipv4 style of packets as well as the new ipv4/6
  pdus.

  Returns 0 on success, and also when the destination is not one of
  our public addresses.  Returns -1 if indata has an unexpected size,
  if the destination is a public address that this node does not
  currently hold, on allocation failure, or if the TCP_ADD broadcast
  can not be sent.
 */
int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
                                TDB_DATA indata)
{
        /* NOTE(review): the lookup result is used below without a NULL
         * check - confirm client_id is always valid here */
        struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
        struct ctdb_control_tcp *old_addr = NULL;
        struct ctdb_control_tcp_addr new_addr;
        struct ctdb_control_tcp_addr *tcp_sock = NULL;
        struct ctdb_tcp_list *tcp;
        struct ctdb_tcp_connection t;
        int ret;
        TDB_DATA data;
        struct ctdb_client_ip *ip;
        struct ctdb_vnn *vnn;
        ctdb_sock_addr addr;

        /* The payload size tells the old and new formats apart */
        switch (indata.dsize) {
        case sizeof(struct ctdb_control_tcp):
                /* Old ipv4-only format: convert into a local
                 * new-style structure */
                old_addr = (struct ctdb_control_tcp *)indata.dptr;
                ZERO_STRUCT(new_addr);
                tcp_sock = &new_addr;
                tcp_sock->src.ip  = old_addr->src;
                tcp_sock->dest.ip = old_addr->dest;
                break;
        case sizeof(struct ctdb_control_tcp_addr):
                /* New format: use the payload in place */
                tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
                break;
        default:
                DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
                                 "to ctdb_control_tcp_client. size was %d but "
                                 "only allowed sizes are %lu and %lu\n",
                                 (int)indata.dsize,
                                 (long unsigned)sizeof(struct ctdb_control_tcp),
                                 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
                return -1;
        }

        /* Canonicalize both endpoints in place; a copy is passed as
         * the input because the output is the original location */
        addr = tcp_sock->src;
        ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
        addr = tcp_sock->dest;
        ctdb_canonicalize_ip(&addr, &tcp_sock->dest);

        /* From here on addr holds the canonical destination address */
        ZERO_STRUCT(addr);
        memcpy(&addr, &tcp_sock->dest, sizeof(addr));
        vnn = find_public_ip_vnn(ctdb, &addr);
        if (vnn == NULL) {
                /* Not a public address: log it (except for IPv4
                 * loopback) and report success to the client */
                switch (addr.sa.sa_family) {
                case AF_INET:
                        if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
                                DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
                                        ctdb_addr_to_str(&addr)));
                        }
                        break;
                case AF_INET6:
                        DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
                                ctdb_addr_to_str(&addr)));
                        break;
                default:
                        DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
                }

                return 0;
        }

        if (vnn->pnn != ctdb->pnn) {
                DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
                        ctdb_addr_to_str(&addr),
                        client_id, client->pid));
                /* failing this call will tell smbd to die */
                return -1;
        }

        /* Record the client/IP association.  The record is a talloc
         * child of the client and its destructor unlinks it from
         * client_ip_list again.
         */
        ip = talloc(client, struct ctdb_client_ip);
        CTDB_NO_MEMORY(ctdb, ip);

        ip->ctdb      = ctdb;
        ip->addr      = addr;
        ip->client_id = client_id;
        talloc_set_destructor(ip, ctdb_client_ip_destructor);
        DLIST_ADD(ctdb->client_ip_list, ip);

        /* Remember the connection on the client's own tcp list */
        tcp = talloc(client, struct ctdb_tcp_list);
        CTDB_NO_MEMORY(ctdb, tcp);

        tcp->connection.src_addr = tcp_sock->src;
        tcp->connection.dst_addr = tcp_sock->dest;

        DLIST_ADD(client->tcp_list, tcp);

        /* Payload for the TCP_ADD broadcast below */
        t.src_addr = tcp_sock->src;
        t.dst_addr = tcp_sock->dest;

        data.dptr = (uint8_t *)&t;
        data.dsize = sizeof(t);

        switch (addr.sa.sa_family) {
        case AF_INET:
                DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
                        (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
                        ctdb_addr_to_str(&tcp_sock->src),
                        (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
                break;
        case AF_INET6:
                DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
                        (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
                        ctdb_addr_to_str(&tcp_sock->src),
                        (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
                break;
        default:
                DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
        }


        /* tell all nodes about this tcp connection */
        ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
                                       CTDB_CONTROL_TCP_ADD,
                                       0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
                return -1;
        }

        return 0;
}
3049
3050 /*
3051   find a tcp address on a list
3052  */
3053 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3054                                            struct ctdb_tcp_connection *tcp)
3055 {
3056         int i;
3057
3058         if (array == NULL) {
3059                 return NULL;
3060         }
3061
3062         for (i=0;i<array->num;i++) {
3063                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3064                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3065                         return &array->connections[i];
3066                 }
3067         }
3068         return NULL;
3069 }
3070
3071
3072
3073 /*
  called by a daemon to inform us of a TCP connection that one of its
  clients is managing and that should be tickled with an ACK when IP
  takeover is done
3077  */
3078 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3079 {
3080         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3081         struct ctdb_tcp_array *tcparray;
3082         struct ctdb_tcp_connection tcp;
3083         struct ctdb_vnn *vnn;
3084
3085         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3086         if (vnn == NULL) {
3087                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3088                         ctdb_addr_to_str(&p->dst_addr)));
3089
3090                 return -1;
3091         }
3092
3093
3094         tcparray = vnn->tcp_array;
3095
3096         /* If this is the first tickle */
3097         if (tcparray == NULL) {
3098                 tcparray = talloc_size(ctdb->nodes, 
3099                         offsetof(struct ctdb_tcp_array, connections) +
3100                         sizeof(struct ctdb_tcp_connection) * 1);
3101                 CTDB_NO_MEMORY(ctdb, tcparray);
3102                 vnn->tcp_array = tcparray;
3103
3104                 tcparray->num = 0;
3105                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3106                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3107
3108                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3109                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3110                 tcparray->num++;
3111
3112                 if (tcp_update_needed) {
3113                         vnn->tcp_update_needed = true;
3114                 }
3115                 return 0;
3116         }
3117
3118
3119         /* Do we already have this tickle ?*/
3120         tcp.src_addr = p->src_addr;
3121         tcp.dst_addr = p->dst_addr;
3122         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
3123                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3124                         ctdb_addr_to_str(&tcp.dst_addr),
3125                         ntohs(tcp.dst_addr.ip.sin_port),
3126                         vnn->pnn));
3127                 return 0;
3128         }
3129
3130         /* A new tickle, we must add it to the array */
3131         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3132                                         struct ctdb_tcp_connection,
3133                                         tcparray->num+1);
3134         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3135
3136         vnn->tcp_array = tcparray;
3137         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3138         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3139         tcparray->num++;
3140                                 
3141         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3142                 ctdb_addr_to_str(&tcp.dst_addr),
3143                 ntohs(tcp.dst_addr.ip.sin_port),
3144                 vnn->pnn));
3145
3146         if (tcp_update_needed) {
3147                 vnn->tcp_update_needed = true;
3148         }
3149
3150         return 0;
3151 }
3152
3153
3154 /*
3155   called by a daemon to inform us of a TCP connection that one of its
3156   clients managing that should tickled with an ACK when IP takeover is
3157   done
3158  */
3159 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3160 {
3161         struct ctdb_tcp_connection *tcpp;
3162         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3163
3164         if (vnn == NULL) {
3165                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3166                         ctdb_addr_to_str(&conn->dst_addr)));
3167                 return;
3168         }
3169
3170         /* if the array is empty we cant remove it
3171            and we dont need to do anything
3172          */
3173         if (vnn->tcp_array == NULL) {
3174                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3175                         ctdb_addr_to_str(&conn->dst_addr),
3176                         ntohs(conn->dst_addr.ip.sin_port)));
3177                 return;
3178         }
3179
3180
3181         /* See if we know this connection
3182            if we dont know this connection  then we dont need to do anything
3183          */
3184         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3185         if (tcpp == NULL) {
3186                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3187                         ctdb_addr_to_str(&conn->dst_addr),
3188                         ntohs(conn->dst_addr.ip.sin_port)));
3189                 return;
3190         }
3191
3192
3193         /* We need to remove this entry from the array.
3194            Instead of allocating a new array and copying data to it
3195            we cheat and just copy the last entry in the existing array
3196            to the entry that is to be removed and just shring the 
3197            ->num field
3198          */
3199         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3200         vnn->tcp_array->num--;
3201
3202         /* If we deleted the last entry we also need to remove the entire array
3203          */
3204         if (vnn->tcp_array->num == 0) {
3205                 talloc_free(vnn->tcp_array);
3206                 vnn->tcp_array = NULL;
3207         }               
3208
3209         vnn->tcp_update_needed = true;
3210
3211         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3212                 ctdb_addr_to_str(&conn->src_addr),
3213                 ntohs(conn->src_addr.ip.sin_port)));
3214 }
3215
3216
3217 /*
3218   called by a daemon to inform us of a TCP connection that one of its
3219   clients used are no longer needed in the tickle database
3220  */
3221 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3222 {
3223         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3224
3225         ctdb_remove_tcp_connection(ctdb, conn);
3226
3227         return 0;
3228 }
3229
3230
/*
  called when a daemon restarts.  The intent is to send all tickles for
  all public addresses we are serving to the new node immediately.

  This is currently a stub: nothing is sent and 0 is always returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        return 0;
}
3240
3241
3242 /*
3243   called when a client structure goes away - hook to remove
3244   elements from the tcp_list in all daemons
3245  */
3246 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3247 {
3248         while (client->tcp_list) {
3249                 struct ctdb_tcp_list *tcp = client->tcp_list;
3250                 DLIST_REMOVE(client->tcp_list, tcp);
3251                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3252         }
3253 }
3254
3255
3256 /*
3257   release all IPs on shutdown
3258  */
3259 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3260 {
3261         struct ctdb_vnn *vnn;
3262         int count = 0;
3263
3264         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3265                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3266                         ctdb_vnn_unassign_iface(ctdb, vnn);
3267                         continue;
3268                 }
3269                 if (!vnn->iface) {
3270                         continue;
3271                 }
3272
3273                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3274                                     ctdb_addr_to_str(&vnn->public_address),
3275                                     vnn->public_netmask_bits,
3276                                     ctdb_vnn_iface_string(vnn)));
3277
3278                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3279                                   ctdb_vnn_iface_string(vnn),
3280                                   ctdb_addr_to_str(&vnn->public_address),
3281                                   vnn->public_netmask_bits);
3282                 release_kill_clients(ctdb, &vnn->public_address);
3283                 ctdb_vnn_unassign_iface(ctdb, vnn);
3284                 count++;
3285         }
3286
3287         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3288 }
3289
3290
3291 /*
3292   get list of public IPs
3293  */
3294 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3295                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3296 {
3297         int i, num, len;
3298         struct ctdb_all_public_ips *ips;
3299         struct ctdb_vnn *vnn;
3300         bool only_available = false;
3301
3302         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3303                 only_available = true;
3304         }
3305
3306         /* count how many public ip structures we have */
3307         num = 0;
3308         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3309                 num++;
3310         }
3311
3312         len = offsetof(struct ctdb_all_public_ips, ips) + 
3313                 num*sizeof(struct ctdb_public_ip);
3314         ips = talloc_zero_size(outdata, len);
3315         CTDB_NO_MEMORY(ctdb, ips);
3316
3317         i = 0;
3318         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3319                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3320                         continue;
3321                 }
3322                 ips->ips[i].pnn  = vnn->pnn;
3323                 ips->ips[i].addr = vnn->public_address;
3324                 i++;
3325         }
3326         ips->num = i;
3327         len = offsetof(struct ctdb_all_public_ips, ips) +
3328                 i*sizeof(struct ctdb_public_ip);
3329
3330         outdata->dsize = len;
3331         outdata->dptr  = (uint8_t *)ips;
3332
3333         return 0;
3334 }
3335
3336
3337 /*
3338   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3339  */
3340 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3341                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3342 {
3343         int i, num, len;
3344         struct ctdb_all_public_ipsv4 *ips;
3345         struct ctdb_vnn *vnn;
3346
3347         /* count how many public ip structures we have */
3348         num = 0;
3349         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3350                 if (vnn->public_address.sa.sa_family != AF_INET) {
3351                         continue;
3352                 }
3353                 num++;
3354         }
3355
3356         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3357                 num*sizeof(struct ctdb_public_ipv4);
3358         ips = talloc_zero_size(outdata, len);
3359         CTDB_NO_MEMORY(ctdb, ips);
3360
3361         outdata->dsize = len;
3362         outdata->dptr  = (uint8_t *)ips;
3363
3364         ips->num = num;
3365         i = 0;
3366         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3367                 if (vnn->public_address.sa.sa_family != AF_INET) {
3368                         continue;
3369                 }
3370                 ips->ips[i].pnn = vnn->pnn;
3371                 ips->ips[i].sin = vnn->public_address.ip;
3372                 i++;
3373         }
3374
3375         return 0;
3376 }
3377
3378 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3379                                         struct ctdb_req_control *c,
3380                                         TDB_DATA indata,
3381                                         TDB_DATA *outdata)
3382 {
3383         int i, num, len;
3384         ctdb_sock_addr *addr;
3385         struct ctdb_control_public_ip_info *info;
3386         struct ctdb_vnn *vnn;
3387
3388         addr = (ctdb_sock_addr *)indata.dptr;
3389
3390         vnn = find_public_ip_vnn(ctdb, addr);
3391         if (vnn == NULL) {
3392                 /* if it is not a public ip   it could be our 'single ip' */
3393                 if (ctdb->single_ip_vnn) {
3394                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3395                                 vnn = ctdb->single_ip_vnn;
3396                         }
3397                 }
3398         }
3399         if (vnn == NULL) {
3400                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3401                                  "'%s'not a public address\n",
3402                                  ctdb_addr_to_str(addr)));
3403                 return -1;
3404         }
3405
3406         /* count how many public ip structures we have */
3407         num = 0;
3408         for (;vnn->ifaces[num];) {
3409                 num++;
3410         }
3411
3412         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3413                 num*sizeof(struct ctdb_control_iface_info);
3414         info = talloc_zero_size(outdata, len);
3415         CTDB_NO_MEMORY(ctdb, info);
3416
3417         info->ip.addr = vnn->public_address;
3418         info->ip.pnn = vnn->pnn;
3419         info->active_idx = 0xFFFFFFFF;
3420
3421         for (i=0; vnn->ifaces[i]; i++) {
3422                 struct ctdb_iface *cur;
3423
3424                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3425                 if (cur == NULL) {
3426                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3427                                            vnn->ifaces[i]));
3428                         return -1;
3429                 }
3430                 if (vnn->iface == cur) {
3431                         info->active_idx = i;
3432                 }
3433                 strcpy(info->ifaces[i].name, cur->name);
3434                 info->ifaces[i].link_state = cur->link_up;
3435                 info->ifaces[i].references = cur->references;
3436         }
3437         info->num = i;
3438         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3439                 i*sizeof(struct ctdb_control_iface_info);
3440
3441         outdata->dsize = len;
3442         outdata->dptr  = (uint8_t *)info;
3443
3444         return 0;
3445 }
3446
3447 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3448                                 struct ctdb_req_control *c,
3449                                 TDB_DATA *outdata)
3450 {
3451         int i, num, len;
3452         struct ctdb_control_get_ifaces *ifaces;
3453         struct ctdb_iface *cur;
3454
3455         /* count how many public ip structures we have */
3456         num = 0;
3457         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3458                 num++;
3459         }
3460
3461         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3462                 num*sizeof(struct ctdb_control_iface_info);
3463         ifaces = talloc_zero_size(outdata, len);
3464         CTDB_NO_MEMORY(ctdb, ifaces);
3465
3466         i = 0;
3467         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3468                 strcpy(ifaces->ifaces[i].name, cur->name);
3469                 ifaces->ifaces[i].link_state = cur->link_up;
3470                 ifaces->ifaces[i].references = cur->references;
3471                 i++;
3472         }
3473         ifaces->num = i;
3474         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3475                 i*sizeof(struct ctdb_control_iface_info);
3476
3477         outdata->dsize = len;
3478         outdata->dptr  = (uint8_t *)ifaces;
3479
3480         return 0;
3481 }
3482
3483 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3484                                     struct ctdb_req_control *c,
3485                                     TDB_DATA indata)
3486 {
3487         struct ctdb_control_iface_info *info;
3488         struct ctdb_iface *iface;
3489         bool link_up = false;
3490
3491         info = (struct ctdb_control_iface_info *)indata.dptr;
3492
3493         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3494                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3495                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3496                                   len, len, info->name));
3497                 return -1;
3498         }
3499
3500         switch (info->link_state) {
3501         case 0:
3502                 link_up = false;
3503                 break;
3504         case 1:
3505                 link_up = true;
3506                 break;
3507         default:
3508                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3509                                   (unsigned int)info->link_state));
3510                 return -1;
3511         }
3512
3513         if (info->references != 0) {
3514                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3515                                   (unsigned int)info->references));
3516                 return -1;
3517         }
3518
3519         iface = ctdb_find_iface(ctdb, info->name);
3520         if (iface == NULL) {
3521                 return -1;
3522         }
3523
3524         if (link_up == iface->link_up) {
3525                 return 0;
3526         }
3527
3528         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3529               ("iface[%s] has changed it's link status %s => %s\n",
3530                iface->name,
3531                iface->link_up?"up":"down",
3532                link_up?"up":"down"));
3533
3534         iface->link_up = link_up;
3535         return 0;
3536 }
3537
3538
3539 /* 
3540    structure containing the listening socket and the list of tcp connections
3541    that the ctdb daemon is to kill
3542 */
3543 struct ctdb_kill_tcp {
3544         struct ctdb_vnn *vnn;
3545         struct ctdb_context *ctdb;
3546         int capture_fd;
3547         struct fd_event *fde;
3548         trbt_tree_t *connections;
3549         void *private_data;
3550 };
3551
3552 /*
3553   a tcp connection that is to be killed
3554  */
3555 struct ctdb_killtcp_con {
3556         ctdb_sock_addr src_addr;
3557         ctdb_sock_addr dst_addr;
3558         int count;
3559         struct ctdb_kill_tcp *killtcp;
3560 };
3561
3562 /* this function is used to create a key to represent this socketpair
3563    in the killtcp tree.
3564    this key is used to insert and lookup matching socketpairs that are
3565    to be tickled and RST
3566 */
3567 #define KILLTCP_KEYLEN  10
3568 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3569 {
3570         static uint32_t key[KILLTCP_KEYLEN];
3571
3572         bzero(key, sizeof(key));
3573
3574         if (src->sa.sa_family != dst->sa.sa_family) {
3575                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3576                 return key;
3577         }
3578         
3579         switch (src->sa.sa_family) {
3580         case AF_INET:
3581                 key[0]  = dst->ip.sin_addr.s_addr;
3582                 key[1]  = src->ip.sin_addr.s_addr;
3583                 key[2]  = dst->ip.sin_port;
3584                 key[3]  = src->ip.sin_port;
3585                 break;
3586         case AF_INET6: {
3587                 uint32_t *dst6_addr32 =
3588                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3589                 uint32_t *src6_addr32 =
3590                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3591                 key[0]  = dst6_addr32[3];
3592                 key[1]  = src6_addr32[3];
3593                 key[2]  = dst6_addr32[2];
3594                 key[3]  = src6_addr32[2];
3595                 key[4]  = dst6_addr32[1];
3596                 key[5]  = src6_addr32[1];
3597                 key[6]  = dst6_addr32[0];
3598                 key[7]  = src6_addr32[0];
3599                 key[8]  = dst->ip6.sin6_port;
3600                 key[9]  = src->ip6.sin6_port;
3601                 break;
3602         }
3603         default:
3604                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3605                 return key;
3606         }
3607
3608         return key;
3609 }
3610
3611 /*
3612   called when we get a read event on the raw socket
3613  */
3614 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3615                                 uint16_t flags, void *private_data)
3616 {
3617         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3618         struct ctdb_killtcp_con *con;
3619         ctdb_sock_addr src, dst;
3620         uint32_t ack_seq, seq;
3621
3622         if (!(flags & EVENT_FD_READ)) {
3623                 return;
3624         }
3625
3626         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3627                                 killtcp->private_data,
3628                                 &src, &dst,
3629                                 &ack_seq, &seq) != 0) {
3630                 /* probably a non-tcp ACK packet */
3631                 return;
3632         }
3633
3634         /* check if we have this guy in our list of connections
3635            to kill
3636         */
3637         con = trbt_lookuparray32(killtcp->connections, 
3638                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3639         if (con == NULL) {
3640                 /* no this was some other packet we can just ignore */
3641                 return;
3642         }
3643
3644         /* This one has been tickled !
3645            now reset him and remove him from the list.
3646          */
3647         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3648                 ntohs(con->dst_addr.ip.sin_port),
3649                 ctdb_addr_to_str(&con->src_addr),
3650                 ntohs(con->src_addr.ip.sin_port)));
3651
3652         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3653         talloc_free(con);
3654 }
3655
3656
3657 /* when traversing the list of all tcp connections to send tickle acks to
3658    (so that we can capture the ack coming back and kill the connection
3659     by a RST)
3660    this callback is called for each connection we are currently trying to kill
3661 */
3662 static int tickle_connection_traverse(void *param, void *data)
3663 {
3664         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3665
3666         /* have tried too many times, just give up */
3667         if (con->count >= 5) {
3668                 /* can't delete in traverse: reparent to delete_cons */
3669                 talloc_steal(param, con);
3670                 return 0;
3671         }
3672
3673         /* othervise, try tickling it again */
3674         con->count++;
3675         ctdb_sys_send_tcp(
3676                 (ctdb_sock_addr *)&con->dst_addr,
3677                 (ctdb_sock_addr *)&con->src_addr,
3678                 0, 0, 0);
3679         return 0;
3680 }
3681
3682
3683 /* 
3684    called every second until all sentenced connections have been reset
3685  */
3686 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3687                                               struct timeval t, void *private_data)
3688 {
3689         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3690         void *delete_cons = talloc_new(NULL);
3691
3692         /* loop over all connections sending tickle ACKs */
3693         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3694
3695         /* now we've finished traverse, it's safe to do deletion. */
3696         talloc_free(delete_cons);
3697
3698         /* If there are no more connections to kill we can remove the
3699            entire killtcp structure
3700          */
3701         if ( (killtcp->connections == NULL) || 
3702              (killtcp->connections->root == NULL) ) {
3703                 talloc_free(killtcp);
3704                 return;
3705         }
3706
3707         /* try tickling them again in a seconds time
3708          */
3709         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3710                         ctdb_tickle_sentenced_connections, killtcp);
3711 }
3712
3713 /*
3714   destroy the killtcp structure
3715  */
3716 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3717 {
3718         struct ctdb_vnn *tmpvnn;
3719
3720         /* verify that this vnn is still active */
3721         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3722                 if (tmpvnn == killtcp->vnn) {
3723                         break;
3724                 }
3725         }
3726
3727         if (tmpvnn == NULL) {
3728                 return 0;
3729         }
3730
3731         if (killtcp->vnn->killtcp != killtcp) {
3732                 return 0;
3733         }
3734
3735         killtcp->vnn->killtcp = NULL;
3736
3737         return 0;
3738 }
3739
3740
/* insert callback for the killtcp tree: whatever was stored under the
   key before ('data') is unconditionally replaced by the new connection
   structure ('parm').

   the old structure is deliberately not freed here; according to the
   original author it is talloc_stolen by the same tree node and goes
   away together with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
3752
3753 /*
3754   add a tcp socket to the list of connections we want to RST
3755  */
3756 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3757                                        ctdb_sock_addr *s,
3758                                        ctdb_sock_addr *d)
3759 {
3760         ctdb_sock_addr src, dst;
3761         struct ctdb_kill_tcp *killtcp;
3762         struct ctdb_killtcp_con *con;
3763         struct ctdb_vnn *vnn;
3764
3765         ctdb_canonicalize_ip(s, &src);
3766         ctdb_canonicalize_ip(d, &dst);
3767
3768         vnn = find_public_ip_vnn(ctdb, &dst);
3769         if (vnn == NULL) {
3770                 vnn = find_public_ip_vnn(ctdb, &src);
3771         }
3772         if (vnn == NULL) {
3773                 /* if it is not a public ip   it could be our 'single ip' */
3774                 if (ctdb->single_ip_vnn) {
3775                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3776                                 vnn = ctdb->single_ip_vnn;
3777                         }
3778                 }
3779         }
3780         if (vnn == NULL) {
3781                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3782                 return -1;
3783         }
3784
3785         killtcp = vnn->killtcp;
3786         
3787         /* If this is the first connection to kill we must allocate
3788            a new structure
3789          */
3790         if (killtcp == NULL) {
3791                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3792                 CTDB_NO_MEMORY(ctdb, killtcp);
3793
3794                 killtcp->vnn         = vnn;
3795                 killtcp->ctdb        = ctdb;
3796                 killtcp->capture_fd  = -1;
3797                 killtcp->connections = trbt_create(killtcp, 0);
3798
3799                 vnn->killtcp         = killtcp;
3800                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3801         }
3802
3803
3804
3805         /* create a structure that describes this connection we want to
3806            RST and store it in killtcp->connections
3807         */
3808         con = talloc(killtcp, struct ctdb_killtcp_con);
3809         CTDB_NO_MEMORY(ctdb, con);
3810         con->src_addr = src;
3811         con->dst_addr = dst;
3812         con->count    = 0;
3813         con->killtcp  = killtcp;
3814
3815
3816         trbt_insertarray32_callback(killtcp->connections,
3817                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3818                         add_killtcp_callback, con);
3819
3820         /* 
3821            If we dont have a socket to listen on yet we must create it
3822          */
3823         if (killtcp->capture_fd == -1) {
3824                 const char *iface = ctdb_vnn_iface_string(vnn);
3825                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3826                 if (killtcp->capture_fd == -1) {
3827                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3828                                           "socket on iface '%s' for killtcp (%s)\n",
3829                                           iface, strerror(errno)));
3830                         goto failed;
3831                 }
3832         }
3833
3834
3835         if (killtcp->fde == NULL) {
3836                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3837                                             EVENT_FD_READ,
3838                                             capture_tcp_handler, killtcp);
3839                 tevent_fd_set_auto_close(killtcp->fde);
3840
3841                 /* We also need to set up some events to tickle all these connections
3842                    until they are all reset
3843                 */
3844                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3845                                 ctdb_tickle_sentenced_connections, killtcp);
3846         }
3847
3848         /* tickle him once now */
3849         ctdb_sys_send_tcp(
3850                 &con->dst_addr,
3851                 &con->src_addr,
3852                 0, 0, 0);
3853
3854         return 0;
3855
3856 failed:
3857         talloc_free(vnn->killtcp);
3858         vnn->killtcp = NULL;
3859         return -1;
3860 }
3861
3862 /*
3863   kill a TCP connection.
3864  */
3865 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3866 {
3867         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3868
3869         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3870 }
3871
3872 /*
3873   called by a daemon to inform us of the entire list of TCP tickles for
3874   a particular public address.
3875   this control should only be sent by the node that is currently serving
3876   that public address.
3877  */
3878 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3879 {
3880         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3881         struct ctdb_tcp_array *tcparray;
3882         struct ctdb_vnn *vnn;
3883
3884         /* We must at least have tickles.num or else we cant verify the size
3885            of the received data blob
3886          */
3887         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3888                                         tickles.connections)) {
3889                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3890                 return -1;
3891         }
3892
3893         /* verify that the size of data matches what we expect */
3894         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3895                                 tickles.connections)
3896                          + sizeof(struct ctdb_tcp_connection)
3897                                  * list->tickles.num) {
3898                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3899                 return -1;
3900         }       
3901
3902         vnn = find_public_ip_vnn(ctdb, &list->addr);
3903         if (vnn == NULL) {
3904                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3905                         ctdb_addr_to_str(&list->addr)));
3906
3907                 return 1;
3908         }
3909
3910         /* remove any old ticklelist we might have */
3911         talloc_free(vnn->tcp_array);
3912         vnn->tcp_array = NULL;
3913
3914         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3915         CTDB_NO_MEMORY(ctdb, tcparray);
3916
3917         tcparray->num = list->tickles.num;
3918
3919         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3920         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3921
3922         memcpy(tcparray->connections, &list->tickles.connections[0], 
3923                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3924
3925         /* We now have a new fresh tickle list array for this vnn */
3926         vnn->tcp_array = talloc_steal(vnn, tcparray);
3927         
3928         return 0;
3929 }
3930
3931 /*
3932   called to return the full list of tickles for the puclic address associated 
3933   with the provided vnn
3934  */
3935 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3936 {
3937         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3938         struct ctdb_control_tcp_tickle_list *list;
3939         struct ctdb_tcp_array *tcparray;
3940         int num;
3941         struct ctdb_vnn *vnn;
3942
3943         vnn = find_public_ip_vnn(ctdb, addr);
3944         if (vnn == NULL) {
3945                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3946                         ctdb_addr_to_str(addr)));
3947
3948                 return 1;
3949         }
3950
3951         tcparray = vnn->tcp_array;
3952         if (tcparray) {
3953                 num = tcparray->num;
3954         } else {
3955                 num = 0;
3956         }
3957
3958         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3959                                 tickles.connections)
3960                         + sizeof(struct ctdb_tcp_connection) * num;
3961
3962         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3963         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3964         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3965
3966         list->addr = *addr;
3967         list->tickles.num = num;
3968         if (num) {
3969                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3970                         sizeof(struct ctdb_tcp_connection) * num);
3971         }
3972
3973         return 0;
3974 }
3975
3976
3977 /*
3978   set the list of all tcp tickles for a public address
3979  */
3980 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3981                               struct timeval timeout, uint32_t destnode, 
3982                               ctdb_sock_addr *addr,
3983                               struct ctdb_tcp_array *tcparray)
3984 {
3985         int ret, num;
3986         TDB_DATA data;
3987         struct ctdb_control_tcp_tickle_list *list;
3988
3989         if (tcparray) {
3990                 num = tcparray->num;
3991         } else {
3992                 num = 0;
3993         }
3994
3995         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3996                                 tickles.connections) +
3997                         sizeof(struct ctdb_tcp_connection) * num;
3998         data.dptr = talloc_size(ctdb, data.dsize);
3999         CTDB_NO_MEMORY(ctdb, data.dptr);
4000
4001         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
4002         list->addr = *addr;
4003         list->tickles.num = num;
4004         if (tcparray) {
4005                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
4006         }
4007
4008         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
4009                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4010                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4011         if (ret != 0) {
4012                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4013                 return -1;
4014         }
4015
4016         talloc_free(data.dptr);
4017
4018         return ret;
4019 }
4020
4021
4022 /*
4023   perform tickle updates if required
4024  */
4025 static void ctdb_update_tcp_tickles(struct event_context *ev, 
4026                                 struct timed_event *te, 
4027                                 struct timeval t, void *private_data)
4028 {
4029         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4030         int ret;
4031         struct ctdb_vnn *vnn;
4032
4033         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4034                 /* we only send out updates for public addresses that 
4035                    we have taken over
4036                  */
4037                 if (ctdb->pnn != vnn->pnn) {
4038                         continue;
4039                 }
4040                 /* We only send out the updates if we need to */
4041                 if (!vnn->tcp_update_needed) {
4042                         continue;
4043                 }
4044                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
4045                                 TAKEOVER_TIMEOUT(),
4046                                 CTDB_BROADCAST_CONNECTED,
4047                                 &vnn->public_address,
4048                                 vnn->tcp_array);
4049                 if (ret != 0) {
4050                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4051                                 ctdb_addr_to_str(&vnn->public_address)));
4052                 }
4053         }
4054
4055         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4056                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4057                              ctdb_update_tcp_tickles, ctdb);
4058 }               
4059         
4060
4061 /*
4062   start periodic update of tcp tickles
4063  */
4064 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4065 {
4066         ctdb->tickle_update_context = talloc_new(ctdb);
4067
4068         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4069                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4070                              ctdb_update_tcp_tickles, ctdb);
4071 }
4072
4073
4074
4075
/* State for one series of gratuitous ARPs; created by ctdb_control_send_gratious_arp and consumed by send_gratious_arp. */
4076 struct control_gratious_arp {
4077         struct ctdb_context *ctdb; /* daemon context; its event context schedules the next transmission */
4078         ctdb_sock_addr addr; /* address handed to ctdb_sys_send_arp */
4079         const char *iface; /* interface name handed to ctdb_sys_send_arp; a talloc child of this struct */
4080         int count; /* transmissions attempted so far; the struct is freed when this reaches CTDB_ARP_REPEAT */
4081 };
4082
4083 /*
4084   send a control_gratuitous arp
4085  */
4086 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4087                                   struct timeval t, void *private_data)
4088 {
4089         int ret;
4090         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4091                                                         struct control_gratious_arp);
4092
4093         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4094         if (ret != 0) {
4095                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4096                                  arp->iface, strerror(errno)));
4097         }
4098
4099
4100         arp->count++;
4101         if (arp->count == CTDB_ARP_REPEAT) {
4102                 talloc_free(arp);
4103                 return;
4104         }
4105
4106         event_add_timed(arp->ctdb->ev, arp, 
4107                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4108                         send_gratious_arp, arp);
4109 }
4110
4111
4112 /*
4113   send a gratious arp 
4114  */
4115 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4116 {
4117         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4118         struct control_gratious_arp *arp;
4119
4120         /* verify the size of indata */
4121         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4122                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4123                                  (unsigned)indata.dsize, 
4124                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4125                 return -1;
4126         }
4127         if (indata.dsize != 
4128                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4129                 + gratious_arp->len ) ){
4130
4131                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4132                         "but should be %u bytes\n", 
4133                          (unsigned)indata.dsize, 
4134                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4135                 return -1;
4136         }
4137
4138
4139         arp = talloc(ctdb, struct control_gratious_arp);
4140         CTDB_NO_MEMORY(ctdb, arp);
4141
4142         arp->ctdb  = ctdb;
4143         arp->addr   = gratious_arp->addr;
4144         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4145         CTDB_NO_MEMORY(ctdb, arp->iface);
4146         arp->count = 0;
4147         
4148         event_add_timed(arp->ctdb->ev, arp, 
4149                         timeval_zero(), send_gratious_arp, arp);
4150
4151         return 0;
4152 }
4153
4154 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4155 {
4156         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4157         int ret;
4158
4159         /* verify the size of indata */
4160         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4161                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4162                 return -1;
4163         }
4164         if (indata.dsize != 
4165                 ( offsetof(struct ctdb_control_ip_iface, iface)
4166                 + pub->len ) ){
4167
4168                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4169                         "but should be %u bytes\n", 
4170                          (unsigned)indata.dsize, 
4171                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4172                 return -1;
4173         }
4174
4175         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4176
4177         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4178
4179         if (ret != 0) {
4180                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4181                 return -1;
4182         }
4183
4184         return 0;
4185 }
4186
4187 /*
4188   called when releaseip event finishes for del_public_address
4189  */
4190 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
4191                                 void *private_data)
4192 {
4193         talloc_free(private_data);
4194 }
4195
4196 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4197 {
4198         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4199         struct ctdb_vnn *vnn;
4200         int ret;
4201
4202         /* verify the size of indata */
4203         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4204                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4205                 return -1;
4206         }
4207         if (indata.dsize != 
4208                 ( offsetof(struct ctdb_control_ip_iface, iface)
4209                 + pub->len ) ){
4210
4211                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4212                         "but should be %u bytes\n", 
4213                          (unsigned)indata.dsize, 
4214                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4215                 return -1;
4216         }
4217
4218         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4219
4220         /* walk over all public addresses until we find a match */
4221         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4222                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4223                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4224
4225                         DLIST_REMOVE(ctdb->vnn, vnn);
4226                         talloc_steal(mem_ctx, vnn);
4227                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
4228                         if (vnn->pnn != ctdb->pnn) {
4229                                 if (vnn->iface != NULL) {
4230                                         ctdb_vnn_unassign_iface(ctdb, vnn);
4231                                 }
4232                                 talloc_free(mem_ctx);
4233                                 return 0;
4234                         }
4235                         vnn->pnn = -1;
4236
4237                         ret = ctdb_event_script_callback(ctdb, 
4238                                          mem_ctx, delete_ip_callback, mem_ctx,
4239                                          false,
4240                                          CTDB_EVENT_RELEASE_IP,
4241                                          "%s %s %u",
4242                                          ctdb_vnn_iface_string(vnn),
4243                                          ctdb_addr_to_str(&vnn->public_address),
4244                                          vnn->public_netmask_bits);
4245                         if (vnn->iface != NULL) {
4246                                 ctdb_vnn_unassign_iface(ctdb, vnn);
4247                         }
4248                         if (ret != 0) {
4249                                 return -1;
4250                         }
4251                         return 0;
4252                 }
4253         }
4254
4255         return -1;
4256 }
4257
4258
/* State kept while the "ipreallocated" event runs; passed to ctdb_ipreallocated_callback. */
4259 struct ipreallocated_callback_state {
4260         struct ctdb_req_control *c; /* control request answered once the event has finished */
4261 };
4262
4263 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4264                                         int status, void *p)
4265 {
4266         struct ipreallocated_callback_state *state =
4267                 talloc_get_type(p, struct ipreallocated_callback_state);
4268
4269         if (status != 0) {
4270                 DEBUG(DEBUG_ERR,
4271                       (" \"ipreallocated\" event script failed (status %d)\n",
4272                        status));
4273                 if (status == -ETIME) {
4274                         ctdb_ban_self(ctdb);
4275                 }
4276         }
4277
4278         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4279         talloc_free(state);
4280 }
4281
4282 /* A control to run the ipreallocated event */
4283 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4284                                    struct ctdb_req_control *c,
4285                                    bool *async_reply)
4286 {
4287         int ret;
4288         struct ipreallocated_callback_state *state;
4289
4290         state = talloc(ctdb, struct ipreallocated_callback_state);
4291         CTDB_NO_MEMORY(ctdb, state);
4292
4293         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4294
4295         ret = ctdb_event_script_callback(ctdb, state,
4296                                          ctdb_ipreallocated_callback, state,
4297                                          false, CTDB_EVENT_IPREALLOCATED,
4298                                          "%s", "");
4299
4300         if (ret != 0) {
4301                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4302                 talloc_free(state);
4303                 return -1;
4304         }
4305
4306         /* tell the control that we will be reply asynchronously */
4307         state->c    = talloc_steal(state, c);
4308         *async_reply = true;
4309
4310         return 0;
4311 }
4312
4313
4314 /* This function is called from the recovery daemon to verify that a remote
4315    node has the expected ip allocation.
4316    This is verified against ctdb->ip_tree
4317 */
4318 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4319                                 struct ctdb_all_public_ips *ips,
4320                                 uint32_t pnn)
4321 {
4322         struct ctdb_public_ip_list *tmp_ip; 
4323         int i;
4324
4325         if (ctdb->ip_tree == NULL) {
4326                 /* dont know the expected allocation yet, assume remote node
4327                    is correct. */
4328                 return 0;
4329         }
4330
4331         if (ips == NULL) {
4332                 return 0;
4333         }
4334
4335         for (i=0; i<ips->num; i++) {
4336                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4337                 if (tmp_ip == NULL) {
4338                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4339                         return -1;
4340                 }
4341
4342                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4343                         continue;
4344                 }
4345
4346                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4347                         DEBUG(DEBUG_ERR,
4348                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4349                                pnn,
4350                                ctdb_addr_to_str(&ips->ips[i].addr),
4351                                ips->ips[i].pnn, tmp_ip->pnn));
4352                         return -1;
4353                 }
4354         }
4355
4356         return 0;
4357 }
4358
4359 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4360 {
4361         struct ctdb_public_ip_list *tmp_ip; 
4362
4363         if (ctdb->ip_tree == NULL) {
4364                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4365                 return -1;
4366         }
4367
4368         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4369         if (tmp_ip == NULL) {
4370                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4371                 return -1;
4372         }
4373
4374         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4375         tmp_ip->pnn = ip->pnn;
4376
4377         return 0;
4378 }
4379
4380
/* Bookkeeping for one in-flight "reload public ips" request and the child process serving it. */
4381 struct ctdb_reloadips_handle {
4382         struct ctdb_context *ctdb; /* owning daemon context */
4383         struct ctdb_req_control *c; /* control request answered from the destructor; NULL until taken over */
4384         int status; /* status sent in the reply; starts at -1, set from the child's result byte */
4385         int fd[2]; /* pipe: the child writes its result to fd[1], the parent reads it from fd[0] */
4386         pid_t child; /* pid of the forked child; sent SIGKILL by the destructor */
4387         struct fd_event *fde; /* fd event watching fd[0] for the child's result */
4388 };
4389
4390 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4391 {
4392         if (h == h->ctdb->reload_ips) {
4393                 h->ctdb->reload_ips = NULL;
4394         }
4395         if (h->c != NULL) {
4396                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4397                 h->c = NULL;
4398         }
4399         ctdb_kill(h->ctdb, h->child, SIGKILL);
4400         return 0;
4401 }
4402
4403 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4404                                 struct timed_event *te,
4405                                 struct timeval t, void *private_data)
4406 {
4407         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4408
4409         talloc_free(h);
4410 }       
4411
4412 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4413                              uint16_t flags, void *private_data)
4414 {
4415         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4416
4417         char res;
4418         int ret;
4419
4420         ret = read(h->fd[0], &res, 1);
4421         if (ret < 1 || res != 0) {
4422                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4423                 res = 1;
4424         }
4425         h->status = res;
4426
4427         talloc_free(h);
4428 }
4429
4430 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4431 {
4432         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4433         struct ctdb_all_public_ips *ips;
4434         struct ctdb_vnn *vnn;
4435         int i, ret;
4436
4437         CTDB_NO_MEMORY(ctdb, mem_ctx);
4438
4439         /* read the ip allocation from the local node */
4440         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4441         if (ret != 0) {
4442                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4443                 talloc_free(mem_ctx);
4444                 return -1;
4445         }
4446
4447         /* re-read the public ips file */
4448         ctdb->vnn = NULL;
4449         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4450                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4451                 talloc_free(mem_ctx);
4452                 return -1;
4453         }
4454
4455
4456         /* check the previous list of ips and scan for ips that have been
4457            dropped.
4458          */
4459         for (i = 0; i < ips->num; i++) {
4460                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4461                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4462                                 break;
4463                         }
4464                 }
4465
4466                 /* we need to delete this ip, no longer available on this node */
4467                 if (vnn == NULL) {
4468                         struct ctdb_control_ip_iface pub;
4469
4470                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4471                         pub.addr  = ips->ips[i].addr;
4472                         pub.mask  = 0;
4473                         pub.len   = 0;
4474
4475                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4476                         if (ret != 0) {
4477                                 talloc_free(mem_ctx);
4478                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4479                                 return -1;
4480                         }
4481                 }
4482         }
4483
4484
4485         /* loop over all new ones and check the ones we need to add */
4486         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4487                 for (i = 0; i < ips->num; i++) {
4488                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4489                                 break;
4490                         }
4491                 }
4492                 if (i == ips->num) {
4493                         struct ctdb_control_ip_iface *pub;
4494                         const char *ifaces = NULL;
4495                         int iface = 0;
4496
4497                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4498
4499                         pub = talloc_zero(mem_ctx, struct ctdb_control_ip_iface);
4500                         pub->addr  = vnn->public_address;
4501                         pub->mask  = vnn->public_netmask_bits;
4502
4503                         ifaces = vnn->ifaces[0];
4504                         iface = 1;
4505                         while (vnn->ifaces[iface] != NULL) {
4506                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4507                                 iface++;
4508                         }
4509                         pub->len   = strlen(ifaces)+1;
4510                         pub = talloc_realloc_size(mem_ctx, pub,
4511                                 offsetof(struct ctdb_control_ip_iface, iface) + pub->len);
4512                         if (pub == NULL) {
4513                                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory\n"));
4514                                 talloc_free(mem_ctx);
4515                                 return -1;
4516                         }
4517                         memcpy(&pub->iface[0], ifaces, pub->len);
4518
4519                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(),
4520                                                       CTDB_CURRENT_NODE, pub);
4521                         if (ret != 0) {
4522                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4523                                 talloc_free(mem_ctx);
4524                                 return -1;
4525                         }
4526                 }
4527         }
4528
4529         talloc_free(mem_ctx);
4530         return 0;
4531 }
4532
4533 /* This control is sent to force the node to re-read the public addresses file
4534    and drop any addresses we should nnot longer host, and add new addresses
4535    that we are now able to host
4536 */
4537 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4538 {
4539         struct ctdb_reloadips_handle *h;
4540         pid_t parent = getpid();
4541
4542         if (ctdb->reload_ips != NULL) {
4543                 talloc_free(ctdb->reload_ips);
4544                 ctdb->reload_ips = NULL;
4545         }
4546
4547         h = talloc(ctdb, struct ctdb_reloadips_handle);
4548         CTDB_NO_MEMORY(ctdb, h);
4549         h->ctdb     = ctdb;
4550         h->c        = NULL;
4551         h->status   = -1;
4552         
4553         if (pipe(h->fd) == -1) {
4554                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4555                 talloc_free(h);
4556                 return -1;
4557         }
4558
4559         h->child = ctdb_fork(ctdb);
4560         if (h->child == (pid_t)-1) {
4561                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4562                 close(h->fd[0]);
4563                 close(h->fd[1]);
4564                 talloc_free(h);
4565                 return -1;
4566         }
4567
4568         /* child process */
4569         if (h->child == 0) {
4570                 signed char res = 0;
4571
4572                 close(h->fd[0]);
4573                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4574
4575                 ctdb_set_process_name("ctdb_reloadips");
4576                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4577                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4578                         res = -1;
4579                 } else {
4580                         res = ctdb_reloadips_child(ctdb);
4581                         if (res != 0) {
4582                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4583                         }
4584                 }
4585
4586                 write(h->fd[1], &res, 1);
4587                 /* make sure we die when our parent dies */
4588                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4589                         sleep(5);
4590                 }
4591                 _exit(0);
4592         }
4593
4594         h->c             = talloc_steal(h, c);
4595
4596         close(h->fd[1]);
4597         set_close_on_exec(h->fd[0]);
4598
4599         talloc_set_destructor(h, ctdb_reloadips_destructor);
4600
4601
4602         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4603                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4604                         (void *)h);
4605         tevent_fd_set_auto_close(h->fde);
4606
4607         event_add_timed(ctdb->ev, h,
4608                         timeval_current_ofs(120, 0),
4609                         ctdb_reloadips_timeout_event, h);
4610
4611         /* we reply later */
4612         *async_reply = true;
4613         return 0;
4614 }