ctdb/daemon: reloadips must register state of asynchronous controls
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value for takeover operations, derived from the
 * takeover_timeout tunable.  Expands to an expression that needs a
 * variable named `ctdb` in scope at the point of use.
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP schedule used by ctdb_control_send_arp():
 * CTDB_ARP_INTERVAL is the first argument passed to timeval_current_ofs()
 * when re-arming the timer, CTDB_ARP_REPEAT is the number of rounds sent
 * before the ARP state is freed.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
 * Flags used in IP allocation algorithms.
 *
 * NOTE(review): judging by the names, noiptakeover marks a node that may
 * not take over additional addresses and noiphost one that may not host
 * any - confirm against the allocation code that reads them.
 */
struct ctdb_ipflags {
	bool noiptakeover;
	bool noiphost;
};
41
/*
 * A local network interface that public addresses can be placed on.
 * Entries are chained on the ctdb->ifaces doubly linked list.
 */
struct ctdb_iface {
	/* list linkage (managed by the DLIST_* macros) */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, compared with strcmp() on lookup */
	const char *name;
	/* only interfaces with link_up set are candidates for an address */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;
285         uint32_t count;
286         ctdb_sock_addr addr;
287         struct ctdb_tcp_array *tcparray;
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;
307         uint32_t client_id;
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
400 struct takeover_callback_state {
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/*
 * Private data handed from ctdb_do_takeip() to ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
	/* the control request to answer once the event script is done */
	struct ctdb_req_control *c;
	/* the public address being taken */
	struct ctdb_vnn *vnn;
};
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          CTDB_EVENT_TAKE_IP,
514                                          "%s %s %u",
515                                          ctdb_vnn_iface_string(vnn),
516                                          ctdb_addr_to_str(&vnn->public_address),
517                                          vnn->public_netmask_bits);
518
519         if (ret != 0) {
520                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
521                         ctdb_addr_to_str(&vnn->public_address),
522                         ctdb_vnn_iface_string(vnn)));
523                 talloc_free(state);
524                 return -1;
525         }
526
527         return 0;
528 }
529
/*
 * Private data handed from ctdb_do_updateip() to
 * ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
	/* the control request to answer once the event script is done */
	struct ctdb_req_control *c;
	/* interface the address lived on before the move */
	struct ctdb_iface *old;
	/* the public address being moved */
	struct ctdb_vnn *vnn;
};
535
536 /*
537   called when updateip event finishes
538  */
539 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
540                                       void *private_data)
541 {
542         struct ctdb_do_updateip_state *state =
543                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
544         int32_t ret;
545
546         if (status != 0) {
547                 if (status == -ETIME) {
548                         ctdb_ban_self(ctdb);
549                 }
550                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
551                         ctdb_addr_to_str(&state->vnn->public_address),
552                         state->old->name,
553                         ctdb_vnn_iface_string(state->vnn)));
554
555                 /*
556                  * All we can do is reset the old interface
557                  * and let the next run fix it
558                  */
559                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
560                 state->vnn->iface = state->old;
561                 state->vnn->iface->references++;
562
563                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
564                 talloc_free(state);
565                 return;
566         }
567
568         if (ctdb->do_checkpublicip) {
569
570         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
571         if (ret != 0) {
572                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
573                 talloc_free(state);
574                 return;
575         }
576
577         }
578
579         /* the control succeeded */
580         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
581         talloc_free(state);
582         return;
583 }
584
585 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
586 {
587         state->vnn->update_in_flight = false;
588         return 0;
589 }
590
591 /*
592   update (move) an ip address
593  */
594 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
595                                 struct ctdb_req_control *c,
596                                 struct ctdb_vnn *vnn)
597 {
598         int ret;
599         struct ctdb_do_updateip_state *state;
600         struct ctdb_iface *old = vnn->iface;
601         const char *new_name;
602
603         if (vnn->update_in_flight) {
604                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
605                                     "update for this IP already in flight\n",
606                                     ctdb_addr_to_str(&vnn->public_address),
607                                     vnn->public_netmask_bits));
608                 return -1;
609         }
610
611         ctdb_vnn_unassign_iface(ctdb, vnn);
612         ret = ctdb_vnn_assign_iface(ctdb, vnn);
613         if (ret != 0) {
614                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
615                                  "assin a usable interface (old iface '%s')\n",
616                                  ctdb_addr_to_str(&vnn->public_address),
617                                  vnn->public_netmask_bits,
618                                  old->name));
619                 return -1;
620         }
621
622         new_name = ctdb_vnn_iface_string(vnn);
623         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
624                 /* A benign update from one interface onto itself.
625                  * no need to run the eventscripts in this case, just return
626                  * success.
627                  */
628                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
629                 return 0;
630         }
631
632         state = talloc(vnn, struct ctdb_do_updateip_state);
633         CTDB_NO_MEMORY(ctdb, state);
634
635         state->c = talloc_steal(ctdb, c);
636         state->old = old;
637         state->vnn = vnn;
638
639         vnn->update_in_flight = true;
640         talloc_set_destructor(state, ctdb_updateip_destructor);
641
642         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
643                             "interface %s to %s\n",
644                             ctdb_addr_to_str(&vnn->public_address),
645                             vnn->public_netmask_bits,
646                             old->name,
647                             new_name));
648
649         ret = ctdb_event_script_callback(ctdb,
650                                          state,
651                                          ctdb_do_updateip_callback,
652                                          state,
653                                          CTDB_EVENT_UPDATE_IP,
654                                          "%s %s %s %u",
655                                          state->old->name,
656                                          new_name,
657                                          ctdb_addr_to_str(&vnn->public_address),
658                                          vnn->public_netmask_bits);
659         if (ret != 0) {
660                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
661                                  ctdb_addr_to_str(&vnn->public_address),
662                                  old->name, new_name));
663                 talloc_free(state);
664                 return -1;
665         }
666
667         return 0;
668 }
669
670 /*
671   Find the vnn of the node that has a public ip address
672   returns -1 if the address is not known as a public address
673  */
674 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
675 {
676         struct ctdb_vnn *vnn;
677
678         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
679                 if (ctdb_same_ip(&vnn->public_address, addr)) {
680                         return vnn;
681                 }
682         }
683
684         return NULL;
685 }
686
687 /*
688   take over an ip address
689  */
690 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
691                                  struct ctdb_req_control *c,
692                                  TDB_DATA indata,
693                                  bool *async_reply)
694 {
695         int ret;
696         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
697         struct ctdb_vnn *vnn;
698         bool have_ip = false;
699         bool do_updateip = false;
700         bool do_takeip = false;
701         struct ctdb_iface *best_iface = NULL;
702
703         if (pip->pnn != ctdb->pnn) {
704                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
705                                  "with pnn %d, but we're node %d\n",
706                                  ctdb_addr_to_str(&pip->addr),
707                                  pip->pnn, ctdb->pnn));
708                 return -1;
709         }
710
711         /* update out vnn list */
712         vnn = find_public_ip_vnn(ctdb, &pip->addr);
713         if (vnn == NULL) {
714                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
715                         ctdb_addr_to_str(&pip->addr)));
716                 return 0;
717         }
718
719         if (ctdb->do_checkpublicip) {
720                 have_ip = ctdb_sys_have_ip(&pip->addr);
721         }
722         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
723         if (best_iface == NULL) {
724                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
725                                  "a usable interface (old %s, have_ip %d)\n",
726                                  ctdb_addr_to_str(&vnn->public_address),
727                                  vnn->public_netmask_bits,
728                                  ctdb_vnn_iface_string(vnn),
729                                  have_ip));
730                 return -1;
731         }
732
733         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
734                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
735                 have_ip = false;
736         }
737
738
739         if (vnn->iface == NULL && have_ip) {
740                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
741                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
742                                  ctdb_addr_to_str(&vnn->public_address)));
743                 return 0;
744         }
745
746         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
747                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
748                                   "and we have it on iface[%s], but it was assigned to node %d"
749                                   "and we are node %d, banning ourself\n",
750                                  ctdb_addr_to_str(&vnn->public_address),
751                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
752                 ctdb_ban_self(ctdb);
753                 return -1;
754         }
755
756         if (vnn->pnn == -1 && have_ip) {
757                 vnn->pnn = ctdb->pnn;
758                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
759                                   "and we already have it on iface[%s], update local daemon\n",
760                                  ctdb_addr_to_str(&vnn->public_address),
761                                   ctdb_vnn_iface_string(vnn)));
762                 return 0;
763         }
764
765         if (vnn->iface) {
766                 if (vnn->iface != best_iface) {
767                         if (!vnn->iface->link_up) {
768                                 do_updateip = true;
769                         } else if (vnn->iface->references > (best_iface->references + 1)) {
770                                 /* only move when the rebalance gains something */
771                                         do_updateip = true;
772                         }
773                 }
774         }
775
776         if (!have_ip) {
777                 if (do_updateip) {
778                         ctdb_vnn_unassign_iface(ctdb, vnn);
779                         do_updateip = false;
780                 }
781                 do_takeip = true;
782         }
783
784         if (do_takeip) {
785                 ret = ctdb_do_takeip(ctdb, c, vnn);
786                 if (ret != 0) {
787                         return -1;
788                 }
789         } else if (do_updateip) {
790                 ret = ctdb_do_updateip(ctdb, c, vnn);
791                 if (ret != 0) {
792                         return -1;
793                 }
794         } else {
795                 /*
796                  * The interface is up and the kernel known the ip
797                  * => do nothing
798                  */
799                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
800                         ctdb_addr_to_str(&pip->addr),
801                         vnn->public_netmask_bits,
802                         ctdb_vnn_iface_string(vnn)));
803                 return 0;
804         }
805
806         /* tell ctdb_control.c that we will be replying asynchronously */
807         *async_reply = true;
808
809         return 0;
810 }
811
812 /*
813   takeover an ip address old v4 style
814  */
815 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
816                                 struct ctdb_req_control *c,
817                                 TDB_DATA indata, 
818                                 bool *async_reply)
819 {
820         TDB_DATA data;
821         
822         data.dsize = sizeof(struct ctdb_public_ip);
823         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
824         CTDB_NO_MEMORY(ctdb, data.dptr);
825         
826         memcpy(data.dptr, indata.dptr, indata.dsize);
827         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
828 }
829
830 /*
831   kill any clients that are registered with a IP that is being released
832  */
833 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
834 {
835         struct ctdb_client_ip *ip;
836
837         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
838                 ctdb_addr_to_str(addr)));
839
840         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
841                 ctdb_sock_addr tmp_addr;
842
843                 tmp_addr = ip->addr;
844                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
845                         ip->client_id,
846                         ctdb_addr_to_str(&ip->addr)));
847
848                 if (ctdb_same_ip(&tmp_addr, addr)) {
849                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
850                                                                      ip->client_id, 
851                                                                      struct ctdb_client);
852                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
853                                 ip->client_id,
854                                 ctdb_addr_to_str(&ip->addr),
855                                 client->pid));
856
857                         if (client->pid != 0) {
858                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
859                                         (unsigned)client->pid,
860                                         ctdb_addr_to_str(addr),
861                                         ip->client_id));
862                                 kill(client->pid, SIGKILL);
863                         }
864                 }
865         }
866 }
867
868 /*
869   called when releaseip event finishes
870  */
871 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
872                                 void *private_data)
873 {
874         struct takeover_callback_state *state = 
875                 talloc_get_type(private_data, struct takeover_callback_state);
876         TDB_DATA data;
877
878         if (status == -ETIME) {
879                 ctdb_ban_self(ctdb);
880         }
881
882         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
883                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
884                                   ctdb_addr_to_str(state->addr)));
885                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
886                 talloc_free(state);
887                 return;
888         }
889
890         /* send a message to all clients of this node telling them
891            that the cluster has been reconfigured and they should
892            release any sockets on this IP */
893         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
894         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
895         data.dsize = strlen((char *)data.dptr)+1;
896
897         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
898
899         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
900
901         /* kill clients that have registered with this IP */
902         release_kill_clients(ctdb, state->addr);
903
904         ctdb_vnn_unassign_iface(ctdb, state->vnn);
905
906         /* the control succeeded */
907         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
908         talloc_free(state);
909 }
910
911 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
912 {
913         state->vnn->update_in_flight = false;
914         return 0;
915 }
916
917 /*
918   release an ip address
919  */
920 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
921                                 struct ctdb_req_control *c,
922                                 TDB_DATA indata, 
923                                 bool *async_reply)
924 {
925         int ret;
926         struct takeover_callback_state *state;
927         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
928         struct ctdb_vnn *vnn;
929         char *iface;
930
931         /* update our vnn list */
932         vnn = find_public_ip_vnn(ctdb, &pip->addr);
933         if (vnn == NULL) {
934                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
935                         ctdb_addr_to_str(&pip->addr)));
936                 return 0;
937         }
938         vnn->pnn = pip->pnn;
939
940         /* stop any previous arps */
941         talloc_free(vnn->takeover_ctx);
942         vnn->takeover_ctx = NULL;
943
944         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
945          * lazy multicast to drop an IP from any node that isn't the
946          * intended new node.  The following causes makes ctdbd ignore
947          * a release for any address it doesn't host.
948          */
949         if (ctdb->do_checkpublicip) {
950                 if (!ctdb_sys_have_ip(&pip->addr)) {
951                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
952                                 ctdb_addr_to_str(&pip->addr),
953                                 vnn->public_netmask_bits,
954                                 ctdb_vnn_iface_string(vnn)));
955                         ctdb_vnn_unassign_iface(ctdb, vnn);
956                         return 0;
957                 }
958         } else {
959                 if (vnn->iface == NULL) {
960                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
961                                            ctdb_addr_to_str(&pip->addr),
962                                            vnn->public_netmask_bits));
963                         return 0;
964                 }
965         }
966
967         /* There is a potential race between take_ip and us because we
968          * update the VNN via a callback that run when the
969          * eventscripts have been run.  Avoid the race by allowing one
970          * update to be in flight at a time.
971          */
972         if (vnn->update_in_flight) {
973                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
974                                     "update for this IP already in flight\n",
975                                     ctdb_addr_to_str(&vnn->public_address),
976                                     vnn->public_netmask_bits));
977                 return -1;
978         }
979
980         if (ctdb->do_checkpublicip) {
981                 iface = ctdb_sys_find_ifname(&pip->addr);
982                 if (iface == NULL) {
983                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
984                         return 0;
985                 }
986                 if (vnn->iface == NULL) {
987                         DEBUG(DEBUG_WARNING,
988                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
989                                ctdb_addr_to_str(&pip->addr),
990                                iface));
991                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
992                         DEBUG(DEBUG_WARNING,
993                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
994                                ctdb_addr_to_str(&pip->addr),
995                                iface,
996                                ctdb_vnn_iface_string(vnn)));
997                         /* Should we fix vnn->iface?  If we do, what
998                          * happens to reference counts?
999                          */
1000                 }
1001         } else {
1002                 iface = strdup(ctdb_vnn_iface_string(vnn));
1003         }
1004
1005         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1006                 ctdb_addr_to_str(&pip->addr),
1007                 vnn->public_netmask_bits,
1008                 iface,
1009                 pip->pnn));
1010
1011         state = talloc(ctdb, struct takeover_callback_state);
1012         CTDB_NO_MEMORY(ctdb, state);
1013
1014         state->c = talloc_steal(state, c);
1015         state->addr = talloc(state, ctdb_sock_addr);       
1016         CTDB_NO_MEMORY(ctdb, state->addr);
1017         *state->addr = pip->addr;
1018         state->vnn   = vnn;
1019
1020         vnn->update_in_flight = true;
1021         talloc_set_destructor(state, ctdb_releaseip_destructor);
1022
1023         ret = ctdb_event_script_callback(ctdb, 
1024                                          state, release_ip_callback, state,
1025                                          CTDB_EVENT_RELEASE_IP,
1026                                          "%s %s %u",
1027                                          iface,
1028                                          ctdb_addr_to_str(&pip->addr),
1029                                          vnn->public_netmask_bits);
1030         free(iface);
1031         if (ret != 0) {
1032                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1033                         ctdb_addr_to_str(&pip->addr),
1034                         ctdb_vnn_iface_string(vnn)));
1035                 talloc_free(state);
1036                 return -1;
1037         }
1038
1039         /* tell the control that we will be reply asynchronously */
1040         *async_reply = true;
1041         return 0;
1042 }
1043
1044 /*
1045   release an ip address old v4 style
1046  */
1047 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1048                                 struct ctdb_req_control *c,
1049                                 TDB_DATA indata, 
1050                                 bool *async_reply)
1051 {
1052         TDB_DATA data;
1053         
1054         data.dsize = sizeof(struct ctdb_public_ip);
1055         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1056         CTDB_NO_MEMORY(ctdb, data.dptr);
1057         
1058         memcpy(data.dptr, indata.dptr, indata.dsize);
1059         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1060 }
1061
1062
1063 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1064                                    ctdb_sock_addr *addr,
1065                                    unsigned mask, const char *ifaces,
1066                                    bool check_address)
1067 {
1068         struct ctdb_vnn      *vnn;
1069         uint32_t num = 0;
1070         char *tmp;
1071         const char *iface;
1072         int i;
1073         int ret;
1074
1075         tmp = strdup(ifaces);
1076         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1077                 if (!ctdb_sys_check_iface_exists(iface)) {
1078                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1079                         free(tmp);
1080                         return -1;
1081                 }
1082         }
1083         free(tmp);
1084
1085         /* Verify that we dont have an entry for this ip yet */
1086         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1087                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1088                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1089                                 ctdb_addr_to_str(addr)));
1090                         return -1;
1091                 }               
1092         }
1093
1094         /* create a new vnn structure for this ip address */
1095         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1096         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1097         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1098         tmp = talloc_strdup(vnn, ifaces);
1099         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1100         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1101                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1102                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1103                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1104                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1105                 num++;
1106         }
1107         talloc_free(tmp);
1108         vnn->ifaces[num] = NULL;
1109         vnn->public_address      = *addr;
1110         vnn->public_netmask_bits = mask;
1111         vnn->pnn                 = -1;
1112         if (check_address) {
1113                 if (ctdb_sys_have_ip(addr)) {
1114                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1115                         vnn->pnn = ctdb->pnn;
1116                 }
1117         }
1118
1119         for (i=0; vnn->ifaces[i]; i++) {
1120                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121                 if (ret != 0) {
1122                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1123                                            "for public_address[%s]\n",
1124                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1125                         talloc_free(vnn);
1126                         return -1;
1127                 }
1128         }
1129
1130         DLIST_ADD(ctdb->vnn, vnn);
1131
1132         return 0;
1133 }
1134
1135 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1136                                   struct timeval t, void *private_data)
1137 {
1138         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1139                                                         struct ctdb_context);
1140         struct ctdb_vnn *vnn;
1141
1142         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1143                 int i;
1144
1145                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1146                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1147                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1148                                         vnn->ifaces[i],
1149                                         ctdb_addr_to_str(&vnn->public_address)));
1150                         }
1151                 }
1152         }
1153
1154         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1155                 timeval_current_ofs(30, 0), 
1156                 ctdb_check_interfaces_event, ctdb);
1157 }
1158
1159
1160 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1161 {
1162         if (ctdb->check_public_ifaces_ctx != NULL) {
1163                 talloc_free(ctdb->check_public_ifaces_ctx);
1164                 ctdb->check_public_ifaces_ctx = NULL;
1165         }
1166
1167         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1168         if (ctdb->check_public_ifaces_ctx == NULL) {
1169                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1170         }
1171
1172         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1173                 timeval_current_ofs(30, 0), 
1174                 ctdb_check_interfaces_event, ctdb);
1175
1176         return 0;
1177 }
1178
1179
1180 /*
1181   setup the public address lists from a file
1182 */
1183 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1184 {
1185         char **lines;
1186         int nlines;
1187         int i;
1188
1189         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1190         if (lines == NULL) {
1191                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1192                 return -1;
1193         }
1194         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1195                 nlines--;
1196         }
1197
1198         for (i=0;i<nlines;i++) {
1199                 unsigned mask;
1200                 ctdb_sock_addr addr;
1201                 const char *addrstr;
1202                 const char *ifaces;
1203                 char *tok, *line;
1204
1205                 line = lines[i];
1206                 while ((*line == ' ') || (*line == '\t')) {
1207                         line++;
1208                 }
1209                 if (*line == '#') {
1210                         continue;
1211                 }
1212                 if (strcmp(line, "") == 0) {
1213                         continue;
1214                 }
1215                 tok = strtok(line, " \t");
1216                 addrstr = tok;
1217                 tok = strtok(NULL, " \t");
1218                 if (tok == NULL) {
1219                         if (NULL == ctdb->default_public_interface) {
1220                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1221                                          i+1));
1222                                 talloc_free(lines);
1223                                 return -1;
1224                         }
1225                         ifaces = ctdb->default_public_interface;
1226                 } else {
1227                         ifaces = tok;
1228                 }
1229
1230                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1231                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1232                         talloc_free(lines);
1233                         return -1;
1234                 }
1235                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1236                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1237                         talloc_free(lines);
1238                         return -1;
1239                 }
1240         }
1241
1242
1243         talloc_free(lines);
1244         return 0;
1245 }
1246
1247 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1248                               const char *iface,
1249                               const char *ip)
1250 {
1251         struct ctdb_vnn *svnn;
1252         struct ctdb_iface *cur = NULL;
1253         bool ok;
1254         int ret;
1255
1256         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1257         CTDB_NO_MEMORY(ctdb, svnn);
1258
1259         svnn->ifaces = talloc_array(svnn, const char *, 2);
1260         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1261         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1262         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1263         svnn->ifaces[1] = NULL;
1264
1265         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1266         if (!ok) {
1267                 talloc_free(svnn);
1268                 return -1;
1269         }
1270
1271         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1272         if (ret != 0) {
1273                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1274                                    "for single_ip[%s]\n",
1275                                    svnn->ifaces[0],
1276                                    ctdb_addr_to_str(&svnn->public_address)));
1277                 talloc_free(svnn);
1278                 return -1;
1279         }
1280
1281         /* assume the single public ip interface is initially "good" */
1282         cur = ctdb_find_iface(ctdb, iface);
1283         if (cur == NULL) {
1284                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1285                 return -1;
1286         }
1287         cur->link_up = true;
1288
1289         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1290         if (ret != 0) {
1291                 talloc_free(svnn);
1292                 return -1;
1293         }
1294
1295         ctdb->single_ip_vnn = svnn;
1296         return 0;
1297 }
1298
1299 struct ctdb_public_ip_list {
1300         struct ctdb_public_ip_list *next;
1301         uint32_t pnn;
1302         ctdb_sock_addr addr;
1303 };
1304
1305 /* Given a physical node, return the number of
1306    public addresses that is currently assigned to this node.
1307 */
1308 static int node_ip_coverage(struct ctdb_context *ctdb, 
1309         int32_t pnn,
1310         struct ctdb_public_ip_list *ips)
1311 {
1312         int num=0;
1313
1314         for (;ips;ips=ips->next) {
1315                 if (ips->pnn == pnn) {
1316                         num++;
1317                 }
1318         }
1319         return num;
1320 }
1321
1322
1323 /* Can the given node host the given IP: is the public IP known to the
1324  * node and is NOIPHOST unset?
1325 */
1326 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1327                              struct ctdb_ipflags ipflags,
1328                              struct ctdb_public_ip_list *ip)
1329 {
1330         struct ctdb_all_public_ips *public_ips;
1331         int i;
1332
1333         if (ipflags.noiphost) {
1334                 return false;
1335         }
1336
1337         public_ips = ctdb->nodes[pnn]->available_public_ips;
1338
1339         if (public_ips == NULL) {
1340                 return false;
1341         }
1342
1343         for (i=0; i<public_ips->num; i++) {
1344                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1345                         /* yes, this node can serve this public ip */
1346                         return true;
1347                 }
1348         }
1349
1350         return false;
1351 }
1352
1353 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1354                                  struct ctdb_ipflags ipflags,
1355                                  struct ctdb_public_ip_list *ip)
1356 {
1357         if (ipflags.noiptakeover) {
1358                 return false;
1359         }
1360
1361         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1362 }
1363
1364 /* search the node lists list for a node to takeover this ip.
1365    pick the node that currently are serving the least number of ips
1366    so that the ips get spread out evenly.
1367 */
1368 static int find_takeover_node(struct ctdb_context *ctdb, 
1369                 struct ctdb_ipflags *ipflags,
1370                 struct ctdb_public_ip_list *ip,
1371                 struct ctdb_public_ip_list *all_ips)
1372 {
1373         int pnn, min=0, num;
1374         int i, numnodes;
1375
1376         numnodes = talloc_array_length(ipflags);
1377         pnn    = -1;
1378         for (i=0; i<numnodes; i++) {
1379                 /* verify that this node can serve this ip */
1380                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1381                         /* no it couldnt   so skip to the next node */
1382                         continue;
1383                 }
1384
1385                 num = node_ip_coverage(ctdb, i, all_ips);
1386                 /* was this the first node we checked ? */
1387                 if (pnn == -1) {
1388                         pnn = i;
1389                         min  = num;
1390                 } else {
1391                         if (num < min) {
1392                                 pnn = i;
1393                                 min  = num;
1394                         }
1395                 }
1396         }       
1397         if (pnn == -1) {
1398                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1399                         ctdb_addr_to_str(&ip->addr)));
1400
1401                 return -1;
1402         }
1403
1404         ip->pnn = pnn;
1405         return 0;
1406 }
1407
1408 #define IP_KEYLEN       4
1409 static uint32_t *ip_key(ctdb_sock_addr *ip)
1410 {
1411         static uint32_t key[IP_KEYLEN];
1412
1413         bzero(key, sizeof(key));
1414
1415         switch (ip->sa.sa_family) {
1416         case AF_INET:
1417                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1418                 break;
1419         case AF_INET6: {
1420                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1421                 key[0]  = htonl(s6_a32[0]);
1422                 key[1]  = htonl(s6_a32[1]);
1423                 key[2]  = htonl(s6_a32[2]);
1424                 key[3]  = htonl(s6_a32[3]);
1425                 break;
1426         }
1427         default:
1428                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1429                 return key;
1430         }
1431
1432         return key;
1433 }
1434
1435 static void *add_ip_callback(void *parm, void *data)
1436 {
1437         struct ctdb_public_ip_list *this_ip = parm; 
1438         struct ctdb_public_ip_list *prev_ip = data; 
1439
1440         if (prev_ip == NULL) {
1441                 return parm;
1442         }
1443         if (this_ip->pnn == -1) {
1444                 this_ip->pnn = prev_ip->pnn;
1445         }
1446
1447         return parm;
1448 }
1449
1450 static int getips_count_callback(void *param, void *data)
1451 {
1452         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1453         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1454
1455         new_ip->next = *ip_list;
1456         *ip_list     = new_ip;
1457         return 0;
1458 }
1459
1460 static struct ctdb_public_ip_list *
1461 create_merged_ip_list(struct ctdb_context *ctdb)
1462 {
1463         int i, j;
1464         struct ctdb_public_ip_list *ip_list;
1465         struct ctdb_all_public_ips *public_ips;
1466
1467         if (ctdb->ip_tree != NULL) {
1468                 talloc_free(ctdb->ip_tree);
1469                 ctdb->ip_tree = NULL;
1470         }
1471         ctdb->ip_tree = trbt_create(ctdb, 0);
1472
1473         for (i=0;i<ctdb->num_nodes;i++) {
1474                 public_ips = ctdb->nodes[i]->known_public_ips;
1475
1476                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1477                         continue;
1478                 }
1479
1480                 /* there were no public ips for this node */
1481                 if (public_ips == NULL) {
1482                         continue;
1483                 }               
1484
1485                 for (j=0;j<public_ips->num;j++) {
1486                         struct ctdb_public_ip_list *tmp_ip; 
1487
1488                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1489                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1490                         /* Do not use information about IP addresses hosted
1491                          * on other nodes, it may not be accurate */
1492                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1493                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1494                         } else {
1495                                 tmp_ip->pnn = -1;
1496                         }
1497                         tmp_ip->addr = public_ips->ips[j].addr;
1498                         tmp_ip->next = NULL;
1499
1500                         trbt_insertarray32_callback(ctdb->ip_tree,
1501                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1502                                 add_ip_callback,
1503                                 tmp_ip);
1504                 }
1505         }
1506
1507         ip_list = NULL;
1508         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1509
1510         return ip_list;
1511 }
1512
1513 /* 
1514  * This is the length of the longtest common prefix between the IPs.
1515  * It is calculated by XOR-ing the 2 IPs together and counting the
1516  * number of leading zeroes.  The implementation means that all
1517  * addresses end up being 128 bits long.
1518  *
1519  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1520  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1521  * lots of nodes and IP addresses?
1522  */
1523 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1524 {
1525         uint32_t ip1_k[IP_KEYLEN];
1526         uint32_t *t;
1527         int i;
1528         uint32_t x;
1529
1530         uint32_t distance = 0;
1531
1532         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1533         t = ip_key(ip2);
1534         for (i=0; i<IP_KEYLEN; i++) {
1535                 x = ip1_k[i] ^ t[i];
1536                 if (x == 0) {
1537                         distance += 32;
1538                 } else {
1539                         /* Count number of leading zeroes. 
1540                          * FIXME? This could be optimised...
1541                          */
1542                         while ((x & (1 << 31)) == 0) {
1543                                 x <<= 1;
1544                                 distance += 1;
1545                         }
1546                 }
1547         }
1548
1549         return distance;
1550 }
1551
1552 /* Calculate the IP distance for the given IP relative to IPs on the
1553    given node.  The ips argument is generally the all_ips variable
1554    used in the main part of the algorithm.
1555  */
1556 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1557                                   struct ctdb_public_ip_list *ips,
1558                                   int pnn)
1559 {
1560         struct ctdb_public_ip_list *t;
1561         uint32_t d;
1562
1563         uint32_t sum = 0;
1564
1565         for (t=ips; t != NULL; t=t->next) {
1566                 if (t->pnn != pnn) {
1567                         continue;
1568                 }
1569
1570                 /* Optimisation: We never calculate the distance
1571                  * between an address and itself.  This allows us to
1572                  * calculate the effect of removing an address from a
1573                  * node by simply calculating the distance between
1574                  * that address and all of the exitsing addresses.
1575                  * Moreover, we assume that we're only ever dealing
1576                  * with addresses from all_ips so we can identify an
1577                  * address via a pointer rather than doing a more
1578                  * expensive address comparison. */
1579                 if (&(t->addr) == ip) {
1580                         continue;
1581                 }
1582
1583                 d = ip_distance(ip, &(t->addr));
1584                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1585         }
1586
1587         return sum;
1588 }
1589
1590 /* Return the LCP2 imbalance metric for addresses currently assigned
1591    to the given node.
1592  */
1593 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1594 {
1595         struct ctdb_public_ip_list *t;
1596
1597         uint32_t imbalance = 0;
1598
1599         for (t=all_ips; t!=NULL; t=t->next) {
1600                 if (t->pnn != pnn) {
1601                         continue;
1602                 }
1603                 /* Pass the rest of the IPs rather than the whole
1604                    all_ips input list.
1605                 */
1606                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1607         }
1608
1609         return imbalance;
1610 }
1611
1612 /* Allocate any unassigned IPs just by looping through the IPs and
1613  * finding the best node for each.
1614  */
1615 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1616                                       struct ctdb_ipflags *ipflags,
1617                                       struct ctdb_public_ip_list *all_ips)
1618 {
1619         struct ctdb_public_ip_list *tmp_ip;
1620
1621         /* loop over all ip's and find a physical node to cover for 
1622            each unassigned ip.
1623         */
1624         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1625                 if (tmp_ip->pnn == -1) {
1626                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1627                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1628                                         ctdb_addr_to_str(&tmp_ip->addr)));
1629                         }
1630                 }
1631         }
1632 }
1633
1634 /* Basic non-deterministic rebalancing algorithm.
1635  */
1636 static void basic_failback(struct ctdb_context *ctdb,
1637                            struct ctdb_ipflags *ipflags,
1638                            struct ctdb_public_ip_list *all_ips,
1639                            int num_ips)
1640 {
1641         int i, numnodes;
1642         int maxnode, maxnum, minnode, minnum, num, retries;
1643         struct ctdb_public_ip_list *tmp_ip;
1644
1645         numnodes = talloc_array_length(ipflags);
1646         retries = 0;
1647
1648 try_again:
1649         maxnum=0;
1650         minnum=0;
1651
1652         /* for each ip address, loop over all nodes that can serve
1653            this ip and make sure that the difference between the node
1654            serving the most and the node serving the least ip's are
1655            not greater than 1.
1656         */
1657         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1658                 if (tmp_ip->pnn == -1) {
1659                         continue;
1660                 }
1661
1662                 /* Get the highest and lowest number of ips's served by any 
1663                    valid node which can serve this ip.
1664                 */
1665                 maxnode = -1;
1666                 minnode = -1;
1667                 for (i=0; i<numnodes; i++) {
1668                         /* only check nodes that can actually serve this ip */
1669                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1670                                 /* no it couldnt   so skip to the next node */
1671                                 continue;
1672                         }
1673
1674                         num = node_ip_coverage(ctdb, i, all_ips);
1675                         if (maxnode == -1) {
1676                                 maxnode = i;
1677                                 maxnum  = num;
1678                         } else {
1679                                 if (num > maxnum) {
1680                                         maxnode = i;
1681                                         maxnum  = num;
1682                                 }
1683                         }
1684                         if (minnode == -1) {
1685                                 minnode = i;
1686                                 minnum  = num;
1687                         } else {
1688                                 if (num < minnum) {
1689                                         minnode = i;
1690                                         minnum  = num;
1691                                 }
1692                         }
1693                 }
1694                 if (maxnode == -1) {
1695                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1696                                 ctdb_addr_to_str(&tmp_ip->addr)));
1697
1698                         continue;
1699                 }
1700
1701                 /* if the spread between the smallest and largest coverage by
1702                    a node is >=2 we steal one of the ips from the node with
1703                    most coverage to even things out a bit.
1704                    try to do this a limited number of times since we dont
1705                    want to spend too much time balancing the ip coverage.
1706                 */
1707                 if ( (maxnum > minnum+1)
1708                      && (retries < (num_ips + 5)) ){
1709                         struct ctdb_public_ip_list *tmp;
1710
1711                         /* Reassign one of maxnode's VNNs */
1712                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1713                                 if (tmp->pnn == maxnode) {
1714                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1715                                         retries++;
1716                                         goto try_again;;
1717                                 }
1718                         }
1719                 }
1720         }
1721 }
1722
1723 static void lcp2_init(struct ctdb_context *tmp_ctx,
1724                       struct ctdb_ipflags *ipflags,
1725                       struct ctdb_public_ip_list *all_ips,
1726                       uint32_t *force_rebalance_nodes,
1727                       uint32_t **lcp2_imbalances,
1728                       bool **rebalance_candidates)
1729 {
1730         int i, numnodes;
1731         struct ctdb_public_ip_list *tmp_ip;
1732
1733         numnodes = talloc_array_length(ipflags);
1734
1735         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1736         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1737         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1738         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1739
1740         for (i=0; i<numnodes; i++) {
1741                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1742                 /* First step: assume all nodes are candidates */
1743                 (*rebalance_candidates)[i] = true;
1744         }
1745
1746         /* 2nd step: if a node has IPs assigned then it must have been
1747          * healthy before, so we remove it from consideration.  This
1748          * is overkill but is all we have because we don't maintain
1749          * state between takeover runs.  An alternative would be to
1750          * keep state and invalidate it every time the recovery master
1751          * changes.
1752          */
1753         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1754                 if (tmp_ip->pnn != -1) {
1755                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1756                 }
1757         }
1758
1759         /* 3rd step: if a node is forced to re-balance then
1760            we allow failback onto the node */
1761         if (force_rebalance_nodes == NULL) {
1762                 return;
1763         }
1764         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1765                 uint32_t pnn = force_rebalance_nodes[i];
1766                 if (pnn >= numnodes) {
1767                         DEBUG(DEBUG_ERR,
1768                               (__location__ "unknown node %u\n", pnn));
1769                         continue;
1770                 }
1771
1772                 DEBUG(DEBUG_NOTICE,
1773                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1774                 (*rebalance_candidates)[pnn] = true;
1775         }
1776 }
1777
1778 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1779  * the IP/node combination that will cost the least.
1780  */
1781 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1782                                      struct ctdb_ipflags *ipflags,
1783                                      struct ctdb_public_ip_list *all_ips,
1784                                      uint32_t *lcp2_imbalances)
1785 {
1786         struct ctdb_public_ip_list *tmp_ip;
1787         int dstnode, numnodes;
1788
1789         int minnode;
1790         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1791         struct ctdb_public_ip_list *minip;
1792
1793         bool should_loop = true;
1794         bool have_unassigned = true;
1795
1796         numnodes = talloc_array_length(ipflags);
1797
1798         while (have_unassigned && should_loop) {
1799                 should_loop = false;
1800
1801                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1802                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1803
1804                 minnode = -1;
1805                 mindsum = 0;
1806                 minip = NULL;
1807
1808                 /* loop over each unassigned ip. */
1809                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1810                         if (tmp_ip->pnn != -1) {
1811                                 continue;
1812                         }
1813
1814                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1815                                 /* only check nodes that can actually takeover this ip */
1816                                 if (!can_node_takeover_ip(ctdb, dstnode,
1817                                                           ipflags[dstnode],
1818                                                           tmp_ip)) {
1819                                         /* no it couldnt   so skip to the next node */
1820                                         continue;
1821                                 }
1822
1823                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1824                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1825                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1826                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1827                                                    dstnode,
1828                                                    dstimbl - lcp2_imbalances[dstnode]));
1829
1830
1831                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1832                                         minnode = dstnode;
1833                                         minimbl = dstimbl;
1834                                         mindsum = dstdsum;
1835                                         minip = tmp_ip;
1836                                         should_loop = true;
1837                                 }
1838                         }
1839                 }
1840
1841                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1842
1843                 /* If we found one then assign it to the given node. */
1844                 if (minnode != -1) {
1845                         minip->pnn = minnode;
1846                         lcp2_imbalances[minnode] = minimbl;
1847                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1848                                           ctdb_addr_to_str(&(minip->addr)),
1849                                           minnode,
1850                                           mindsum));
1851                 }
1852
1853                 /* There might be a better way but at least this is clear. */
1854                 have_unassigned = false;
1855                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1856                         if (tmp_ip->pnn == -1) {
1857                                 have_unassigned = true;
1858                         }
1859                 }
1860         }
1861
1862         /* We know if we have an unassigned addresses so we might as
1863          * well optimise.
1864          */
1865         if (have_unassigned) {
1866                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1867                         if (tmp_ip->pnn == -1) {
1868                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1869                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1870                         }
1871                 }
1872         }
1873 }
1874
1875 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1876  * to move IPs from, determines the best IP/destination node
1877  * combination to move from the source node.
1878  */
1879 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1880                                     struct ctdb_ipflags *ipflags,
1881                                     struct ctdb_public_ip_list *all_ips,
1882                                     int srcnode,
1883                                     uint32_t candimbl,
1884                                     uint32_t *lcp2_imbalances,
1885                                     bool *rebalance_candidates)
1886 {
1887         int dstnode, mindstnode, numnodes;
1888         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1889         uint32_t minsrcimbl, mindstimbl;
1890         struct ctdb_public_ip_list *minip;
1891         struct ctdb_public_ip_list *tmp_ip;
1892
1893         /* Find an IP and destination node that best reduces imbalance. */
1894         srcimbl = 0;
1895         minip = NULL;
1896         minsrcimbl = 0;
1897         mindstnode = -1;
1898         mindstimbl = 0;
1899
1900         numnodes = talloc_array_length(ipflags);
1901
1902         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1903         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1904
1905         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1906                 /* Only consider addresses on srcnode. */
1907                 if (tmp_ip->pnn != srcnode) {
1908                         continue;
1909                 }
1910
1911                 /* What is this IP address costing the source node? */
1912                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1913                 srcimbl = candimbl - srcdsum;
1914
1915                 /* Consider this IP address would cost each potential
1916                  * destination node.  Destination nodes are limited to
1917                  * those that are newly healthy, since we don't want
1918                  * to do gratuitous failover of IPs just to make minor
1919                  * balance improvements.
1920                  */
1921                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1922                         if (!rebalance_candidates[dstnode]) {
1923                                 continue;
1924                         }
1925
1926                         /* only check nodes that can actually takeover this ip */
1927                         if (!can_node_takeover_ip(ctdb, dstnode,
1928                                                   ipflags[dstnode], tmp_ip)) {
1929                                 /* no it couldnt   so skip to the next node */
1930                                 continue;
1931                         }
1932
1933                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1934                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1935                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1936                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1937                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1938                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1939
1940                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1941                             ((mindstnode == -1) ||                              \
1942                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1943
1944                                 minip = tmp_ip;
1945                                 minsrcimbl = srcimbl;
1946                                 mindstnode = dstnode;
1947                                 mindstimbl = dstimbl;
1948                         }
1949                 }
1950         }
1951         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1952
1953         if (mindstnode != -1) {
1954                 /* We found a move that makes things better... */
1955                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1956                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1957                                   ctdb_addr_to_str(&(minip->addr)),
1958                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1959
1960
1961                 lcp2_imbalances[srcnode] = srcimbl;
1962                 lcp2_imbalances[mindstnode] = mindstimbl;
1963                 minip->pnn = mindstnode;
1964
1965                 return true;
1966         }
1967
1968         return false;
1969         
1970 }
1971
/* Pairs a node number with its LCP2 imbalance so that the nodes can
 * be sorted by imbalance.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders
 * entries by descending imbalance (largest first).
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1990
1991 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1992  * node with the highest LCP2 imbalance, and then determines the best
1993  * IP/destination node combination to move from the source node.
1994  */
1995 static void lcp2_failback(struct ctdb_context *ctdb,
1996                           struct ctdb_ipflags *ipflags,
1997                           struct ctdb_public_ip_list *all_ips,
1998                           uint32_t *lcp2_imbalances,
1999                           bool *rebalance_candidates)
2000 {
2001         int i, num_rebalance_candidates, numnodes;
2002         struct lcp2_imbalance_pnn * lips;
2003         bool again;
2004
2005         numnodes = talloc_array_length(ipflags);
2006
2007 try_again:
2008
2009         /* It is only worth continuing if we have suitable target
2010          * nodes to transfer IPs to.  This check is much cheaper than
2011          * continuing on...
2012          */
2013         num_rebalance_candidates = 0;
2014         for (i=0; i<numnodes; i++) {
2015                 if (rebalance_candidates[i]) {
2016                         num_rebalance_candidates++;
2017                 }
2018         }
2019         if (num_rebalance_candidates == 0) {
2020                 return;
2021         }
2022
2023         /* Put the imbalances and nodes into an array, sort them and
2024          * iterate through candidates.  Usually the 1st one will be
2025          * used, so this doesn't cost much...
2026          */
2027         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2028         for (i=0; i<numnodes; i++) {
2029                 lips[i].imbalance = lcp2_imbalances[i];
2030                 lips[i].pnn = i;
2031         }
2032         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2033               lcp2_cmp_imbalance_pnn);
2034
2035         again = false;
2036         for (i=0; i<numnodes; i++) {
2037                 /* This means that all nodes had 0 or 1 addresses, so
2038                  * can't be imbalanced.
2039                  */
2040                 if (lips[i].imbalance == 0) {
2041                         break;
2042                 }
2043
2044                 if (lcp2_failback_candidate(ctdb,
2045                                             ipflags,
2046                                             all_ips,
2047                                             lips[i].pnn,
2048                                             lips[i].imbalance,
2049                                             lcp2_imbalances,
2050                                             rebalance_candidates)) {
2051                         again = true;
2052                         break;
2053                 }
2054         }
2055
2056         talloc_free(lips);
2057         if (again) {
2058                 goto try_again;
2059         }
2060 }
2061
2062 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2063                                     struct ctdb_ipflags *ipflags,
2064                                     struct ctdb_public_ip_list *all_ips)
2065 {
2066         struct ctdb_public_ip_list *tmp_ip;
2067
2068         /* verify that the assigned nodes can serve that public ip
2069            and set it to -1 if not
2070         */
2071         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2072                 if (tmp_ip->pnn == -1) {
2073                         continue;
2074                 }
2075                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2076                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2077                         /* this node can not serve this ip. */
2078                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2079                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2080                                            tmp_ip->pnn));
2081                         tmp_ip->pnn = -1;
2082                 }
2083         }
2084 }
2085
2086 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2087                                        struct ctdb_ipflags *ipflags,
2088                                        struct ctdb_public_ip_list *all_ips)
2089 {
2090         struct ctdb_public_ip_list *tmp_ip;
2091         int i, numnodes;
2092
2093         numnodes = talloc_array_length(ipflags);
2094
2095         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2096        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2097         *  always be allocated the same way for a specific set of
2098         *  available/unavailable nodes.
2099         */
2100
2101         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2102                 tmp_ip->pnn = i % numnodes;
2103         }
2104
2105         /* IP failback doesn't make sense with deterministic
2106          * IPs, since the modulo step above implicitly fails
2107          * back IPs to their "home" node.
2108          */
2109         if (1 == ctdb->tunable.no_ip_failback) {
2110                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2111         }
2112
2113         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2114
2115         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2116
2117         /* No failback here! */
2118 }
2119
2120 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2121                                           struct ctdb_ipflags *ipflags,
2122                                           struct ctdb_public_ip_list *all_ips)
2123 {
2124         /* This should be pushed down into basic_failback. */
2125         struct ctdb_public_ip_list *tmp_ip;
2126         int num_ips = 0;
2127         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2128                 num_ips++;
2129         }
2130
2131         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2132
2133         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2134
2135         /* If we don't want IPs to fail back then don't rebalance IPs. */
2136         if (1 == ctdb->tunable.no_ip_failback) {
2137                 return;
2138         }
2139
2140         /* Now, try to make sure the ip adresses are evenly distributed
2141            across the nodes.
2142         */
2143         basic_failback(ctdb, ipflags, all_ips, num_ips);
2144 }
2145
2146 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2147                           struct ctdb_ipflags *ipflags,
2148                           struct ctdb_public_ip_list *all_ips,
2149                           uint32_t *force_rebalance_nodes)
2150 {
2151         uint32_t *lcp2_imbalances;
2152         bool *rebalance_candidates;
2153
2154         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2155
2156         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2157
2158         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2159                   &lcp2_imbalances, &rebalance_candidates);
2160
2161         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2162
2163         /* If we don't want IPs to fail back then don't rebalance IPs. */
2164         if (1 == ctdb->tunable.no_ip_failback) {
2165                 goto finished;
2166         }
2167
2168         /* Now, try to make sure the ip adresses are evenly distributed
2169            across the nodes.
2170         */
2171         lcp2_failback(ctdb, ipflags, all_ips,
2172                       lcp2_imbalances, rebalance_candidates);
2173
2174 finished:
2175         talloc_free(tmp_ctx);
2176 }
2177
2178 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2179 {
2180         int i, num_healthy;
2181
2182         /* Count how many completely healthy nodes we have */
2183         num_healthy = 0;
2184         for (i=0;i<nodemap->num;i++) {
2185                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2186                         num_healthy++;
2187                 }
2188         }
2189
2190         return num_healthy == 0;
2191 }
2192
2193 /* The calculation part of the IP allocation algorithm. */
2194 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2195                                    struct ctdb_ipflags *ipflags,
2196                                    struct ctdb_public_ip_list **all_ips_p,
2197                                    uint32_t *force_rebalance_nodes)
2198 {
2199         /* since nodes only know about those public addresses that
2200            can be served by that particular node, no single node has
2201            a full list of all public addresses that exist in the cluster.
2202            Walk over all node structures and create a merged list of
2203            all public addresses that exist in the cluster.
2204
2205            keep the tree of ips around as ctdb->ip_tree
2206         */
2207         *all_ips_p = create_merged_ip_list(ctdb);
2208
2209         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2210                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2211         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2212                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2213         } else {
2214                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2215         }
2216
2217         /* at this point ->pnn is the node which will own each IP
2218            or -1 if there is no node that can cover this ip
2219         */
2220
2221         return;
2222 }
2223
/* State shared between get_tunable_from_nodes() and its per-node
 * reply/failure callbacks.
 */
struct get_tunable_callback_data {
        /* name of the tunable being fetched; used in log messages */
        const char *tunable;
        /* talloc array, indexed by pnn, receiving each node's value */
        uint32_t *out;
        /* set by the callbacks on errors treated as fatal */
        bool fatal;
};
2229
2230 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2231                                  int32_t res, TDB_DATA outdata,
2232                                  void *callback)
2233 {
2234         struct get_tunable_callback_data *cd =
2235                 (struct get_tunable_callback_data *)callback;
2236         int size;
2237
2238         if (res != 0) {
2239                 /* Already handled in fail callback */
2240                 return;
2241         }
2242
2243         if (outdata.dsize != sizeof(uint32_t)) {
2244                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2245                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2246                                  (int)outdata.dsize));
2247                 cd->fatal = true;
2248                 return;
2249         }
2250
2251         size = talloc_array_length(cd->out);
2252         if (pnn >= size) {
2253                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2254                                  cd->tunable, pnn, size));
2255                 return;
2256         }
2257
2258                 
2259         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2260 }
2261
2262 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2263                                        int32_t res, TDB_DATA outdata,
2264                                        void *callback)
2265 {
2266         struct get_tunable_callback_data *cd =
2267                 (struct get_tunable_callback_data *)callback;
2268
2269         switch (res) {
2270         case -ETIME:
2271                 DEBUG(DEBUG_ERR,
2272                       ("Timed out getting tunable \"%s\" from node %d\n",
2273                        cd->tunable, pnn));
2274                 cd->fatal = true;
2275                 break;
2276         case -EINVAL:
2277         case -1:
2278                 DEBUG(DEBUG_WARNING,
2279                       ("Tunable \"%s\" not implemented on node %d\n",
2280                        cd->tunable, pnn));
2281                 break;
2282         default:
2283                 DEBUG(DEBUG_ERR,
2284                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2285                        cd->tunable, pnn));
2286                 cd->fatal = true;
2287         }
2288 }
2289
2290 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2291                                         TALLOC_CTX *tmp_ctx,
2292                                         struct ctdb_node_map *nodemap,
2293                                         const char *tunable,
2294                                         uint32_t default_value)
2295 {
2296         TDB_DATA data;
2297         struct ctdb_control_get_tunable *t;
2298         uint32_t *nodes;
2299         uint32_t *tvals;
2300         struct get_tunable_callback_data callback_data;
2301         int i;
2302
2303         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2304         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2305         for (i=0; i<nodemap->num; i++) {
2306                 tvals[i] = default_value;
2307         }
2308                 
2309         callback_data.out = tvals;
2310         callback_data.tunable = tunable;
2311         callback_data.fatal = false;
2312
2313         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2314         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2315         t = (struct ctdb_control_get_tunable *)data.dptr;
2316         t->length = strlen(tunable)+1;
2317         memcpy(t->name, tunable, t->length);
2318         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2319         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2320                                       nodes, 0, TAKEOVER_TIMEOUT(),
2321                                       false, data,
2322                                       get_tunable_callback,
2323                                       get_tunable_fail_callback,
2324                                       &callback_data) != 0) {
2325                 if (callback_data.fatal) {
2326                         talloc_free(tvals);
2327                         tvals = NULL;
2328                 }
2329         }
2330         talloc_free(nodes);
2331         talloc_free(data.dptr);
2332
2333         return tvals;
2334 }
2335
/* State handed to the per-node runstate reply/failure callbacks. */
struct get_runstate_callback_data {
        /* talloc array, indexed by pnn, receiving each node's runstate */
        enum ctdb_runstate *out;
        /* set by the callbacks on errors treated as fatal */
        bool fatal;
};
2340
2341 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2342                                   int32_t res, TDB_DATA outdata,
2343                                   void *callback_data)
2344 {
2345         struct get_runstate_callback_data *cd =
2346                 (struct get_runstate_callback_data *)callback_data;
2347         int size;
2348
2349         if (res != 0) {
2350                 /* Already handled in fail callback */
2351                 return;
2352         }
2353
2354         if (outdata.dsize != sizeof(uint32_t)) {
2355                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2356                                  pnn, (int)sizeof(uint32_t),
2357                                  (int)outdata.dsize));
2358                 cd->fatal = true;
2359                 return;
2360         }
2361
2362         size = talloc_array_length(cd->out);
2363         if (pnn >= size) {
2364                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2365                                  pnn, size));
2366                 return;
2367         }
2368
2369         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2370 }
2371
2372 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2373                                        int32_t res, TDB_DATA outdata,
2374                                        void *callback)
2375 {
2376         struct get_runstate_callback_data *cd =
2377                 (struct get_runstate_callback_data *)callback;
2378
2379         switch (res) {
2380         case -ETIME:
2381                 DEBUG(DEBUG_ERR,
2382                       ("Timed out getting runstate from node %d\n", pnn));
2383                 cd->fatal = true;
2384                 break;
2385         default:
2386                 DEBUG(DEBUG_WARNING,
2387                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2388                        pnn));
2389         }
2390 }
2391
2392 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2393                                                     TALLOC_CTX *tmp_ctx,
2394                                                     struct ctdb_node_map *nodemap,
2395                                                     enum ctdb_runstate default_value)
2396 {
2397         uint32_t *nodes;
2398         enum ctdb_runstate *rs;
2399         struct get_runstate_callback_data callback_data;
2400         int i;
2401
2402         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2403         CTDB_NO_MEMORY_NULL(ctdb, rs);
2404         for (i=0; i<nodemap->num; i++) {
2405                 rs[i] = default_value;
2406         }
2407
2408         callback_data.out = rs;
2409         callback_data.fatal = false;
2410
2411         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2412         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2413                                       nodes, 0, TAKEOVER_TIMEOUT(),
2414                                       true, tdb_null,
2415                                       get_runstate_callback,
2416                                       get_runstate_fail_callback,
2417                                       &callback_data) != 0) {
2418                 if (callback_data.fatal) {
2419                         free(rs);
2420                         rs = NULL;
2421                 }
2422         }
2423         talloc_free(nodes);
2424
2425         return rs;
2426 }
2427
2428 /* Set internal flags for IP allocation:
2429  *   Clear ip flags
2430  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2431  *   Set NOIPHOST ip flag for each INACTIVE node
2432  *   if all nodes are disabled:
2433  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2434  *   else
2435  *     Set NOIPHOST ip flags for disabled nodes
2436  */
2437 static struct ctdb_ipflags *
2438 set_ipflags_internal(struct ctdb_context *ctdb,
2439                      TALLOC_CTX *tmp_ctx,
2440                      struct ctdb_node_map *nodemap,
2441                      uint32_t *tval_noiptakeover,
2442                      uint32_t *tval_noiphostonalldisabled,
2443                      enum ctdb_runstate *runstate)
2444 {
2445         int i;
2446         struct ctdb_ipflags *ipflags;
2447
2448         /* Clear IP flags - implicit due to talloc_zero */
2449         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2450         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2451
2452         for (i=0;i<nodemap->num;i++) {
2453                 /* Can not take IPs on node with NoIPTakeover set */
2454                 if (tval_noiptakeover[i] != 0) {
2455                         ipflags[i].noiptakeover = true;
2456                 }
2457
2458                 /* Can not host IPs on node not in RUNNING state */
2459                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2460                         ipflags[i].noiphost = true;
2461                         continue;
2462                 }
2463                 /* Can not host IPs on INACTIVE node */
2464                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2465                         ipflags[i].noiphost = true;
2466                 }
2467         }
2468
2469         if (all_nodes_are_disabled(nodemap)) {
2470                 /* If all nodes are disabled, can not host IPs on node
2471                  * with NoIPHostOnAllDisabled set
2472                  */
2473                 for (i=0;i<nodemap->num;i++) {
2474                         if (tval_noiphostonalldisabled[i] != 0) {
2475                                 ipflags[i].noiphost = true;
2476                         }
2477                 }
2478         } else {
2479                 /* If some nodes are not disabled, then can not host
2480                  * IPs on DISABLED node
2481                  */
2482                 for (i=0;i<nodemap->num;i++) {
2483                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2484                                 ipflags[i].noiphost = true;
2485                         }
2486                 }
2487         }
2488
2489         return ipflags;
2490 }
2491
2492 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2493                                         TALLOC_CTX *tmp_ctx,
2494                                         struct ctdb_node_map *nodemap)
2495 {
2496         uint32_t *tval_noiptakeover;
2497         uint32_t *tval_noiphostonalldisabled;
2498         struct ctdb_ipflags *ipflags;
2499         enum ctdb_runstate *runstate;
2500
2501
2502         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2503                                                    "NoIPTakeover", 0);
2504         if (tval_noiptakeover == NULL) {
2505                 return NULL;
2506         }
2507
2508         tval_noiphostonalldisabled =
2509                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2510                                        "NoIPHostOnAllDisabled", 0);
2511         if (tval_noiphostonalldisabled == NULL) {
2512                 /* Caller frees tmp_ctx */
2513                 return NULL;
2514         }
2515
2516         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2517          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2518          * reasonable behaviour on a mixed cluster during upgrade.
2519          */
2520         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2521                                            CTDB_RUNSTATE_RUNNING);
2522         if (runstate == NULL) {
2523                 /* Caller frees tmp_ctx */
2524                 return NULL;
2525         }
2526
2527         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2528                                        tval_noiptakeover,
2529                                        tval_noiphostonalldisabled,
2530                                        runstate);
2531
2532         talloc_free(tval_noiptakeover);
2533         talloc_free(tval_noiphostonalldisabled);
2534         talloc_free(runstate);
2535
2536         return ipflags;
2537 }
2538
2539 struct iprealloc_callback_data {
2540         bool *retry_nodes;
2541         int retry_count;
2542         client_async_callback fail_callback;
2543         void *fail_callback_data;
2544         struct ctdb_node_map *nodemap;
2545 };
2546
2547 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2548                                         int32_t res, TDB_DATA outdata,
2549                                         void *callback)
2550 {
2551         int numnodes;
2552         struct iprealloc_callback_data *cd =
2553                 (struct iprealloc_callback_data *)callback;
2554
2555         numnodes = talloc_array_length(cd->retry_nodes);
2556         if (pnn > numnodes) {
2557                 DEBUG(DEBUG_ERR,
2558                       ("ipreallocated failure from node %d, "
2559                        "but only %d nodes in nodemap\n",
2560                        pnn, numnodes));
2561                 return;
2562         }
2563
2564         /* Can't run the "ipreallocated" event on a INACTIVE node */
2565         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2566                 DEBUG(DEBUG_WARNING,
2567                       ("ipreallocated failed on inactive node %d, ignoring\n",
2568                        pnn));
2569                 return;
2570         }
2571
2572         switch (res) {
2573         case -ETIME:
2574                 /* If the control timed out then that's a real error,
2575                  * so call the real fail callback
2576                  */
2577                 cd->fail_callback(ctdb, pnn, res, outdata,
2578                                   cd->fail_callback_data);
2579                 break;
2580         default:
2581                 /* If not a timeout then either the ipreallocated
2582                  * eventscript (or some setup) failed.  This might
2583                  * have failed because the IPREALLOCATED control isn't
2584                  * implemented - right now there is no way of knowing
2585                  * because the error codes are all folded down to -1.
2586                  * Consider retrying using EVENTSCRIPT control...
2587                  */
2588                 DEBUG(DEBUG_WARNING,
2589                       ("ipreallocated failure from node %d, flagging retry\n",
2590                        pnn));
2591                 cd->retry_nodes[pnn] = true;
2592                 cd->retry_count++;
2593         }
2594 }
2595
2596 struct takeover_callback_data {
2597         bool *node_failed;
2598         client_async_callback fail_callback;
2599         void *fail_callback_data;
2600         struct ctdb_node_map *nodemap;
2601 };
2602
2603 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2604                                        uint32_t node_pnn, int32_t res,
2605                                        TDB_DATA outdata, void *callback_data)
2606 {
2607         struct takeover_callback_data *cd =
2608                 talloc_get_type_abort(callback_data,
2609                                       struct takeover_callback_data);
2610         int i;
2611
2612         for (i = 0; i < cd->nodemap->num; i++) {
2613                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2614                         break;
2615                 }
2616         }
2617
2618         if (i == cd->nodemap->num) {
2619                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2620                 return;
2621         }
2622
2623         if (!cd->node_failed[i]) {
2624                 cd->node_failed[i] = true;
2625                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2626                                   cd->fail_callback_data);
2627         }
2628 }
2629
2630 /*
2631   make any IP alias changes for public addresses that are necessary 
2632  */
2633 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2634                       uint32_t *force_rebalance_nodes,
2635                       client_async_callback fail_callback, void *callback_data)
2636 {
2637         int i, j, ret;
2638         struct ctdb_public_ip ip;
2639         struct ctdb_public_ipv4 ipv4;
2640         uint32_t *nodes;
2641         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2642         TDB_DATA data;
2643         struct timeval timeout;
2644         struct client_async_data *async_data;
2645         struct ctdb_client_control_state *state;
2646         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2647         struct ctdb_ipflags *ipflags;
2648         struct takeover_callback_data *takeover_data;
2649         struct iprealloc_callback_data iprealloc_data;
2650         bool *retry_data;
2651
2652         /*
2653          * ip failover is completely disabled, just send out the 
2654          * ipreallocated event.
2655          */
2656         if (ctdb->tunable.disable_ip_failover != 0) {
2657                 goto ipreallocated;
2658         }
2659
2660         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2661         if (ipflags == NULL) {
2662                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2663                 talloc_free(tmp_ctx);
2664                 return -1;
2665         }
2666
2667         ZERO_STRUCT(ip);
2668
2669         /* Do the IP reassignment calculations */
2670         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2671
2672         /* Now tell all nodes to release any public IPs should not
2673          * host.  This will be a NOOP on nodes that don't currently
2674          * hold the given IP.
2675          */
2676         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2677         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2678
2679         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2680                                                        bool, nodemap->num);
2681         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2682         takeover_data->fail_callback = fail_callback;
2683         takeover_data->fail_callback_data = callback_data;
2684         takeover_data->nodemap = nodemap;
2685
2686         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2687         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2688
2689         async_data->fail_callback = takeover_run_fail_callback;
2690         async_data->callback_data = takeover_data;
2691
2692         for (i=0;i<nodemap->num;i++) {
2693                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2694                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2695                         continue;
2696                 }
2697
2698                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2699                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2700                                 /* This node should be serving this
2701                                    vnn so dont tell it to release the ip
2702                                 */
2703                                 continue;
2704                         }
2705                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2706                                 ipv4.pnn = tmp_ip->pnn;
2707                                 ipv4.sin = tmp_ip->addr.ip;
2708
2709                                 timeout = TAKEOVER_TIMEOUT();
2710                                 data.dsize = sizeof(ipv4);
2711                                 data.dptr  = (uint8_t *)&ipv4;
2712                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2713                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2714                                                 data, async_data,
2715                                                 &timeout, NULL);
2716                         } else {
2717                                 ip.pnn  = tmp_ip->pnn;
2718                                 ip.addr = tmp_ip->addr;
2719
2720                                 timeout = TAKEOVER_TIMEOUT();
2721                                 data.dsize = sizeof(ip);
2722                                 data.dptr  = (uint8_t *)&ip;
2723                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2724                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2725                                                 data, async_data,
2726                                                 &timeout, NULL);
2727                         }
2728
2729                         if (state == NULL) {
2730                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2731                                 talloc_free(tmp_ctx);
2732                                 return -1;
2733                         }
2734                 
2735                         ctdb_client_async_add(async_data, state);
2736                 }
2737         }
2738         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2739                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2740                 talloc_free(tmp_ctx);
2741                 return -1;
2742         }
2743         talloc_free(async_data);
2744
2745
2746         /* tell all nodes to get their own IPs */
2747         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2748         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2749
2750         async_data->fail_callback = fail_callback;
2751         async_data->callback_data = callback_data;
2752
2753         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2754                 if (tmp_ip->pnn == -1) {
2755                         /* this IP won't be taken over */
2756                         continue;
2757                 }
2758
2759                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2760                         ipv4.pnn = tmp_ip->pnn;
2761                         ipv4.sin = tmp_ip->addr.ip;
2762
2763                         timeout = TAKEOVER_TIMEOUT();
2764                         data.dsize = sizeof(ipv4);
2765                         data.dptr  = (uint8_t *)&ipv4;
2766                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2767                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2768                                         data, async_data,
2769                                         &timeout, NULL);
2770                 } else {
2771                         ip.pnn  = tmp_ip->pnn;
2772                         ip.addr = tmp_ip->addr;
2773
2774                         timeout = TAKEOVER_TIMEOUT();
2775                         data.dsize = sizeof(ip);
2776                         data.dptr  = (uint8_t *)&ip;
2777                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2778                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2779                                         data, async_data,
2780                                         &timeout, NULL);
2781                 }
2782                 if (state == NULL) {
2783                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2784                         talloc_free(tmp_ctx);
2785                         return -1;
2786                 }
2787                 
2788                 ctdb_client_async_add(async_data, state);
2789         }
2790         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2791                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2792                 talloc_free(tmp_ctx);
2793                 return -1;
2794         }
2795
2796 ipreallocated:
2797         /* 
2798          * Tell all nodes to run eventscripts to process the
2799          * "ipreallocated" event.  This can do a lot of things,
2800          * including restarting services to reconfigure them if public
2801          * IPs have moved.  Once upon a time this event only used to
2802          * update natwg.
2803          */
2804         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2805         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2806         iprealloc_data.retry_nodes = retry_data;
2807         iprealloc_data.retry_count = 0;
2808         iprealloc_data.fail_callback = fail_callback;
2809         iprealloc_data.fail_callback_data = callback_data;
2810         iprealloc_data.nodemap = nodemap;
2811
2812         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2813         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2814                                         nodes, 0, TAKEOVER_TIMEOUT(),
2815                                         false, tdb_null,
2816                                         NULL, iprealloc_fail_callback,
2817                                         &iprealloc_data);
2818         if (ret != 0) {
2819                 /* If the control failed then we should retry to any
2820                  * nodes flagged by iprealloc_fail_callback using the
2821                  * EVENTSCRIPT control.  This is a best-effort at
2822                  * backward compatiblity when running a mixed cluster
2823                  * where some nodes have not yet been upgraded to
2824                  * support the IPREALLOCATED control.
2825                  */
2826                 DEBUG(DEBUG_WARNING,
2827                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2828
2829                 nodes = talloc_array(tmp_ctx, uint32_t,
2830                                      iprealloc_data.retry_count);
2831                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2832
2833                 j = 0;
2834                 for (i=0; i<nodemap->num; i++) {
2835                         if (iprealloc_data.retry_nodes[i]) {
2836                                 nodes[j] = i;
2837                                 j++;
2838                         }
2839                 }
2840
2841                 data.dptr  = discard_const("ipreallocated");
2842                 data.dsize = strlen((char *)data.dptr) + 1; 
2843                 ret = ctdb_client_async_control(ctdb,
2844                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2845                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2846                                                 false, data,
2847                                                 NULL, fail_callback,
2848                                                 callback_data);
2849                 if (ret != 0) {
2850                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2851                 }
2852         }
2853
2854         talloc_free(tmp_ctx);
2855         return ret;
2856 }
2857
2858
2859 /*
2860   destroy a ctdb_client_ip structure
2861  */
2862 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2863 {
2864         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2865                 ctdb_addr_to_str(&ip->addr),
2866                 ntohs(ip->addr.ip.sin_port),
2867                 ip->client_id));
2868
2869         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2870         return 0;
2871 }
2872
2873 /*
2874   called by a client to inform us of a TCP connection that it is managing
2875   that should tickled with an ACK when IP takeover is done
2876   we handle both the old ipv4 style of packets as well as the new ipv4/6
2877   pdus.
2878  */
2879 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2880                                 TDB_DATA indata)
2881 {
2882         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2883         struct ctdb_control_tcp *old_addr = NULL;
2884         struct ctdb_control_tcp_addr new_addr;
2885         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2886         struct ctdb_tcp_list *tcp;
2887         struct ctdb_tcp_connection t;
2888         int ret;
2889         TDB_DATA data;
2890         struct ctdb_client_ip *ip;
2891         struct ctdb_vnn *vnn;
2892         ctdb_sock_addr addr;
2893
2894         switch (indata.dsize) {
2895         case sizeof(struct ctdb_control_tcp):
2896                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2897                 ZERO_STRUCT(new_addr);
2898                 tcp_sock = &new_addr;
2899                 tcp_sock->src.ip  = old_addr->src;
2900                 tcp_sock->dest.ip = old_addr->dest;
2901                 break;
2902         case sizeof(struct ctdb_control_tcp_addr):
2903                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2904                 break;
2905         default:
2906                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2907                                  "to ctdb_control_tcp_client. size was %d but "
2908                                  "only allowed sizes are %lu and %lu\n",
2909                                  (int)indata.dsize,
2910                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2911                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2912                 return -1;
2913         }
2914
2915         addr = tcp_sock->src;
2916         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2917         addr = tcp_sock->dest;
2918         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2919
2920         ZERO_STRUCT(addr);
2921         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2922         vnn = find_public_ip_vnn(ctdb, &addr);
2923         if (vnn == NULL) {
2924                 switch (addr.sa.sa_family) {
2925                 case AF_INET:
2926                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2927                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2928                                         ctdb_addr_to_str(&addr)));
2929                         }
2930                         break;
2931                 case AF_INET6:
2932                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2933                                 ctdb_addr_to_str(&addr)));
2934                         break;
2935                 default:
2936                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2937                 }
2938
2939                 return 0;
2940         }
2941
2942         if (vnn->pnn != ctdb->pnn) {
2943                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2944                         ctdb_addr_to_str(&addr),
2945                         client_id, client->pid));
2946                 /* failing this call will tell smbd to die */
2947                 return -1;
2948         }
2949
2950         ip = talloc(client, struct ctdb_client_ip);
2951         CTDB_NO_MEMORY(ctdb, ip);
2952
2953         ip->ctdb      = ctdb;
2954         ip->addr      = addr;
2955         ip->client_id = client_id;
2956         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2957         DLIST_ADD(ctdb->client_ip_list, ip);
2958
2959         tcp = talloc(client, struct ctdb_tcp_list);
2960         CTDB_NO_MEMORY(ctdb, tcp);
2961
2962         tcp->connection.src_addr = tcp_sock->src;
2963         tcp->connection.dst_addr = tcp_sock->dest;
2964
2965         DLIST_ADD(client->tcp_list, tcp);
2966
2967         t.src_addr = tcp_sock->src;
2968         t.dst_addr = tcp_sock->dest;
2969
2970         data.dptr = (uint8_t *)&t;
2971         data.dsize = sizeof(t);
2972
2973         switch (addr.sa.sa_family) {
2974         case AF_INET:
2975                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2976                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2977                         ctdb_addr_to_str(&tcp_sock->src),
2978                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2979                 break;
2980         case AF_INET6:
2981                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2982                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2983                         ctdb_addr_to_str(&tcp_sock->src),
2984                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2985                 break;
2986         default:
2987                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2988         }
2989
2990
2991         /* tell all nodes about this tcp connection */
2992         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2993                                        CTDB_CONTROL_TCP_ADD,
2994                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2995         if (ret != 0) {
2996                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2997                 return -1;
2998         }
2999
3000         return 0;
3001 }
3002
3003 /*
3004   find a tcp address on a list
3005  */
3006 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3007                                            struct ctdb_tcp_connection *tcp)
3008 {
3009         int i;
3010
3011         if (array == NULL) {
3012                 return NULL;
3013         }
3014
3015         for (i=0;i<array->num;i++) {
3016                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3017                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3018                         return &array->connections[i];
3019                 }
3020         }
3021         return NULL;
3022 }
3023
3024
3025
3026 /*
3027   called by a daemon to inform us of a TCP connection that one of its
3028   clients managing that should tickled with an ACK when IP takeover is
3029   done
3030  */
3031 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3032 {
3033         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3034         struct ctdb_tcp_array *tcparray;
3035         struct ctdb_tcp_connection tcp;
3036         struct ctdb_vnn *vnn;
3037
3038         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3039         if (vnn == NULL) {
3040                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3041                         ctdb_addr_to_str(&p->dst_addr)));
3042
3043                 return -1;
3044         }
3045
3046
3047         tcparray = vnn->tcp_array;
3048
3049         /* If this is the first tickle */
3050         if (tcparray == NULL) {
3051                 tcparray = talloc_size(ctdb->nodes, 
3052                         offsetof(struct ctdb_tcp_array, connections) +
3053                         sizeof(struct ctdb_tcp_connection) * 1);
3054                 CTDB_NO_MEMORY(ctdb, tcparray);
3055                 vnn->tcp_array = tcparray;
3056
3057                 tcparray->num = 0;
3058                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3059                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3060
3061                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3062                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3063                 tcparray->num++;
3064
3065                 if (tcp_update_needed) {
3066                         vnn->tcp_update_needed = true;
3067                 }
3068                 return 0;
3069         }
3070
3071
3072         /* Do we already have this tickle ?*/
3073         tcp.src_addr = p->src_addr;
3074         tcp.dst_addr = p->dst_addr;
3075         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
3076                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3077                         ctdb_addr_to_str(&tcp.dst_addr),
3078                         ntohs(tcp.dst_addr.ip.sin_port),
3079                         vnn->pnn));
3080                 return 0;
3081         }
3082
3083         /* A new tickle, we must add it to the array */
3084         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3085                                         struct ctdb_tcp_connection,
3086                                         tcparray->num+1);
3087         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3088
3089         vnn->tcp_array = tcparray;
3090         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3091         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3092         tcparray->num++;
3093                                 
3094         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3095                 ctdb_addr_to_str(&tcp.dst_addr),
3096                 ntohs(tcp.dst_addr.ip.sin_port),
3097                 vnn->pnn));
3098
3099         if (tcp_update_needed) {
3100                 vnn->tcp_update_needed = true;
3101         }
3102
3103         return 0;
3104 }
3105
3106
3107 /*
3108   called by a daemon to inform us of a TCP connection that one of its
3109   clients managing that should tickled with an ACK when IP takeover is
3110   done
3111  */
3112 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3113 {
3114         struct ctdb_tcp_connection *tcpp;
3115         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3116
3117         if (vnn == NULL) {
3118                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3119                         ctdb_addr_to_str(&conn->dst_addr)));
3120                 return;
3121         }
3122
3123         /* if the array is empty we cant remove it
3124            and we dont need to do anything
3125          */
3126         if (vnn->tcp_array == NULL) {
3127                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3128                         ctdb_addr_to_str(&conn->dst_addr),
3129                         ntohs(conn->dst_addr.ip.sin_port)));
3130                 return;
3131         }
3132
3133
3134         /* See if we know this connection
3135            if we dont know this connection  then we dont need to do anything
3136          */
3137         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3138         if (tcpp == NULL) {
3139                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3140                         ctdb_addr_to_str(&conn->dst_addr),
3141                         ntohs(conn->dst_addr.ip.sin_port)));
3142                 return;
3143         }
3144
3145
3146         /* We need to remove this entry from the array.
3147            Instead of allocating a new array and copying data to it
3148            we cheat and just copy the last entry in the existing array
3149            to the entry that is to be removed and just shring the 
3150            ->num field
3151          */
3152         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3153         vnn->tcp_array->num--;
3154
3155         /* If we deleted the last entry we also need to remove the entire array
3156          */
3157         if (vnn->tcp_array->num == 0) {
3158                 talloc_free(vnn->tcp_array);
3159                 vnn->tcp_array = NULL;
3160         }               
3161
3162         vnn->tcp_update_needed = true;
3163
3164         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3165                 ctdb_addr_to_str(&conn->src_addr),
3166                 ntohs(conn->src_addr.ip.sin_port)));
3167 }
3168
3169
3170 /*
3171   called by a daemon to inform us of a TCP connection that one of its
3172   clients used are no longer needed in the tickle database
3173  */
3174 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3175 {
3176         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3177
3178         ctdb_remove_tcp_connection(ctdb, conn);
3179
3180         return 0;
3181 }
3182
3183
/*
  called when a daemon restarts.  The intention is to send all tickles
  for all public addresses we are serving to the new node right away.

  XXX not implemented yet: both arguments are ignored and 0 is
  returned unconditionally.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	(void)ctdb;
	(void)vnn;

	return 0;
}
3193
3194
3195 /*
3196   called when a client structure goes away - hook to remove
3197   elements from the tcp_list in all daemons
3198  */
3199 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3200 {
3201         while (client->tcp_list) {
3202                 struct ctdb_tcp_list *tcp = client->tcp_list;
3203                 DLIST_REMOVE(client->tcp_list, tcp);
3204                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3205         }
3206 }
3207
3208
3209 /*
3210   release all IPs on shutdown
3211  */
3212 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3213 {
3214         struct ctdb_vnn *vnn;
3215         int count = 0;
3216
3217         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3218                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3219                         ctdb_vnn_unassign_iface(ctdb, vnn);
3220                         continue;
3221                 }
3222                 if (!vnn->iface) {
3223                         continue;
3224                 }
3225
3226                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3227                                     ctdb_addr_to_str(&vnn->public_address),
3228                                     vnn->public_netmask_bits,
3229                                     ctdb_vnn_iface_string(vnn)));
3230
3231                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3232                                   ctdb_vnn_iface_string(vnn),
3233                                   ctdb_addr_to_str(&vnn->public_address),
3234                                   vnn->public_netmask_bits);
3235                 release_kill_clients(ctdb, &vnn->public_address);
3236                 ctdb_vnn_unassign_iface(ctdb, vnn);
3237                 count++;
3238         }
3239
3240         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3241 }
3242
3243
3244 /*
3245   get list of public IPs
3246  */
3247 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3248                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3249 {
3250         int i, num, len;
3251         struct ctdb_all_public_ips *ips;
3252         struct ctdb_vnn *vnn;
3253         bool only_available = false;
3254
3255         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3256                 only_available = true;
3257         }
3258
3259         /* count how many public ip structures we have */
3260         num = 0;
3261         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3262                 num++;
3263         }
3264
3265         len = offsetof(struct ctdb_all_public_ips, ips) + 
3266                 num*sizeof(struct ctdb_public_ip);
3267         ips = talloc_zero_size(outdata, len);
3268         CTDB_NO_MEMORY(ctdb, ips);
3269
3270         i = 0;
3271         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3272                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3273                         continue;
3274                 }
3275                 ips->ips[i].pnn  = vnn->pnn;
3276                 ips->ips[i].addr = vnn->public_address;
3277                 i++;
3278         }
3279         ips->num = i;
3280         len = offsetof(struct ctdb_all_public_ips, ips) +
3281                 i*sizeof(struct ctdb_public_ip);
3282
3283         outdata->dsize = len;
3284         outdata->dptr  = (uint8_t *)ips;
3285
3286         return 0;
3287 }
3288
3289
3290 /*
3291   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3292  */
3293 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3294                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3295 {
3296         int i, num, len;
3297         struct ctdb_all_public_ipsv4 *ips;
3298         struct ctdb_vnn *vnn;
3299
3300         /* count how many public ip structures we have */
3301         num = 0;
3302         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3303                 if (vnn->public_address.sa.sa_family != AF_INET) {
3304                         continue;
3305                 }
3306                 num++;
3307         }
3308
3309         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3310                 num*sizeof(struct ctdb_public_ipv4);
3311         ips = talloc_zero_size(outdata, len);
3312         CTDB_NO_MEMORY(ctdb, ips);
3313
3314         outdata->dsize = len;
3315         outdata->dptr  = (uint8_t *)ips;
3316
3317         ips->num = num;
3318         i = 0;
3319         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3320                 if (vnn->public_address.sa.sa_family != AF_INET) {
3321                         continue;
3322                 }
3323                 ips->ips[i].pnn = vnn->pnn;
3324                 ips->ips[i].sin = vnn->public_address.ip;
3325                 i++;
3326         }
3327
3328         return 0;
3329 }
3330
3331 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3332                                         struct ctdb_req_control *c,
3333                                         TDB_DATA indata,
3334                                         TDB_DATA *outdata)
3335 {
3336         int i, num, len;
3337         ctdb_sock_addr *addr;
3338         struct ctdb_control_public_ip_info *info;
3339         struct ctdb_vnn *vnn;
3340
3341         addr = (ctdb_sock_addr *)indata.dptr;
3342
3343         vnn = find_public_ip_vnn(ctdb, addr);
3344         if (vnn == NULL) {
3345                 /* if it is not a public ip   it could be our 'single ip' */
3346                 if (ctdb->single_ip_vnn) {
3347                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3348                                 vnn = ctdb->single_ip_vnn;
3349                         }
3350                 }
3351         }
3352         if (vnn == NULL) {
3353                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3354                                  "'%s'not a public address\n",
3355                                  ctdb_addr_to_str(addr)));
3356                 return -1;
3357         }
3358
3359         /* count how many public ip structures we have */
3360         num = 0;
3361         for (;vnn->ifaces[num];) {
3362                 num++;
3363         }
3364
3365         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3366                 num*sizeof(struct ctdb_control_iface_info);
3367         info = talloc_zero_size(outdata, len);
3368         CTDB_NO_MEMORY(ctdb, info);
3369
3370         info->ip.addr = vnn->public_address;
3371         info->ip.pnn = vnn->pnn;
3372         info->active_idx = 0xFFFFFFFF;
3373
3374         for (i=0; vnn->ifaces[i]; i++) {
3375                 struct ctdb_iface *cur;
3376
3377                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3378                 if (cur == NULL) {
3379                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3380                                            vnn->ifaces[i]));
3381                         return -1;
3382                 }
3383                 if (vnn->iface == cur) {
3384                         info->active_idx = i;
3385                 }
3386                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3387                 info->ifaces[i].link_state = cur->link_up;
3388                 info->ifaces[i].references = cur->references;
3389         }
3390         info->num = i;
3391         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3392                 i*sizeof(struct ctdb_control_iface_info);
3393
3394         outdata->dsize = len;
3395         outdata->dptr  = (uint8_t *)info;
3396
3397         return 0;
3398 }
3399
3400 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3401                                 struct ctdb_req_control *c,
3402                                 TDB_DATA *outdata)
3403 {
3404         int i, num, len;
3405         struct ctdb_control_get_ifaces *ifaces;
3406         struct ctdb_iface *cur;
3407
3408         /* count how many public ip structures we have */
3409         num = 0;
3410         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3411                 num++;
3412         }
3413
3414         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3415                 num*sizeof(struct ctdb_control_iface_info);
3416         ifaces = talloc_zero_size(outdata, len);
3417         CTDB_NO_MEMORY(ctdb, ifaces);
3418
3419         i = 0;
3420         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3421                 strcpy(ifaces->ifaces[i].name, cur->name);
3422                 ifaces->ifaces[i].link_state = cur->link_up;
3423                 ifaces->ifaces[i].references = cur->references;
3424                 i++;
3425         }
3426         ifaces->num = i;
3427         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3428                 i*sizeof(struct ctdb_control_iface_info);
3429
3430         outdata->dsize = len;
3431         outdata->dptr  = (uint8_t *)ifaces;
3432
3433         return 0;
3434 }
3435
3436 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3437                                     struct ctdb_req_control *c,
3438                                     TDB_DATA indata)
3439 {
3440         struct ctdb_control_iface_info *info;
3441         struct ctdb_iface *iface;
3442         bool link_up = false;
3443
3444         info = (struct ctdb_control_iface_info *)indata.dptr;
3445
3446         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3447                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3448                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3449                                   len, len, info->name));
3450                 return -1;
3451         }
3452
3453         switch (info->link_state) {
3454         case 0:
3455                 link_up = false;
3456                 break;
3457         case 1:
3458                 link_up = true;
3459                 break;
3460         default:
3461                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3462                                   (unsigned int)info->link_state));
3463                 return -1;
3464         }
3465
3466         if (info->references != 0) {
3467                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3468                                   (unsigned int)info->references));
3469                 return -1;
3470         }
3471
3472         iface = ctdb_find_iface(ctdb, info->name);
3473         if (iface == NULL) {
3474                 return -1;
3475         }
3476
3477         if (link_up == iface->link_up) {
3478                 return 0;
3479         }
3480
3481         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3482               ("iface[%s] has changed it's link status %s => %s\n",
3483                iface->name,
3484                iface->link_up?"up":"down",
3485                link_up?"up":"down"));
3486
3487         iface->link_up = link_up;
3488         return 0;
3489 }
3490
3491
3492 /* 
3493    structure containing the listening socket and the list of tcp connections
3494    that the ctdb daemon is to kill
3495 */
3496 struct ctdb_kill_tcp {
3497         struct ctdb_vnn *vnn;
3498         struct ctdb_context *ctdb;
3499         int capture_fd;
3500         struct fd_event *fde;
3501         trbt_tree_t *connections;
3502         void *private_data;
3503 };
3504
3505 /*
3506   a tcp connection that is to be killed
3507  */
3508 struct ctdb_killtcp_con {
3509         ctdb_sock_addr src_addr;
3510         ctdb_sock_addr dst_addr;
3511         int count;
3512         struct ctdb_kill_tcp *killtcp;
3513 };
3514
3515 /* this function is used to create a key to represent this socketpair
3516    in the killtcp tree.
3517    this key is used to insert and lookup matching socketpairs that are
3518    to be tickled and RST
3519 */
3520 #define KILLTCP_KEYLEN  10
3521 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3522 {
3523         static uint32_t key[KILLTCP_KEYLEN];
3524
3525         bzero(key, sizeof(key));
3526
3527         if (src->sa.sa_family != dst->sa.sa_family) {
3528                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3529                 return key;
3530         }
3531         
3532         switch (src->sa.sa_family) {
3533         case AF_INET:
3534                 key[0]  = dst->ip.sin_addr.s_addr;
3535                 key[1]  = src->ip.sin_addr.s_addr;
3536                 key[2]  = dst->ip.sin_port;
3537                 key[3]  = src->ip.sin_port;
3538                 break;
3539         case AF_INET6: {
3540                 uint32_t *dst6_addr32 =
3541                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3542                 uint32_t *src6_addr32 =
3543                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3544                 key[0]  = dst6_addr32[3];
3545                 key[1]  = src6_addr32[3];
3546                 key[2]  = dst6_addr32[2];
3547                 key[3]  = src6_addr32[2];
3548                 key[4]  = dst6_addr32[1];
3549                 key[5]  = src6_addr32[1];
3550                 key[6]  = dst6_addr32[0];
3551                 key[7]  = src6_addr32[0];
3552                 key[8]  = dst->ip6.sin6_port;
3553                 key[9]  = src->ip6.sin6_port;
3554                 break;
3555         }
3556         default:
3557                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3558                 return key;
3559         }
3560
3561         return key;
3562 }
3563
3564 /*
3565   called when we get a read event on the raw socket
3566  */
3567 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3568                                 uint16_t flags, void *private_data)
3569 {
3570         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3571         struct ctdb_killtcp_con *con;
3572         ctdb_sock_addr src, dst;
3573         uint32_t ack_seq, seq;
3574
3575         if (!(flags & EVENT_FD_READ)) {
3576                 return;
3577         }
3578
3579         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3580                                 killtcp->private_data,
3581                                 &src, &dst,
3582                                 &ack_seq, &seq) != 0) {
3583                 /* probably a non-tcp ACK packet */
3584                 return;
3585         }
3586
3587         /* check if we have this guy in our list of connections
3588            to kill
3589         */
3590         con = trbt_lookuparray32(killtcp->connections, 
3591                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3592         if (con == NULL) {
3593                 /* no this was some other packet we can just ignore */
3594                 return;
3595         }
3596
3597         /* This one has been tickled !
3598            now reset him and remove him from the list.
3599          */
3600         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3601                 ntohs(con->dst_addr.ip.sin_port),
3602                 ctdb_addr_to_str(&con->src_addr),
3603                 ntohs(con->src_addr.ip.sin_port)));
3604
3605         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3606         talloc_free(con);
3607 }
3608
3609
3610 /* when traversing the list of all tcp connections to send tickle acks to
3611    (so that we can capture the ack coming back and kill the connection
3612     by a RST)
3613    this callback is called for each connection we are currently trying to kill
3614 */
3615 static int tickle_connection_traverse(void *param, void *data)
3616 {
3617         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3618
3619         /* have tried too many times, just give up */
3620         if (con->count >= 5) {
3621                 /* can't delete in traverse: reparent to delete_cons */
3622                 talloc_steal(param, con);
3623                 return 0;
3624         }
3625
3626         /* othervise, try tickling it again */
3627         con->count++;
3628         ctdb_sys_send_tcp(
3629                 (ctdb_sock_addr *)&con->dst_addr,
3630                 (ctdb_sock_addr *)&con->src_addr,
3631                 0, 0, 0);
3632         return 0;
3633 }
3634
3635
3636 /* 
3637    called every second until all sentenced connections have been reset
3638  */
3639 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3640                                               struct timeval t, void *private_data)
3641 {
3642         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3643         void *delete_cons = talloc_new(NULL);
3644
3645         /* loop over all connections sending tickle ACKs */
3646         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3647
3648         /* now we've finished traverse, it's safe to do deletion. */
3649         talloc_free(delete_cons);
3650
3651         /* If there are no more connections to kill we can remove the
3652            entire killtcp structure
3653          */
3654         if ( (killtcp->connections == NULL) || 
3655              (killtcp->connections->root == NULL) ) {
3656                 talloc_free(killtcp);
3657                 return;
3658         }
3659
3660         /* try tickling them again in a seconds time
3661          */
3662         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3663                         ctdb_tickle_sentenced_connections, killtcp);
3664 }
3665
3666 /*
3667   destroy the killtcp structure
3668  */
3669 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3670 {
3671         struct ctdb_vnn *tmpvnn;
3672
3673         /* verify that this vnn is still active */
3674         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3675                 if (tmpvnn == killtcp->vnn) {
3676                         break;
3677                 }
3678         }
3679
3680         if (tmpvnn == NULL) {
3681                 return 0;
3682         }
3683
3684         if (killtcp->vnn->killtcp != killtcp) {
3685                 return 0;
3686         }
3687
3688         killtcp->vnn->killtcp = NULL;
3689
3690         return 0;
3691 }
3692
3693
/* insert callback for the killtcp tree: the new connection structure
   (parm) always wins over whatever was stored before (data).

   The old structure is deliberately not freed here; the original
   author notes it is talloc_stolen by the same tree node and goes away
   together with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
3705
3706 /*
3707   add a tcp socket to the list of connections we want to RST
3708  */
3709 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3710                                        ctdb_sock_addr *s,
3711                                        ctdb_sock_addr *d)
3712 {
3713         ctdb_sock_addr src, dst;
3714         struct ctdb_kill_tcp *killtcp;
3715         struct ctdb_killtcp_con *con;
3716         struct ctdb_vnn *vnn;
3717
3718         ctdb_canonicalize_ip(s, &src);
3719         ctdb_canonicalize_ip(d, &dst);
3720
3721         vnn = find_public_ip_vnn(ctdb, &dst);
3722         if (vnn == NULL) {
3723                 vnn = find_public_ip_vnn(ctdb, &src);
3724         }
3725         if (vnn == NULL) {
3726                 /* if it is not a public ip   it could be our 'single ip' */
3727                 if (ctdb->single_ip_vnn) {
3728                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3729                                 vnn = ctdb->single_ip_vnn;
3730                         }
3731                 }
3732         }
3733         if (vnn == NULL) {
3734                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3735                 return -1;
3736         }
3737
3738         killtcp = vnn->killtcp;
3739         
3740         /* If this is the first connection to kill we must allocate
3741            a new structure
3742          */
3743         if (killtcp == NULL) {
3744                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3745                 CTDB_NO_MEMORY(ctdb, killtcp);
3746
3747                 killtcp->vnn         = vnn;
3748                 killtcp->ctdb        = ctdb;
3749                 killtcp->capture_fd  = -1;
3750                 killtcp->connections = trbt_create(killtcp, 0);
3751
3752                 vnn->killtcp         = killtcp;
3753                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3754         }
3755
3756
3757
3758         /* create a structure that describes this connection we want to
3759            RST and store it in killtcp->connections
3760         */
3761         con = talloc(killtcp, struct ctdb_killtcp_con);
3762         CTDB_NO_MEMORY(ctdb, con);
3763         con->src_addr = src;
3764         con->dst_addr = dst;
3765         con->count    = 0;
3766         con->killtcp  = killtcp;
3767
3768
3769         trbt_insertarray32_callback(killtcp->connections,
3770                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3771                         add_killtcp_callback, con);
3772
3773         /* 
3774            If we dont have a socket to listen on yet we must create it
3775          */
3776         if (killtcp->capture_fd == -1) {
3777                 const char *iface = ctdb_vnn_iface_string(vnn);
3778                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3779                 if (killtcp->capture_fd == -1) {
3780                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3781                                           "socket on iface '%s' for killtcp (%s)\n",
3782                                           iface, strerror(errno)));
3783                         goto failed;
3784                 }
3785         }
3786
3787
3788         if (killtcp->fde == NULL) {
3789                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3790                                             EVENT_FD_READ,
3791                                             capture_tcp_handler, killtcp);
3792                 tevent_fd_set_auto_close(killtcp->fde);
3793
3794                 /* We also need to set up some events to tickle all these connections
3795                    until they are all reset
3796                 */
3797                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3798                                 ctdb_tickle_sentenced_connections, killtcp);
3799         }
3800
3801         /* tickle him once now */
3802         ctdb_sys_send_tcp(
3803                 &con->dst_addr,
3804                 &con->src_addr,
3805                 0, 0, 0);
3806
3807         return 0;
3808
3809 failed:
3810         talloc_free(vnn->killtcp);
3811         vnn->killtcp = NULL;
3812         return -1;
3813 }
3814
3815 /*
3816   kill a TCP connection.
3817  */
3818 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3819 {
3820         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3821
3822         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3823 }
3824
3825 /*
3826   called by a daemon to inform us of the entire list of TCP tickles for
3827   a particular public address.
3828   this control should only be sent by the node that is currently serving
3829   that public address.
3830  */
3831 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3832 {
3833         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3834         struct ctdb_tcp_array *tcparray;
3835         struct ctdb_vnn *vnn;
3836
3837         /* We must at least have tickles.num or else we cant verify the size
3838            of the received data blob
3839          */
3840         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3841                                         tickles.connections)) {
3842                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3843                 return -1;
3844         }
3845
3846         /* verify that the size of data matches what we expect */
3847         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3848                                 tickles.connections)
3849                          + sizeof(struct ctdb_tcp_connection)
3850                                  * list->tickles.num) {
3851                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3852                 return -1;
3853         }       
3854
3855         vnn = find_public_ip_vnn(ctdb, &list->addr);
3856         if (vnn == NULL) {
3857                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3858                         ctdb_addr_to_str(&list->addr)));
3859
3860                 return 1;
3861         }
3862
3863         /* remove any old ticklelist we might have */
3864         talloc_free(vnn->tcp_array);
3865         vnn->tcp_array = NULL;
3866
3867         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3868         CTDB_NO_MEMORY(ctdb, tcparray);
3869
3870         tcparray->num = list->tickles.num;
3871
3872         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3873         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3874
3875         memcpy(tcparray->connections, &list->tickles.connections[0], 
3876                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3877
3878         /* We now have a new fresh tickle list array for this vnn */
3879         vnn->tcp_array = talloc_steal(vnn, tcparray);
3880         
3881         return 0;
3882 }
3883
3884 /*
3885   called to return the full list of tickles for the puclic address associated 
3886   with the provided vnn
3887  */
3888 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3889 {
3890         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3891         struct ctdb_control_tcp_tickle_list *list;
3892         struct ctdb_tcp_array *tcparray;
3893         int num;
3894         struct ctdb_vnn *vnn;
3895
3896         vnn = find_public_ip_vnn(ctdb, addr);
3897         if (vnn == NULL) {
3898                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3899                         ctdb_addr_to_str(addr)));
3900
3901                 return 1;
3902         }
3903
3904         tcparray = vnn->tcp_array;
3905         if (tcparray) {
3906                 num = tcparray->num;
3907         } else {
3908                 num = 0;
3909         }
3910
3911         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3912                                 tickles.connections)
3913                         + sizeof(struct ctdb_tcp_connection) * num;
3914
3915         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3916         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3917         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3918
3919         list->addr = *addr;
3920         list->tickles.num = num;
3921         if (num) {
3922                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3923                         sizeof(struct ctdb_tcp_connection) * num);
3924         }
3925
3926         return 0;
3927 }
3928
3929
3930 /*
3931   set the list of all tcp tickles for a public address
3932  */
3933 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3934                               struct timeval timeout, uint32_t destnode, 
3935                               ctdb_sock_addr *addr,
3936                               struct ctdb_tcp_array *tcparray)
3937 {
3938         int ret, num;
3939         TDB_DATA data;
3940         struct ctdb_control_tcp_tickle_list *list;
3941
3942         if (tcparray) {
3943                 num = tcparray->num;
3944         } else {
3945                 num = 0;
3946         }
3947
3948         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3949                                 tickles.connections) +
3950                         sizeof(struct ctdb_tcp_connection) * num;
3951         data.dptr = talloc_size(ctdb, data.dsize);
3952         CTDB_NO_MEMORY(ctdb, data.dptr);
3953
3954         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3955         list->addr = *addr;
3956         list->tickles.num = num;
3957         if (tcparray) {
3958                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3959         }
3960
3961         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3962                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3963                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3964         if (ret != 0) {
3965                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3966                 return -1;
3967         }
3968
3969         talloc_free(data.dptr);
3970
3971         return ret;
3972 }
3973
3974
3975 /*
3976   perform tickle updates if required
3977  */
3978 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3979                                 struct timed_event *te, 
3980                                 struct timeval t, void *private_data)
3981 {
3982         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3983         int ret;
3984         struct ctdb_vnn *vnn;
3985
3986         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3987                 /* we only send out updates for public addresses that 
3988                    we have taken over
3989                  */
3990                 if (ctdb->pnn != vnn->pnn) {
3991                         continue;
3992                 }
3993                 /* We only send out the updates if we need to */
3994                 if (!vnn->tcp_update_needed) {
3995                         continue;
3996                 }
3997                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3998                                 TAKEOVER_TIMEOUT(),
3999                                 CTDB_BROADCAST_CONNECTED,
4000                                 &vnn->public_address,
4001                                 vnn->tcp_array);
4002                 if (ret != 0) {
4003                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4004                                 ctdb_addr_to_str(&vnn->public_address)));
4005                 }
4006         }
4007
4008         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4009                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4010                              ctdb_update_tcp_tickles, ctdb);
4011 }               
4012         
4013
4014 /*
4015   start periodic update of tcp tickles
4016  */
4017 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4018 {
4019         ctdb->tickle_update_context = talloc_new(ctdb);
4020
4021         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4022                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4023                              ctdb_update_tcp_tickles, ctdb);
4024 }
4025
4026
4027
4028
/* State for one scheduled series of gratuitous ARPs (see send_gratious_arp) */
struct control_gratious_arp {
        struct ctdb_context *ctdb;      /* daemon context, supplies the event loop */
        ctdb_sock_addr addr;            /* address passed to ctdb_sys_send_arp() */
        const char *iface;              /* interface name passed to ctdb_sys_send_arp() */
        int count;                      /* send attempts made so far */
};
4035
4036 /*
4037   send a control_gratuitous arp
4038  */
4039 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4040                                   struct timeval t, void *private_data)
4041 {
4042         int ret;
4043         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4044                                                         struct control_gratious_arp);
4045
4046         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4047         if (ret != 0) {
4048                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4049                                  arp->iface, strerror(errno)));
4050         }
4051
4052
4053         arp->count++;
4054         if (arp->count == CTDB_ARP_REPEAT) {
4055                 talloc_free(arp);
4056                 return;
4057         }
4058
4059         event_add_timed(arp->ctdb->ev, arp, 
4060                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4061                         send_gratious_arp, arp);
4062 }
4063
4064
4065 /*
4066   send a gratious arp 
4067  */
4068 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4069 {
4070         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4071         struct control_gratious_arp *arp;
4072
4073         /* verify the size of indata */
4074         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4075                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4076                                  (unsigned)indata.dsize, 
4077                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4078                 return -1;
4079         }
4080         if (indata.dsize != 
4081                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4082                 + gratious_arp->len ) ){
4083
4084                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4085                         "but should be %u bytes\n", 
4086                          (unsigned)indata.dsize, 
4087                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4088                 return -1;
4089         }
4090
4091
4092         arp = talloc(ctdb, struct control_gratious_arp);
4093         CTDB_NO_MEMORY(ctdb, arp);
4094
4095         arp->ctdb  = ctdb;
4096         arp->addr   = gratious_arp->addr;
4097         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4098         CTDB_NO_MEMORY(ctdb, arp->iface);
4099         arp->count = 0;
4100         
4101         event_add_timed(arp->ctdb->ev, arp, 
4102                         timeval_zero(), send_gratious_arp, arp);
4103
4104         return 0;
4105 }
4106
4107 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4108 {
4109         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4110         int ret;
4111
4112         /* verify the size of indata */
4113         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4114                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4115                 return -1;
4116         }
4117         if (indata.dsize != 
4118                 ( offsetof(struct ctdb_control_ip_iface, iface)
4119                 + pub->len ) ){
4120
4121                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4122                         "but should be %u bytes\n", 
4123                          (unsigned)indata.dsize, 
4124                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4125                 return -1;
4126         }
4127
4128         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4129
4130         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4131
4132         if (ret != 0) {
4133                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4134                 return -1;
4135         }
4136
4137         return 0;
4138 }
4139
4140 /*
4141   called when releaseip event finishes for del_public_address
4142  */
4143 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
4144                                 void *private_data)
4145 {
4146         talloc_free(private_data);
4147 }
4148
4149 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4150 {
4151         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4152         struct ctdb_vnn *vnn;
4153         int ret;
4154
4155         /* verify the size of indata */
4156         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4157                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4158                 return -1;
4159         }
4160         if (indata.dsize != 
4161                 ( offsetof(struct ctdb_control_ip_iface, iface)
4162                 + pub->len ) ){
4163
4164                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4165                         "but should be %u bytes\n", 
4166                          (unsigned)indata.dsize, 
4167                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4168                 return -1;
4169         }
4170
4171         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4172
4173         /* walk over all public addresses until we find a match */
4174         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4175                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4176                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4177
4178                         DLIST_REMOVE(ctdb->vnn, vnn);
4179                         talloc_steal(mem_ctx, vnn);
4180                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
4181                         if (vnn->pnn != ctdb->pnn) {
4182                                 if (vnn->iface != NULL) {
4183                                         ctdb_vnn_unassign_iface(ctdb, vnn);
4184                                 }
4185                                 talloc_free(mem_ctx);
4186                                 return 0;
4187                         }
4188                         vnn->pnn = -1;
4189
4190                         ret = ctdb_event_script_callback(ctdb, 
4191                                          mem_ctx, delete_ip_callback, mem_ctx,
4192                                          CTDB_EVENT_RELEASE_IP,
4193                                          "%s %s %u",
4194                                          ctdb_vnn_iface_string(vnn),
4195                                          ctdb_addr_to_str(&vnn->public_address),
4196                                          vnn->public_netmask_bits);
4197                         if (vnn->iface != NULL) {
4198                                 ctdb_vnn_unassign_iface(ctdb, vnn);
4199                         }
4200                         if (ret != 0) {
4201                                 return -1;
4202                         }
4203                         return 0;
4204                 }
4205         }
4206
4207         return -1;
4208 }
4209
4210
/* Private data handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
        struct ctdb_req_control *c;     /* control request answered when the event finishes */
};
4214
4215 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4216                                         int status, void *p)
4217 {
4218         struct ipreallocated_callback_state *state =
4219                 talloc_get_type(p, struct ipreallocated_callback_state);
4220
4221         if (status != 0) {
4222                 DEBUG(DEBUG_ERR,
4223                       (" \"ipreallocated\" event script failed (status %d)\n",
4224                        status));
4225                 if (status == -ETIME) {
4226                         ctdb_ban_self(ctdb);
4227                 }
4228         }
4229
4230         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4231         talloc_free(state);
4232 }
4233
4234 /* A control to run the ipreallocated event */
4235 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4236                                    struct ctdb_req_control *c,
4237                                    bool *async_reply)
4238 {
4239         int ret;
4240         struct ipreallocated_callback_state *state;
4241
4242         state = talloc(ctdb, struct ipreallocated_callback_state);
4243         CTDB_NO_MEMORY(ctdb, state);
4244
4245         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4246
4247         ret = ctdb_event_script_callback(ctdb, state,
4248                                          ctdb_ipreallocated_callback, state,
4249                                          CTDB_EVENT_IPREALLOCATED,
4250                                          "%s", "");
4251
4252         if (ret != 0) {
4253                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4254                 talloc_free(state);
4255                 return -1;
4256         }
4257
4258         /* tell the control that we will be reply asynchronously */
4259         state->c    = talloc_steal(state, c);
4260         *async_reply = true;
4261
4262         return 0;
4263 }
4264
4265
4266 /* This function is called from the recovery daemon to verify that a remote
4267    node has the expected ip allocation.
4268    This is verified against ctdb->ip_tree
4269 */
4270 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4271                                 struct ctdb_all_public_ips *ips,
4272                                 uint32_t pnn)
4273 {
4274         struct ctdb_public_ip_list *tmp_ip; 
4275         int i;
4276
4277         if (ctdb->ip_tree == NULL) {
4278                 /* dont know the expected allocation yet, assume remote node
4279                    is correct. */
4280                 return 0;
4281         }
4282
4283         if (ips == NULL) {
4284                 return 0;
4285         }
4286
4287         for (i=0; i<ips->num; i++) {
4288                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4289                 if (tmp_ip == NULL) {
4290                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4291                         return -1;
4292                 }
4293
4294                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4295                         continue;
4296                 }
4297
4298                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4299                         DEBUG(DEBUG_ERR,
4300                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4301                                pnn,
4302                                ctdb_addr_to_str(&ips->ips[i].addr),
4303                                ips->ips[i].pnn, tmp_ip->pnn));
4304                         return -1;
4305                 }
4306         }
4307
4308         return 0;
4309 }
4310
4311 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4312 {
4313         struct ctdb_public_ip_list *tmp_ip; 
4314
4315         if (ctdb->ip_tree == NULL) {
4316                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4317                 return -1;
4318         }
4319
4320         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4321         if (tmp_ip == NULL) {
4322                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4323                 return -1;
4324         }
4325
4326         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4327         tmp_ip->pnn = ip->pnn;
4328
4329         return 0;
4330 }
4331
4332
/* State the daemon keeps for one running reloadips request */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;      /* daemon context */
        struct ctdb_req_control *c;     /* request to reply to from the destructor; NULL if none */
        int status;                     /* status sent in the reply; starts out as -1 */
        int fd[2];                      /* pipe: the child writes its result to fd[1], the daemon reads fd[0] */
        pid_t child;                    /* pid of the forked reloadips child */
        struct fd_event *fde;           /* read event on fd[0] */
};
4341
4342 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4343 {
4344         if (h == h->ctdb->reload_ips) {
4345                 h->ctdb->reload_ips = NULL;
4346         }
4347         if (h->c != NULL) {
4348                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4349                 h->c = NULL;
4350         }
4351         ctdb_kill(h->ctdb, h->child, SIGKILL);
4352         return 0;
4353 }
4354
4355 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4356                                 struct timed_event *te,
4357                                 struct timeval t, void *private_data)
4358 {
4359         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4360
4361         talloc_free(h);
4362 }       
4363
4364 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4365                              uint16_t flags, void *private_data)
4366 {
4367         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4368
4369         char res;
4370         int ret;
4371
4372         ret = read(h->fd[0], &res, 1);
4373         if (ret < 1 || res != 0) {
4374                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4375                 res = 1;
4376         }
4377         h->status = res;
4378
4379         talloc_free(h);
4380 }
4381
4382 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4383 {
4384         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4385         struct ctdb_all_public_ips *ips;
4386         struct ctdb_vnn *vnn;
4387         struct client_async_data *async_data;
4388         struct timeval timeout;
4389         TDB_DATA data;
4390         struct ctdb_client_control_state *state;
4391         bool first_add;
4392         int i, ret;
4393
4394         CTDB_NO_MEMORY(ctdb, mem_ctx);
4395
4396         /* Read IPs from local node */
4397         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4398                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4399         if (ret != 0) {
4400                 DEBUG(DEBUG_ERR,
4401                       ("Unable to fetch public IPs from local node\n"));
4402                 talloc_free(mem_ctx);
4403                 return -1;
4404         }
4405
4406         /* Read IPs file - this is safe since this is a child process */
4407         ctdb->vnn = NULL;
4408         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4409                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4410                 talloc_free(mem_ctx);
4411                 return -1;
4412         }
4413
4414         async_data = talloc_zero(mem_ctx, struct client_async_data);
4415         CTDB_NO_MEMORY(ctdb, async_data);
4416
4417         /* Compare IPs between node and file for IPs to be deleted */
4418         for (i = 0; i < ips->num; i++) {
4419                 /* */
4420                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4421                         if (ctdb_same_ip(&vnn->public_address,
4422                                          &ips->ips[i].addr)) {
4423                                 /* IP is still in file */
4424                                 break;
4425                         }
4426                 }
4427
4428                 if (vnn == NULL) {
4429                         /* Delete IP ips->ips[i] */
4430                         struct ctdb_control_ip_iface *pub;
4431
4432                         DEBUG(DEBUG_NOTICE,
4433                               ("IP %s no longer configured, deleting it\n",
4434                                ctdb_addr_to_str(&ips->ips[i].addr)));
4435
4436                         pub = talloc_zero(mem_ctx,
4437                                           struct ctdb_control_ip_iface);
4438                         CTDB_NO_MEMORY(ctdb, pub);
4439
4440                         pub->addr  = ips->ips[i].addr;
4441                         pub->mask  = 0;
4442                         pub->len   = 0;
4443
4444                         timeout = TAKEOVER_TIMEOUT();
4445
4446                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4447                                               iface) + pub->len;
4448                         data.dptr = (uint8_t *)pub;
4449
4450                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4451                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4452                                                   0, data, async_data,
4453                                                   &timeout, NULL);
4454                         if (state == NULL) {
4455                                 DEBUG(DEBUG_ERR,
4456                                       (__location__
4457                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4458                                 goto failed;
4459                         }
4460
4461                         ctdb_client_async_add(async_data, state);
4462                 }
4463         }
4464
4465         /* Compare IPs between node and file for IPs to be added */
4466         first_add = true;
4467         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4468                 for (i = 0; i < ips->num; i++) {
4469                         if (ctdb_same_ip(&vnn->public_address,
4470                                          &ips->ips[i].addr)) {
4471                                 /* IP already on node */
4472                                 break;
4473                         }
4474                 }
4475                 if (i == ips->num) {
4476                         /* Add IP ips->ips[i] */
4477                         struct ctdb_control_ip_iface *pub;
4478                         const char *ifaces = NULL;
4479                         uint32_t len;
4480                         int iface = 0;
4481
4482                         DEBUG(DEBUG_NOTICE,
4483                               ("New IP %s configured, adding it\n",
4484                                ctdb_addr_to_str(&vnn->public_address)));
4485                         if (first_add) {
4486                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4487
4488                                 data.dsize = sizeof(pnn);
4489                                 data.dptr  = (uint8_t *)&pnn;
4490
4491                                 ret = ctdb_client_send_message(
4492                                         ctdb,
4493                                         CTDB_BROADCAST_CONNECTED,
4494                                         CTDB_SRVID_REBALANCE_NODE,
4495                                         data);
4496                                 if (ret != 0) {
4497                                         DEBUG(DEBUG_WARNING,
4498                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4499                                 }
4500
4501                                 first_add = false;
4502                         }
4503
4504                         ifaces = vnn->ifaces[0];
4505                         iface = 1;
4506                         while (vnn->ifaces[iface] != NULL) {
4507                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4508                                                          vnn->ifaces[iface]);
4509                                 iface++;
4510                         }
4511
4512                         len   = strlen(ifaces) + 1;
4513                         pub = talloc_zero_size(mem_ctx,
4514                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4515                         CTDB_NO_MEMORY(ctdb, pub);
4516
4517                         pub->addr  = vnn->public_address;
4518                         pub->mask  = vnn->public_netmask_bits;
4519                         pub->len   = len;
4520                         memcpy(&pub->iface[0], ifaces, pub->len);
4521
4522                         timeout = TAKEOVER_TIMEOUT();
4523
4524                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4525                                               iface) + pub->len;
4526                         data.dptr = (uint8_t *)pub;
4527
4528                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4529                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4530                                                   0, data, async_data,
4531                                                   &timeout, NULL);
4532                         if (state == NULL) {
4533                                 DEBUG(DEBUG_ERR,
4534                                       (__location__
4535                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4536                                 goto failed;
4537                         }
4538
4539                         ctdb_client_async_add(async_data, state);
4540                 }
4541         }
4542
4543         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4544                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4545                 goto failed;
4546         }
4547
4548         talloc_free(mem_ctx);
4549         return 0;
4550
4551 failed:
4552         talloc_free(mem_ctx);
4553         return -1;
4554 }
4555
4556 /* This control is sent to force the node to re-read the public addresses file
4557    and drop any addresses we should nnot longer host, and add new addresses
4558    that we are now able to host
4559 */
4560 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4561 {
4562         struct ctdb_reloadips_handle *h;
4563         pid_t parent = getpid();
4564
4565         if (ctdb->reload_ips != NULL) {
4566                 talloc_free(ctdb->reload_ips);
4567                 ctdb->reload_ips = NULL;
4568         }
4569
4570         h = talloc(ctdb, struct ctdb_reloadips_handle);
4571         CTDB_NO_MEMORY(ctdb, h);
4572         h->ctdb     = ctdb;
4573         h->c        = NULL;
4574         h->status   = -1;
4575         
4576         if (pipe(h->fd) == -1) {
4577                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4578                 talloc_free(h);
4579                 return -1;
4580         }
4581
4582         h->child = ctdb_fork(ctdb);
4583         if (h->child == (pid_t)-1) {
4584                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4585                 close(h->fd[0]);
4586                 close(h->fd[1]);
4587                 talloc_free(h);
4588                 return -1;
4589         }
4590
4591         /* child process */
4592         if (h->child == 0) {
4593                 signed char res = 0;
4594
4595                 close(h->fd[0]);
4596                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4597
4598                 ctdb_set_process_name("ctdb_reloadips");
4599                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4600                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4601                         res = -1;
4602                 } else {
4603                         res = ctdb_reloadips_child(ctdb);
4604                         if (res != 0) {
4605                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4606                         }
4607                 }
4608
4609                 write(h->fd[1], &res, 1);
4610                 /* make sure we die when our parent dies */
4611                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4612                         sleep(5);
4613                 }
4614                 _exit(0);
4615         }
4616
4617         h->c             = talloc_steal(h, c);
4618
4619         close(h->fd[1]);
4620         set_close_on_exec(h->fd[0]);
4621
4622         talloc_set_destructor(h, ctdb_reloadips_destructor);
4623
4624
4625         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4626                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4627                         (void *)h);
4628         tevent_fd_set_auto_close(h->fde);
4629
4630         event_add_timed(ctdb->ev, h,
4631                         timeval_current_ofs(120, 0),
4632                         ctdb_reloadips_timeout_event, h);
4633
4634         /* we reply later */
4635         *async_reply = true;
4636         return 0;
4637 }