ctdb-daemon: Check if updates are in flight when releasing all IPs
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout value built from the takeover_timeout tunable via
 * timeval_current_ofs().  The expansion refers to a variable named
 * "ctdb", so one must be in scope wherever this macro is used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument of timeval_current_ofs() when ctdb_control_send_arp()
 * re-arms itself (presumably seconds - confirm against
 * timeval_current_ofs()). */
#define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp() frees its state once it has run this many
 * times for one announcement. */
#define CTDB_ARP_REPEAT   3
35
/* Flags used in IP allocation algorithms.
 *
 * NOTE(review): none of these fields is referenced in the part of the
 * file visible here; the field names suggest per-node restrictions
 * plus the node's run state - confirm against the allocation code.
 */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
        enum ctdb_runstate runstate;
};
42
/*
  One local network interface known to the daemon; entries are linked
  into the ctdb->ifaces list.
 */
struct ctdb_iface {
        /* list links, maintained via DLIST_ADD/DLIST_REMOVE on ctdb->ifaces */
        struct ctdb_iface *prev, *next;
        /* interface name; a talloc'ed copy hanging off this structure */
        const char *name;
        /* set to true when the entry is created by ctdb_add_local_iface() */
        bool link_up;
        /* number of vnns currently assigned to this interface; adjusted by
         * ctdb_vnn_assign_iface() and ctdb_vnn_unassign_iface() */
        uint32_t references;
};
49
50 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
51 {
52         if (vnn->iface) {
53                 return vnn->iface->name;
54         }
55
56         return "__none__";
57 }
58
59 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
60 {
61         struct ctdb_iface *i;
62
63         /* Verify that we dont have an entry for this ip yet */
64         for (i=ctdb->ifaces;i;i=i->next) {
65                 if (strcmp(i->name, iface) == 0) {
66                         return 0;
67                 }
68         }
69
70         /* create a new structure for this interface */
71         i = talloc_zero(ctdb, struct ctdb_iface);
72         CTDB_NO_MEMORY_FATAL(ctdb, i);
73         i->name = talloc_strdup(i, iface);
74         CTDB_NO_MEMORY(ctdb, i->name);
75
76         i->link_up = true;
77
78         DLIST_ADD(ctdb->ifaces, i);
79
80         return 0;
81 }
82
83 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
84                                         const char *name)
85 {
86         int n;
87
88         for (n = 0; vnn->ifaces[n] != NULL; n++) {
89                 if (strcmp(name, vnn->ifaces[n]) == 0) {
90                         return true;
91                 }
92         }
93
94         return false;
95 }
96
97 /* If any interfaces now have no possible IPs then delete them.  This
98  * implementation is naive (i.e. simple) rather than clever
99  * (i.e. complex).  Given that this is run on delip and that operation
100  * is rare, this doesn't need to be efficient - it needs to be
101  * foolproof.  One alternative is reference counting, where the logic
102  * is distributed and can, therefore, be broken in multiple places.
103  * Another alternative is to build a red-black tree of interfaces that
104  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
105  * once) and then walking ctdb->ifaces once and deleting those not in
106  * the tree.  Let's go to one of those if the naive implementation
107  * causes problems...  :-)
108  */
109 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
110                                         struct ctdb_vnn *vnn)
111 {
112         struct ctdb_iface *i, *next;
113
114         /* For each interface, check if there's an IP using it. */
115         for (i = ctdb->ifaces; i != NULL; i = next) {
116                 struct ctdb_vnn *tv;
117                 bool found;
118                 next = i->next;
119
120                 /* Only consider interfaces named in the given VNN. */
121                 if (!vnn_has_interface_with_name(vnn, i->name)) {
122                         continue;
123                 }
124
125                 /* Is the "single IP" on this interface? */
126                 if ((ctdb->single_ip_vnn != NULL) &&
127                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
128                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
129                         /* Found, next interface please... */
130                         continue;
131                 }
132                 /* Search for a vnn with this interface. */
133                 found = false;
134                 for (tv=ctdb->vnn; tv; tv=tv->next) {
135                         if (vnn_has_interface_with_name(tv, i->name)) {
136                                 found = true;
137                                 break;
138                         }
139                 }
140
141                 if (!found) {
142                         /* None of the VNNs are using this interface. */
143                         DLIST_REMOVE(ctdb->ifaces, i);
144                         talloc_free(i);
145                 }
146         }
147 }
148
149
150 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
151                                           const char *iface)
152 {
153         struct ctdb_iface *i;
154
155         for (i=ctdb->ifaces;i;i=i->next) {
156                 if (strcmp(i->name, iface) == 0) {
157                         return i;
158                 }
159         }
160
161         return NULL;
162 }
163
164 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
165                                               struct ctdb_vnn *vnn)
166 {
167         int i;
168         struct ctdb_iface *cur = NULL;
169         struct ctdb_iface *best = NULL;
170
171         for (i=0; vnn->ifaces[i]; i++) {
172
173                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
174                 if (cur == NULL) {
175                         continue;
176                 }
177
178                 if (!cur->link_up) {
179                         continue;
180                 }
181
182                 if (best == NULL) {
183                         best = cur;
184                         continue;
185                 }
186
187                 if (cur->references < best->references) {
188                         best = cur;
189                         continue;
190                 }
191         }
192
193         return best;
194 }
195
196 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
197                                      struct ctdb_vnn *vnn)
198 {
199         struct ctdb_iface *best = NULL;
200
201         if (vnn->iface) {
202                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
203                                    "still assigned to iface '%s'\n",
204                                    ctdb_addr_to_str(&vnn->public_address),
205                                    ctdb_vnn_iface_string(vnn)));
206                 return 0;
207         }
208
209         best = ctdb_vnn_best_iface(ctdb, vnn);
210         if (best == NULL) {
211                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
212                                   "cannot assign to iface any iface\n",
213                                   ctdb_addr_to_str(&vnn->public_address)));
214                 return -1;
215         }
216
217         vnn->iface = best;
218         best->references++;
219         vnn->pnn = ctdb->pnn;
220
221         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
222                            "now assigned to iface '%s' refs[%d]\n",
223                            ctdb_addr_to_str(&vnn->public_address),
224                            ctdb_vnn_iface_string(vnn),
225                            best->references));
226         return 0;
227 }
228
229 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
230                                     struct ctdb_vnn *vnn)
231 {
232         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                            "now unassigned (old iface '%s' refs[%d])\n",
234                            ctdb_addr_to_str(&vnn->public_address),
235                            ctdb_vnn_iface_string(vnn),
236                            vnn->iface?vnn->iface->references:0));
237         if (vnn->iface) {
238                 vnn->iface->references--;
239         }
240         vnn->iface = NULL;
241         if (vnn->pnn == ctdb->pnn) {
242                 vnn->pnn = -1;
243         }
244 }
245
246 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
247                                struct ctdb_vnn *vnn)
248 {
249         int i;
250
251         if (vnn->delete_pending) {
252                 return false;
253         }
254
255         if (vnn->iface && vnn->iface->link_up) {
256                 return true;
257         }
258
259         for (i=0; vnn->ifaces[i]; i++) {
260                 struct ctdb_iface *cur;
261
262                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
263                 if (cur == NULL) {
264                         continue;
265                 }
266
267                 if (cur->link_up) {
268                         return true;
269                 }
270         }
271
272         return false;
273 }
274
/*
  State for one round of gratuitous ARP / tcp tickle announcements;
  created by ctdb_announce_vnn_iface() and consumed by
  ctdb_control_send_arp().
 */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        /* how many times ctdb_control_send_arp() has run for this state */
        uint32_t count;
        /* copy of the vnn's public address */
        ctdb_sock_addr addr;
        /* connections to send tickle acks for; taken over from
         * vnn->tcp_array, may be NULL */
        struct ctdb_tcp_array *tcparray;
        struct ctdb_vnn *vnn;
};
282
283
/*
  lists of tcp endpoints

  NOTE(review): not referenced in the part of the file visible here;
  prev/next presumably serve the DLIST_* macros like the other list
  structures in this file - confirm.
 */
struct ctdb_tcp_list {
        struct ctdb_tcp_list *prev, *next;
        struct ctdb_tcp_connection connection;
};
291
/*
  list of clients to kill on IP release

  release_kill_clients() walks ctdb->client_ip_list and, for every
  entry whose addr matches the released IP, looks up the client via
  client_id.
 */
struct ctdb_client_ip {
        struct ctdb_client_ip *prev, *next;
        struct ctdb_context *ctdb;
        /* address the client is registered with */
        ctdb_sock_addr addr;
        /* request id used to find the struct ctdb_client */
        uint32_t client_id;
};
301
302
303 /*
304   send a gratuitous arp
305  */
306 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
307                                   struct timeval t, void *private_data)
308 {
309         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
310                                                         struct ctdb_takeover_arp);
311         int i, ret;
312         struct ctdb_tcp_array *tcparray;
313         const char *iface = ctdb_vnn_iface_string(arp->vnn);
314
315         ret = ctdb_sys_send_arp(&arp->addr, iface);
316         if (ret != 0) {
317                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
318                                   iface, strerror(errno)));
319         }
320
321         tcparray = arp->tcparray;
322         if (tcparray) {
323                 for (i=0;i<tcparray->num;i++) {
324                         struct ctdb_tcp_connection *tcon;
325
326                         tcon = &tcparray->connections[i];
327                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
328                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
329                                 ctdb_addr_to_str(&tcon->src_addr),
330                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
331                         ret = ctdb_sys_send_tcp(
332                                 &tcon->src_addr, 
333                                 &tcon->dst_addr,
334                                 0, 0, 0);
335                         if (ret != 0) {
336                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
337                                         ctdb_addr_to_str(&tcon->src_addr)));
338                         }
339                 }
340         }
341
342         arp->count++;
343
344         if (arp->count == CTDB_ARP_REPEAT) {
345                 talloc_free(arp);
346                 return;
347         }
348
349         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
350                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
351                         ctdb_control_send_arp, arp);
352 }
353
354 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
355                                        struct ctdb_vnn *vnn)
356 {
357         struct ctdb_takeover_arp *arp;
358         struct ctdb_tcp_array *tcparray;
359
360         if (!vnn->takeover_ctx) {
361                 vnn->takeover_ctx = talloc_new(vnn);
362                 if (!vnn->takeover_ctx) {
363                         return -1;
364                 }
365         }
366
367         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
368         if (!arp) {
369                 return -1;
370         }
371
372         arp->ctdb = ctdb;
373         arp->addr = vnn->public_address;
374         arp->vnn  = vnn;
375
376         tcparray = vnn->tcp_array;
377         if (tcparray) {
378                 /* add all of the known tcp connections for this IP to the
379                    list of tcp connections to send tickle acks for */
380                 arp->tcparray = talloc_steal(arp, tcparray);
381
382                 vnn->tcp_array = NULL;
383                 vnn->tcp_update_needed = true;
384         }
385
386         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
387                         timeval_zero(), ctdb_control_send_arp, arp);
388
389         return 0;
390 }
391
/*
  Private data handed to release_ip_callback().
 */
struct takeover_callback_state {
        /* control request the callback replies to */
        struct ctdb_req_control *c;
        /* address being released */
        ctdb_sock_addr *addr;
        /* NOTE(review): not used in the part of the file visible here;
         * presumably the vnn owning addr - confirm */
        struct ctdb_vnn *vnn;
};
397
/*
  Private data carried from ctdb_do_takeip() to
  ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
        /* control request to reply to once the event has finished */
        struct ctdb_req_control *c;
        /* vnn being taken over */
        struct ctdb_vnn *vnn;
};
402
403 /*
404   called when takeip event finishes
405  */
406 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
407                                     void *private_data)
408 {
409         struct ctdb_do_takeip_state *state =
410                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
411         int32_t ret;
412         TDB_DATA data;
413
414         if (status != 0) {
415                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
416         
417                 if (status == -ETIME) {
418                         ctdb_ban_self(ctdb);
419                 }
420                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
421                                  ctdb_addr_to_str(&state->vnn->public_address),
422                                  ctdb_vnn_iface_string(state->vnn)));
423                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
424
425                 node->flags |= NODE_FLAGS_UNHEALTHY;
426                 talloc_free(state);
427                 return;
428         }
429
430         if (ctdb->do_checkpublicip) {
431
432         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
433         if (ret != 0) {
434                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
435                 talloc_free(state);
436                 return;
437         }
438
439         }
440
441         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
442         data.dsize = strlen((char *)data.dptr) + 1;
443         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
444
445         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
446
447
448         /* the control succeeded */
449         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
450         talloc_free(state);
451         return;
452 }
453
454 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
455 {
456         state->vnn->update_in_flight = false;
457         return 0;
458 }
459
460 /*
461   take over an ip address
462  */
463 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
464                               struct ctdb_req_control *c,
465                               struct ctdb_vnn *vnn)
466 {
467         int ret;
468         struct ctdb_do_takeip_state *state;
469
470         if (vnn->update_in_flight) {
471                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
472                                     "update for this IP already in flight\n",
473                                     ctdb_addr_to_str(&vnn->public_address),
474                                     vnn->public_netmask_bits));
475                 return -1;
476         }
477
478         ret = ctdb_vnn_assign_iface(ctdb, vnn);
479         if (ret != 0) {
480                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
481                                  "assign a usable interface\n",
482                                  ctdb_addr_to_str(&vnn->public_address),
483                                  vnn->public_netmask_bits));
484                 return -1;
485         }
486
487         state = talloc(vnn, struct ctdb_do_takeip_state);
488         CTDB_NO_MEMORY(ctdb, state);
489
490         state->c = talloc_steal(ctdb, c);
491         state->vnn   = vnn;
492
493         vnn->update_in_flight = true;
494         talloc_set_destructor(state, ctdb_takeip_destructor);
495
496         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
497                             ctdb_addr_to_str(&vnn->public_address),
498                             vnn->public_netmask_bits,
499                             ctdb_vnn_iface_string(vnn)));
500
501         ret = ctdb_event_script_callback(ctdb,
502                                          state,
503                                          ctdb_do_takeip_callback,
504                                          state,
505                                          CTDB_EVENT_TAKE_IP,
506                                          "%s %s %u",
507                                          ctdb_vnn_iface_string(vnn),
508                                          ctdb_addr_to_str(&vnn->public_address),
509                                          vnn->public_netmask_bits);
510
511         if (ret != 0) {
512                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
513                         ctdb_addr_to_str(&vnn->public_address),
514                         ctdb_vnn_iface_string(vnn)));
515                 talloc_free(state);
516                 return -1;
517         }
518
519         return 0;
520 }
521
/*
  Private data carried from ctdb_do_updateip() to
  ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
        /* control request to reply to once the event has finished */
        struct ctdb_req_control *c;
        /* interface the vnn was assigned to before the move; restored
         * by the callback if the event fails */
        struct ctdb_iface *old;
        /* vnn being moved */
        struct ctdb_vnn *vnn;
};
527
528 /*
529   called when updateip event finishes
530  */
531 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
532                                       void *private_data)
533 {
534         struct ctdb_do_updateip_state *state =
535                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
536         int32_t ret;
537
538         if (status != 0) {
539                 if (status == -ETIME) {
540                         ctdb_ban_self(ctdb);
541                 }
542                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
543                         ctdb_addr_to_str(&state->vnn->public_address),
544                         state->old->name,
545                         ctdb_vnn_iface_string(state->vnn)));
546
547                 /*
548                  * All we can do is reset the old interface
549                  * and let the next run fix it
550                  */
551                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
552                 state->vnn->iface = state->old;
553                 state->vnn->iface->references++;
554
555                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
556                 talloc_free(state);
557                 return;
558         }
559
560         if (ctdb->do_checkpublicip) {
561
562         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
563         if (ret != 0) {
564                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
565                 talloc_free(state);
566                 return;
567         }
568
569         }
570
571         /* the control succeeded */
572         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
573         talloc_free(state);
574         return;
575 }
576
577 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
578 {
579         state->vnn->update_in_flight = false;
580         return 0;
581 }
582
583 /*
584   update (move) an ip address
585  */
586 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
587                                 struct ctdb_req_control *c,
588                                 struct ctdb_vnn *vnn)
589 {
590         int ret;
591         struct ctdb_do_updateip_state *state;
592         struct ctdb_iface *old = vnn->iface;
593         const char *new_name;
594
595         if (vnn->update_in_flight) {
596                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
597                                     "update for this IP already in flight\n",
598                                     ctdb_addr_to_str(&vnn->public_address),
599                                     vnn->public_netmask_bits));
600                 return -1;
601         }
602
603         ctdb_vnn_unassign_iface(ctdb, vnn);
604         ret = ctdb_vnn_assign_iface(ctdb, vnn);
605         if (ret != 0) {
606                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
607                                  "assin a usable interface (old iface '%s')\n",
608                                  ctdb_addr_to_str(&vnn->public_address),
609                                  vnn->public_netmask_bits,
610                                  old->name));
611                 return -1;
612         }
613
614         new_name = ctdb_vnn_iface_string(vnn);
615         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
616                 /* A benign update from one interface onto itself.
617                  * no need to run the eventscripts in this case, just return
618                  * success.
619                  */
620                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
621                 return 0;
622         }
623
624         state = talloc(vnn, struct ctdb_do_updateip_state);
625         CTDB_NO_MEMORY(ctdb, state);
626
627         state->c = talloc_steal(ctdb, c);
628         state->old = old;
629         state->vnn = vnn;
630
631         vnn->update_in_flight = true;
632         talloc_set_destructor(state, ctdb_updateip_destructor);
633
634         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
635                             "interface %s to %s\n",
636                             ctdb_addr_to_str(&vnn->public_address),
637                             vnn->public_netmask_bits,
638                             old->name,
639                             new_name));
640
641         ret = ctdb_event_script_callback(ctdb,
642                                          state,
643                                          ctdb_do_updateip_callback,
644                                          state,
645                                          CTDB_EVENT_UPDATE_IP,
646                                          "%s %s %s %u",
647                                          state->old->name,
648                                          new_name,
649                                          ctdb_addr_to_str(&vnn->public_address),
650                                          vnn->public_netmask_bits);
651         if (ret != 0) {
652                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
653                                  ctdb_addr_to_str(&vnn->public_address),
654                                  old->name, new_name));
655                 talloc_free(state);
656                 return -1;
657         }
658
659         return 0;
660 }
661
662 /*
663   Find the vnn of the node that has a public ip address
664   returns -1 if the address is not known as a public address
665  */
666 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
667 {
668         struct ctdb_vnn *vnn;
669
670         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
671                 if (ctdb_same_ip(&vnn->public_address, addr)) {
672                         return vnn;
673                 }
674         }
675
676         return NULL;
677 }
678
679 /*
680   take over an ip address
681  */
682 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
683                                  struct ctdb_req_control *c,
684                                  TDB_DATA indata,
685                                  bool *async_reply)
686 {
687         int ret;
688         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
689         struct ctdb_vnn *vnn;
690         bool have_ip = false;
691         bool do_updateip = false;
692         bool do_takeip = false;
693         struct ctdb_iface *best_iface = NULL;
694
695         if (pip->pnn != ctdb->pnn) {
696                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
697                                  "with pnn %d, but we're node %d\n",
698                                  ctdb_addr_to_str(&pip->addr),
699                                  pip->pnn, ctdb->pnn));
700                 return -1;
701         }
702
703         /* update out vnn list */
704         vnn = find_public_ip_vnn(ctdb, &pip->addr);
705         if (vnn == NULL) {
706                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
707                         ctdb_addr_to_str(&pip->addr)));
708                 return 0;
709         }
710
711         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
712                 have_ip = ctdb_sys_have_ip(&pip->addr);
713         }
714         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
715         if (best_iface == NULL) {
716                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
717                                  "a usable interface (old %s, have_ip %d)\n",
718                                  ctdb_addr_to_str(&vnn->public_address),
719                                  vnn->public_netmask_bits,
720                                  ctdb_vnn_iface_string(vnn),
721                                  have_ip));
722                 return -1;
723         }
724
725         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
726                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
727                 have_ip = false;
728         }
729
730
731         if (vnn->iface == NULL && have_ip) {
732                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
733                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
734                                  ctdb_addr_to_str(&vnn->public_address)));
735                 return 0;
736         }
737
738         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
739                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
740                                   "and we have it on iface[%s], but it was assigned to node %d"
741                                   "and we are node %d, banning ourself\n",
742                                  ctdb_addr_to_str(&vnn->public_address),
743                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
744                 ctdb_ban_self(ctdb);
745                 return -1;
746         }
747
748         if (vnn->pnn == -1 && have_ip) {
749                 vnn->pnn = ctdb->pnn;
750                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
751                                   "and we already have it on iface[%s], update local daemon\n",
752                                  ctdb_addr_to_str(&vnn->public_address),
753                                   ctdb_vnn_iface_string(vnn)));
754                 return 0;
755         }
756
757         if (vnn->iface) {
758                 if (vnn->iface != best_iface) {
759                         if (!vnn->iface->link_up) {
760                                 do_updateip = true;
761                         } else if (vnn->iface->references > (best_iface->references + 1)) {
762                                 /* only move when the rebalance gains something */
763                                         do_updateip = true;
764                         }
765                 }
766         }
767
768         if (!have_ip) {
769                 if (do_updateip) {
770                         ctdb_vnn_unassign_iface(ctdb, vnn);
771                         do_updateip = false;
772                 }
773                 do_takeip = true;
774         }
775
776         if (do_takeip) {
777                 ret = ctdb_do_takeip(ctdb, c, vnn);
778                 if (ret != 0) {
779                         return -1;
780                 }
781         } else if (do_updateip) {
782                 ret = ctdb_do_updateip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else {
787                 /*
788                  * The interface is up and the kernel known the ip
789                  * => do nothing
790                  */
791                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
792                         ctdb_addr_to_str(&pip->addr),
793                         vnn->public_netmask_bits,
794                         ctdb_vnn_iface_string(vnn)));
795                 return 0;
796         }
797
798         /* tell ctdb_control.c that we will be replying asynchronously */
799         *async_reply = true;
800
801         return 0;
802 }
803
804 /*
805   kill any clients that are registered with a IP that is being released
806  */
807 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
808 {
809         struct ctdb_client_ip *ip;
810
811         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
812                 ctdb_addr_to_str(addr)));
813
814         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
815                 ctdb_sock_addr tmp_addr;
816
817                 tmp_addr = ip->addr;
818                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
819                         ip->client_id,
820                         ctdb_addr_to_str(&ip->addr)));
821
822                 if (ctdb_same_ip(&tmp_addr, addr)) {
823                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
824                                                                      ip->client_id, 
825                                                                      struct ctdb_client);
826                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
827                                 ip->client_id,
828                                 ctdb_addr_to_str(&ip->addr),
829                                 client->pid));
830
831                         if (client->pid != 0) {
832                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
833                                         (unsigned)client->pid,
834                                         ctdb_addr_to_str(addr),
835                                         ip->client_id));
836                                 kill(client->pid, SIGKILL);
837                         }
838                 }
839         }
840 }
841
842 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
843 {
844         DLIST_REMOVE(ctdb->vnn, vnn);
845         ctdb_vnn_unassign_iface(ctdb, vnn);
846         ctdb_remove_orphaned_ifaces(ctdb, vnn);
847         talloc_free(vnn);
848 }
849
850 /*
851   called when releaseip event finishes
852  */
853 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
854                                 void *private_data)
855 {
856         struct takeover_callback_state *state = 
857                 talloc_get_type(private_data, struct takeover_callback_state);
858         TDB_DATA data;
859
860         if (status == -ETIME) {
861                 ctdb_ban_self(ctdb);
862         }
863
864         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
865                 if  (ctdb_sys_have_ip(state->addr)) {
866                         DEBUG(DEBUG_ERR,
867                               ("IP %s still hosted during release IP callback, failing\n",
868                                ctdb_addr_to_str(state->addr)));
869                         ctdb_request_control_reply(ctdb, state->c,
870                                                    NULL, -1, NULL);
871                         talloc_free(state);
872                         return;
873                 }
874         }
875
876         /* send a message to all clients of this node telling them
877            that the cluster has been reconfigured and they should
878            release any sockets on this IP */
879         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
880         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
881         data.dsize = strlen((char *)data.dptr)+1;
882
883         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
884
885         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
886
887         /* kill clients that have registered with this IP */
888         release_kill_clients(ctdb, state->addr);
889
890         ctdb_vnn_unassign_iface(ctdb, state->vnn);
891
892         /* Process the IP if it has been marked for deletion */
893         if (state->vnn->delete_pending) {
894                 do_delete_ip(ctdb, state->vnn);
895                 state->vnn = NULL;
896         }
897
898         /* the control succeeded */
899         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
900         talloc_free(state);
901 }
902
903 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
904 {
905         if (state->vnn != NULL) {
906                 state->vnn->update_in_flight = false;
907         }
908         return 0;
909 }
910
911 /*
912   release an ip address
913  */
914 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
915                                 struct ctdb_req_control *c,
916                                 TDB_DATA indata, 
917                                 bool *async_reply)
918 {
919         int ret;
920         struct takeover_callback_state *state;
921         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
922         struct ctdb_vnn *vnn;
923         char *iface;
924
925         /* update our vnn list */
926         vnn = find_public_ip_vnn(ctdb, &pip->addr);
927         if (vnn == NULL) {
928                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
929                         ctdb_addr_to_str(&pip->addr)));
930                 return 0;
931         }
932         vnn->pnn = pip->pnn;
933
934         /* stop any previous arps */
935         talloc_free(vnn->takeover_ctx);
936         vnn->takeover_ctx = NULL;
937
938         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
939          * lazy multicast to drop an IP from any node that isn't the
940          * intended new node.  The following causes makes ctdbd ignore
941          * a release for any address it doesn't host.
942          */
943         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
944                 if (!ctdb_sys_have_ip(&pip->addr)) {
945                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
946                                 ctdb_addr_to_str(&pip->addr),
947                                 vnn->public_netmask_bits,
948                                 ctdb_vnn_iface_string(vnn)));
949                         ctdb_vnn_unassign_iface(ctdb, vnn);
950                         return 0;
951                 }
952         } else {
953                 if (vnn->iface == NULL) {
954                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
955                                            ctdb_addr_to_str(&pip->addr),
956                                            vnn->public_netmask_bits));
957                         return 0;
958                 }
959         }
960
961         /* There is a potential race between take_ip and us because we
962          * update the VNN via a callback that run when the
963          * eventscripts have been run.  Avoid the race by allowing one
964          * update to be in flight at a time.
965          */
966         if (vnn->update_in_flight) {
967                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
968                                     "update for this IP already in flight\n",
969                                     ctdb_addr_to_str(&vnn->public_address),
970                                     vnn->public_netmask_bits));
971                 return -1;
972         }
973
974         iface = strdup(ctdb_vnn_iface_string(vnn));
975
976         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
977                 ctdb_addr_to_str(&pip->addr),
978                 vnn->public_netmask_bits,
979                 iface,
980                 pip->pnn));
981
982         state = talloc(ctdb, struct takeover_callback_state);
983         if (state == NULL) {
984                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
985                                __FILE__, __LINE__);
986                 free(iface);
987                 return -1;
988         }
989
990         state->c = talloc_steal(state, c);
991         state->addr = talloc(state, ctdb_sock_addr);       
992         if (state->addr == NULL) {
993                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
994                                __FILE__, __LINE__);
995                 free(iface);
996                 talloc_free(state);
997                 return -1;
998         }
999         *state->addr = pip->addr;
1000         state->vnn   = vnn;
1001
1002         vnn->update_in_flight = true;
1003         talloc_set_destructor(state, ctdb_releaseip_destructor);
1004
1005         ret = ctdb_event_script_callback(ctdb, 
1006                                          state, release_ip_callback, state,
1007                                          CTDB_EVENT_RELEASE_IP,
1008                                          "%s %s %u",
1009                                          iface,
1010                                          ctdb_addr_to_str(&pip->addr),
1011                                          vnn->public_netmask_bits);
1012         free(iface);
1013         if (ret != 0) {
1014                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1015                         ctdb_addr_to_str(&pip->addr),
1016                         ctdb_vnn_iface_string(vnn)));
1017                 talloc_free(state);
1018                 return -1;
1019         }
1020
1021         /* tell the control that we will be reply asynchronously */
1022         *async_reply = true;
1023         return 0;
1024 }
1025
1026 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1027                                    ctdb_sock_addr *addr,
1028                                    unsigned mask, const char *ifaces,
1029                                    bool check_address)
1030 {
1031         struct ctdb_vnn      *vnn;
1032         uint32_t num = 0;
1033         char *tmp;
1034         const char *iface;
1035         int i;
1036         int ret;
1037
1038         tmp = strdup(ifaces);
1039         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1040                 if (!ctdb_sys_check_iface_exists(iface)) {
1041                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1042                         free(tmp);
1043                         return -1;
1044                 }
1045         }
1046         free(tmp);
1047
1048         /* Verify that we dont have an entry for this ip yet */
1049         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1050                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1051                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1052                                 ctdb_addr_to_str(addr)));
1053                         return -1;
1054                 }               
1055         }
1056
1057         /* create a new vnn structure for this ip address */
1058         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1059         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1060         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1061         tmp = talloc_strdup(vnn, ifaces);
1062         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1063         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1064                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1065                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1066                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1067                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1068                 num++;
1069         }
1070         talloc_free(tmp);
1071         vnn->ifaces[num] = NULL;
1072         vnn->public_address      = *addr;
1073         vnn->public_netmask_bits = mask;
1074         vnn->pnn                 = -1;
1075         if (check_address) {
1076                 if (ctdb_sys_have_ip(addr)) {
1077                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1078                         vnn->pnn = ctdb->pnn;
1079                 }
1080         }
1081
1082         for (i=0; vnn->ifaces[i]; i++) {
1083                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1084                 if (ret != 0) {
1085                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1086                                            "for public_address[%s]\n",
1087                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1088                         talloc_free(vnn);
1089                         return -1;
1090                 }
1091         }
1092
1093         DLIST_ADD(ctdb->vnn, vnn);
1094
1095         return 0;
1096 }
1097
1098 /*
1099   setup the public address lists from a file
1100 */
1101 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1102 {
1103         char **lines;
1104         int nlines;
1105         int i;
1106
1107         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1108         if (lines == NULL) {
1109                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1110                 return -1;
1111         }
1112         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1113                 nlines--;
1114         }
1115
1116         for (i=0;i<nlines;i++) {
1117                 unsigned mask;
1118                 ctdb_sock_addr addr;
1119                 const char *addrstr;
1120                 const char *ifaces;
1121                 char *tok, *line;
1122
1123                 line = lines[i];
1124                 while ((*line == ' ') || (*line == '\t')) {
1125                         line++;
1126                 }
1127                 if (*line == '#') {
1128                         continue;
1129                 }
1130                 if (strcmp(line, "") == 0) {
1131                         continue;
1132                 }
1133                 tok = strtok(line, " \t");
1134                 addrstr = tok;
1135                 tok = strtok(NULL, " \t");
1136                 if (tok == NULL) {
1137                         if (NULL == ctdb->default_public_interface) {
1138                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1139                                          i+1));
1140                                 talloc_free(lines);
1141                                 return -1;
1142                         }
1143                         ifaces = ctdb->default_public_interface;
1144                 } else {
1145                         ifaces = tok;
1146                 }
1147
1148                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1149                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1150                         talloc_free(lines);
1151                         return -1;
1152                 }
1153                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1154                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1155                         talloc_free(lines);
1156                         return -1;
1157                 }
1158         }
1159
1160
1161         talloc_free(lines);
1162         return 0;
1163 }
1164
1165 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1166                               const char *iface,
1167                               const char *ip)
1168 {
1169         struct ctdb_vnn *svnn;
1170         struct ctdb_iface *cur = NULL;
1171         bool ok;
1172         int ret;
1173
1174         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1175         CTDB_NO_MEMORY(ctdb, svnn);
1176
1177         svnn->ifaces = talloc_array(svnn, const char *, 2);
1178         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1179         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1180         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1181         svnn->ifaces[1] = NULL;
1182
1183         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1184         if (!ok) {
1185                 talloc_free(svnn);
1186                 return -1;
1187         }
1188
1189         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1190         if (ret != 0) {
1191                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1192                                    "for single_ip[%s]\n",
1193                                    svnn->ifaces[0],
1194                                    ctdb_addr_to_str(&svnn->public_address)));
1195                 talloc_free(svnn);
1196                 return -1;
1197         }
1198
1199         /* assume the single public ip interface is initially "good" */
1200         cur = ctdb_find_iface(ctdb, iface);
1201         if (cur == NULL) {
1202                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1203                 return -1;
1204         }
1205         cur->link_up = true;
1206
1207         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1208         if (ret != 0) {
1209                 talloc_free(svnn);
1210                 return -1;
1211         }
1212
1213         ctdb->single_ip_vnn = svnn;
1214         return 0;
1215 }
1216
1217 struct ctdb_public_ip_list {
1218         struct ctdb_public_ip_list *next;
1219         uint32_t pnn;
1220         ctdb_sock_addr addr;
1221 };
1222
1223 /* Given a physical node, return the number of
1224    public addresses that is currently assigned to this node.
1225 */
1226 static int node_ip_coverage(struct ctdb_context *ctdb, 
1227         int32_t pnn,
1228         struct ctdb_public_ip_list *ips)
1229 {
1230         int num=0;
1231
1232         for (;ips;ips=ips->next) {
1233                 if (ips->pnn == pnn) {
1234                         num++;
1235                 }
1236         }
1237         return num;
1238 }
1239
1240
1241 /* Can the given node host the given IP: is the public IP known to the
1242  * node and is NOIPHOST unset?
1243 */
1244 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1245                              struct ctdb_ipflags ipflags,
1246                              struct ctdb_public_ip_list *ip)
1247 {
1248         struct ctdb_all_public_ips *public_ips;
1249         int i;
1250
1251         if (ipflags.noiphost) {
1252                 return false;
1253         }
1254
1255         public_ips = ctdb->nodes[pnn]->available_public_ips;
1256
1257         if (public_ips == NULL) {
1258                 return false;
1259         }
1260
1261         for (i=0; i<public_ips->num; i++) {
1262                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1263                         /* yes, this node can serve this public ip */
1264                         return true;
1265                 }
1266         }
1267
1268         return false;
1269 }
1270
1271 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1272                                  struct ctdb_ipflags ipflags,
1273                                  struct ctdb_public_ip_list *ip)
1274 {
1275         if (ipflags.noiptakeover) {
1276                 return false;
1277         }
1278
1279         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1280 }
1281
1282 /* search the node lists list for a node to takeover this ip.
1283    pick the node that currently are serving the least number of ips
1284    so that the ips get spread out evenly.
1285 */
1286 static int find_takeover_node(struct ctdb_context *ctdb, 
1287                 struct ctdb_ipflags *ipflags,
1288                 struct ctdb_public_ip_list *ip,
1289                 struct ctdb_public_ip_list *all_ips)
1290 {
1291         int pnn, min=0, num;
1292         int i, numnodes;
1293
1294         numnodes = talloc_array_length(ipflags);
1295         pnn    = -1;
1296         for (i=0; i<numnodes; i++) {
1297                 /* verify that this node can serve this ip */
1298                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1299                         /* no it couldnt   so skip to the next node */
1300                         continue;
1301                 }
1302
1303                 num = node_ip_coverage(ctdb, i, all_ips);
1304                 /* was this the first node we checked ? */
1305                 if (pnn == -1) {
1306                         pnn = i;
1307                         min  = num;
1308                 } else {
1309                         if (num < min) {
1310                                 pnn = i;
1311                                 min  = num;
1312                         }
1313                 }
1314         }       
1315         if (pnn == -1) {
1316                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1317                         ctdb_addr_to_str(&ip->addr)));
1318
1319                 return -1;
1320         }
1321
1322         ip->pnn = pnn;
1323         return 0;
1324 }
1325
1326 #define IP_KEYLEN       4
1327 static uint32_t *ip_key(ctdb_sock_addr *ip)
1328 {
1329         static uint32_t key[IP_KEYLEN];
1330
1331         bzero(key, sizeof(key));
1332
1333         switch (ip->sa.sa_family) {
1334         case AF_INET:
1335                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1336                 break;
1337         case AF_INET6: {
1338                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1339                 key[0]  = htonl(s6_a32[0]);
1340                 key[1]  = htonl(s6_a32[1]);
1341                 key[2]  = htonl(s6_a32[2]);
1342                 key[3]  = htonl(s6_a32[3]);
1343                 break;
1344         }
1345         default:
1346                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1347                 return key;
1348         }
1349
1350         return key;
1351 }
1352
1353 static void *add_ip_callback(void *parm, void *data)
1354 {
1355         struct ctdb_public_ip_list *this_ip = parm; 
1356         struct ctdb_public_ip_list *prev_ip = data; 
1357
1358         if (prev_ip == NULL) {
1359                 return parm;
1360         }
1361         if (this_ip->pnn == -1) {
1362                 this_ip->pnn = prev_ip->pnn;
1363         }
1364
1365         return parm;
1366 }
1367
1368 static int getips_count_callback(void *param, void *data)
1369 {
1370         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1371         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1372
1373         new_ip->next = *ip_list;
1374         *ip_list     = new_ip;
1375         return 0;
1376 }
1377
1378 static struct ctdb_public_ip_list *
1379 create_merged_ip_list(struct ctdb_context *ctdb)
1380 {
1381         int i, j;
1382         struct ctdb_public_ip_list *ip_list;
1383         struct ctdb_all_public_ips *public_ips;
1384
1385         if (ctdb->ip_tree != NULL) {
1386                 talloc_free(ctdb->ip_tree);
1387                 ctdb->ip_tree = NULL;
1388         }
1389         ctdb->ip_tree = trbt_create(ctdb, 0);
1390
1391         for (i=0;i<ctdb->num_nodes;i++) {
1392                 public_ips = ctdb->nodes[i]->known_public_ips;
1393
1394                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1395                         continue;
1396                 }
1397
1398                 /* there were no public ips for this node */
1399                 if (public_ips == NULL) {
1400                         continue;
1401                 }               
1402
1403                 for (j=0;j<public_ips->num;j++) {
1404                         struct ctdb_public_ip_list *tmp_ip; 
1405
1406                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1407                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1408                         /* Do not use information about IP addresses hosted
1409                          * on other nodes, it may not be accurate */
1410                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1411                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1412                         } else {
1413                                 tmp_ip->pnn = -1;
1414                         }
1415                         tmp_ip->addr = public_ips->ips[j].addr;
1416                         tmp_ip->next = NULL;
1417
1418                         trbt_insertarray32_callback(ctdb->ip_tree,
1419                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1420                                 add_ip_callback,
1421                                 tmp_ip);
1422                 }
1423         }
1424
1425         ip_list = NULL;
1426         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1427
1428         return ip_list;
1429 }
1430
1431 /* 
1432  * This is the length of the longtest common prefix between the IPs.
1433  * It is calculated by XOR-ing the 2 IPs together and counting the
1434  * number of leading zeroes.  The implementation means that all
1435  * addresses end up being 128 bits long.
1436  *
1437  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1438  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1439  * lots of nodes and IP addresses?
1440  */
1441 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1442 {
1443         uint32_t ip1_k[IP_KEYLEN];
1444         uint32_t *t;
1445         int i;
1446         uint32_t x;
1447
1448         uint32_t distance = 0;
1449
1450         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1451         t = ip_key(ip2);
1452         for (i=0; i<IP_KEYLEN; i++) {
1453                 x = ip1_k[i] ^ t[i];
1454                 if (x == 0) {
1455                         distance += 32;
1456                 } else {
1457                         /* Count number of leading zeroes. 
1458                          * FIXME? This could be optimised...
1459                          */
1460                         while ((x & (1 << 31)) == 0) {
1461                                 x <<= 1;
1462                                 distance += 1;
1463                         }
1464                 }
1465         }
1466
1467         return distance;
1468 }
1469
1470 /* Calculate the IP distance for the given IP relative to IPs on the
1471    given node.  The ips argument is generally the all_ips variable
1472    used in the main part of the algorithm.
1473  */
1474 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1475                                   struct ctdb_public_ip_list *ips,
1476                                   int pnn)
1477 {
1478         struct ctdb_public_ip_list *t;
1479         uint32_t d;
1480
1481         uint32_t sum = 0;
1482
1483         for (t=ips; t != NULL; t=t->next) {
1484                 if (t->pnn != pnn) {
1485                         continue;
1486                 }
1487
1488                 /* Optimisation: We never calculate the distance
1489                  * between an address and itself.  This allows us to
1490                  * calculate the effect of removing an address from a
1491                  * node by simply calculating the distance between
1492                  * that address and all of the exitsing addresses.
1493                  * Moreover, we assume that we're only ever dealing
1494                  * with addresses from all_ips so we can identify an
1495                  * address via a pointer rather than doing a more
1496                  * expensive address comparison. */
1497                 if (&(t->addr) == ip) {
1498                         continue;
1499                 }
1500
1501                 d = ip_distance(ip, &(t->addr));
1502                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1503         }
1504
1505         return sum;
1506 }
1507
1508 /* Return the LCP2 imbalance metric for addresses currently assigned
1509    to the given node.
1510  */
1511 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1512 {
1513         struct ctdb_public_ip_list *t;
1514
1515         uint32_t imbalance = 0;
1516
1517         for (t=all_ips; t!=NULL; t=t->next) {
1518                 if (t->pnn != pnn) {
1519                         continue;
1520                 }
1521                 /* Pass the rest of the IPs rather than the whole
1522                    all_ips input list.
1523                 */
1524                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1525         }
1526
1527         return imbalance;
1528 }
1529
1530 /* Allocate any unassigned IPs just by looping through the IPs and
1531  * finding the best node for each.
1532  */
1533 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1534                                       struct ctdb_ipflags *ipflags,
1535                                       struct ctdb_public_ip_list *all_ips)
1536 {
1537         struct ctdb_public_ip_list *tmp_ip;
1538
1539         /* loop over all ip's and find a physical node to cover for 
1540            each unassigned ip.
1541         */
1542         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1543                 if (tmp_ip->pnn == -1) {
1544                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1545                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1546                                         ctdb_addr_to_str(&tmp_ip->addr)));
1547                         }
1548                 }
1549         }
1550 }
1551
1552 /* Basic non-deterministic rebalancing algorithm.
1553  */
1554 static void basic_failback(struct ctdb_context *ctdb,
1555                            struct ctdb_ipflags *ipflags,
1556                            struct ctdb_public_ip_list *all_ips,
1557                            int num_ips)
1558 {
1559         int i, numnodes;
1560         int maxnode, maxnum, minnode, minnum, num, retries;
1561         struct ctdb_public_ip_list *tmp_ip;
1562
1563         numnodes = talloc_array_length(ipflags);
1564         retries = 0;
1565
1566 try_again:
1567         maxnum=0;
1568         minnum=0;
1569
1570         /* for each ip address, loop over all nodes that can serve
1571            this ip and make sure that the difference between the node
1572            serving the most and the node serving the least ip's are
1573            not greater than 1.
1574         */
1575         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1576                 if (tmp_ip->pnn == -1) {
1577                         continue;
1578                 }
1579
1580                 /* Get the highest and lowest number of ips's served by any 
1581                    valid node which can serve this ip.
1582                 */
1583                 maxnode = -1;
1584                 minnode = -1;
1585                 for (i=0; i<numnodes; i++) {
1586                         /* only check nodes that can actually serve this ip */
1587                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1588                                 /* no it couldnt   so skip to the next node */
1589                                 continue;
1590                         }
1591
1592                         num = node_ip_coverage(ctdb, i, all_ips);
1593                         if (maxnode == -1) {
1594                                 maxnode = i;
1595                                 maxnum  = num;
1596                         } else {
1597                                 if (num > maxnum) {
1598                                         maxnode = i;
1599                                         maxnum  = num;
1600                                 }
1601                         }
1602                         if (minnode == -1) {
1603                                 minnode = i;
1604                                 minnum  = num;
1605                         } else {
1606                                 if (num < minnum) {
1607                                         minnode = i;
1608                                         minnum  = num;
1609                                 }
1610                         }
1611                 }
1612                 if (maxnode == -1) {
1613                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1614                                 ctdb_addr_to_str(&tmp_ip->addr)));
1615
1616                         continue;
1617                 }
1618
1619                 /* if the spread between the smallest and largest coverage by
1620                    a node is >=2 we steal one of the ips from the node with
1621                    most coverage to even things out a bit.
1622                    try to do this a limited number of times since we dont
1623                    want to spend too much time balancing the ip coverage.
1624                 */
1625                 if ( (maxnum > minnum+1)
1626                      && (retries < (num_ips + 5)) ){
1627                         struct ctdb_public_ip_list *tmp;
1628
1629                         /* Reassign one of maxnode's VNNs */
1630                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1631                                 if (tmp->pnn == maxnode) {
1632                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1633                                         retries++;
1634                                         goto try_again;;
1635                                 }
1636                         }
1637                 }
1638         }
1639 }
1640
1641 static void lcp2_init(struct ctdb_context *tmp_ctx,
1642                       struct ctdb_ipflags *ipflags,
1643                       struct ctdb_public_ip_list *all_ips,
1644                       uint32_t *force_rebalance_nodes,
1645                       uint32_t **lcp2_imbalances,
1646                       bool **rebalance_candidates)
1647 {
1648         int i, numnodes;
1649         struct ctdb_public_ip_list *tmp_ip;
1650
1651         numnodes = talloc_array_length(ipflags);
1652
1653         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1654         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1655         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1656         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1657
1658         for (i=0; i<numnodes; i++) {
1659                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1660                 /* First step: assume all nodes are candidates */
1661                 (*rebalance_candidates)[i] = true;
1662         }
1663
1664         /* 2nd step: if a node has IPs assigned then it must have been
1665          * healthy before, so we remove it from consideration.  This
1666          * is overkill but is all we have because we don't maintain
1667          * state between takeover runs.  An alternative would be to
1668          * keep state and invalidate it every time the recovery master
1669          * changes.
1670          */
1671         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1672                 if (tmp_ip->pnn != -1) {
1673                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1674                 }
1675         }
1676
1677         /* 3rd step: if a node is forced to re-balance then
1678            we allow failback onto the node */
1679         if (force_rebalance_nodes == NULL) {
1680                 return;
1681         }
1682         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1683                 uint32_t pnn = force_rebalance_nodes[i];
1684                 if (pnn >= numnodes) {
1685                         DEBUG(DEBUG_ERR,
1686                               (__location__ "unknown node %u\n", pnn));
1687                         continue;
1688                 }
1689
1690                 DEBUG(DEBUG_NOTICE,
1691                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1692                 (*rebalance_candidates)[pnn] = true;
1693         }
1694 }
1695
1696 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1697  * the IP/node combination that will cost the least.
1698  */
1699 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1700                                      struct ctdb_ipflags *ipflags,
1701                                      struct ctdb_public_ip_list *all_ips,
1702                                      uint32_t *lcp2_imbalances)
1703 {
1704         struct ctdb_public_ip_list *tmp_ip;
1705         int dstnode, numnodes;
1706
1707         int minnode;
1708         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1709         struct ctdb_public_ip_list *minip;
1710
1711         bool should_loop = true;
1712         bool have_unassigned = true;
1713
1714         numnodes = talloc_array_length(ipflags);
1715
1716         while (have_unassigned && should_loop) {
1717                 should_loop = false;
1718
1719                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1720                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1721
1722                 minnode = -1;
1723                 mindsum = 0;
1724                 minip = NULL;
1725
1726                 /* loop over each unassigned ip. */
1727                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1728                         if (tmp_ip->pnn != -1) {
1729                                 continue;
1730                         }
1731
1732                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1733                                 /* only check nodes that can actually takeover this ip */
1734                                 if (!can_node_takeover_ip(ctdb, dstnode,
1735                                                           ipflags[dstnode],
1736                                                           tmp_ip)) {
1737                                         /* no it couldnt   so skip to the next node */
1738                                         continue;
1739                                 }
1740
1741                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1742                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1743                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1744                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1745                                                    dstnode,
1746                                                    dstimbl - lcp2_imbalances[dstnode]));
1747
1748
1749                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1750                                         minnode = dstnode;
1751                                         minimbl = dstimbl;
1752                                         mindsum = dstdsum;
1753                                         minip = tmp_ip;
1754                                         should_loop = true;
1755                                 }
1756                         }
1757                 }
1758
1759                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1760
1761                 /* If we found one then assign it to the given node. */
1762                 if (minnode != -1) {
1763                         minip->pnn = minnode;
1764                         lcp2_imbalances[minnode] = minimbl;
1765                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1766                                           ctdb_addr_to_str(&(minip->addr)),
1767                                           minnode,
1768                                           mindsum));
1769                 }
1770
1771                 /* There might be a better way but at least this is clear. */
1772                 have_unassigned = false;
1773                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1774                         if (tmp_ip->pnn == -1) {
1775                                 have_unassigned = true;
1776                         }
1777                 }
1778         }
1779
1780         /* We know if we have an unassigned addresses so we might as
1781          * well optimise.
1782          */
1783         if (have_unassigned) {
1784                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1785                         if (tmp_ip->pnn == -1) {
1786                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1787                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1788                         }
1789                 }
1790         }
1791 }
1792
1793 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1794  * to move IPs from, determines the best IP/destination node
1795  * combination to move from the source node.
1796  */
1797 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1798                                     struct ctdb_ipflags *ipflags,
1799                                     struct ctdb_public_ip_list *all_ips,
1800                                     int srcnode,
1801                                     uint32_t *lcp2_imbalances,
1802                                     bool *rebalance_candidates)
1803 {
1804         int dstnode, mindstnode, numnodes;
1805         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1806         uint32_t minsrcimbl, mindstimbl;
1807         struct ctdb_public_ip_list *minip;
1808         struct ctdb_public_ip_list *tmp_ip;
1809
1810         /* Find an IP and destination node that best reduces imbalance. */
1811         srcimbl = 0;
1812         minip = NULL;
1813         minsrcimbl = 0;
1814         mindstnode = -1;
1815         mindstimbl = 0;
1816
1817         numnodes = talloc_array_length(ipflags);
1818
1819         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1820         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1821                            srcnode, lcp2_imbalances[srcnode]));
1822
1823         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1824                 /* Only consider addresses on srcnode. */
1825                 if (tmp_ip->pnn != srcnode) {
1826                         continue;
1827                 }
1828
1829                 /* What is this IP address costing the source node? */
1830                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1831                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1832
1833                 /* Consider this IP address would cost each potential
1834                  * destination node.  Destination nodes are limited to
1835                  * those that are newly healthy, since we don't want
1836                  * to do gratuitous failover of IPs just to make minor
1837                  * balance improvements.
1838                  */
1839                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1840                         if (!rebalance_candidates[dstnode]) {
1841                                 continue;
1842                         }
1843
1844                         /* only check nodes that can actually takeover this ip */
1845                         if (!can_node_takeover_ip(ctdb, dstnode,
1846                                                   ipflags[dstnode], tmp_ip)) {
1847                                 /* no it couldnt   so skip to the next node */
1848                                 continue;
1849                         }
1850
1851                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1852                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1853                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1854                                            srcnode, -srcdsum,
1855                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1856                                            dstnode, dstdsum));
1857
1858                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1859                             (dstdsum < srcdsum) &&                      \
1860                             ((mindstnode == -1) ||                              \
1861                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1862
1863                                 minip = tmp_ip;
1864                                 minsrcimbl = srcimbl;
1865                                 mindstnode = dstnode;
1866                                 mindstimbl = dstimbl;
1867                         }
1868                 }
1869         }
1870         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1871
1872         if (mindstnode != -1) {
1873                 /* We found a move that makes things better... */
1874                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1875                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1876                                   ctdb_addr_to_str(&(minip->addr)),
1877                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1878
1879
1880                 lcp2_imbalances[srcnode] = minsrcimbl;
1881                 lcp2_imbalances[mindstnode] = mindstimbl;
1882                 minip->pnn = mindstnode;
1883
1884                 return true;
1885         }
1886
1887         return false;
1888         
1889 }
1890
/* A node number paired with that node's LCP2 imbalance, so the two
 * can be sorted together.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.
 *
 * Orders entries by descending imbalance: returns -1 when a has the
 * larger imbalance, 1 when b does, and 0 when they are equal.  The
 * pnn field does not take part in the comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1909
1910 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1911  * node with the highest LCP2 imbalance, and then determines the best
1912  * IP/destination node combination to move from the source node.
1913  */
1914 static void lcp2_failback(struct ctdb_context *ctdb,
1915                           struct ctdb_ipflags *ipflags,
1916                           struct ctdb_public_ip_list *all_ips,
1917                           uint32_t *lcp2_imbalances,
1918                           bool *rebalance_candidates)
1919 {
1920         int i, numnodes;
1921         struct lcp2_imbalance_pnn * lips;
1922         bool again;
1923
1924         numnodes = talloc_array_length(ipflags);
1925
1926 try_again:
1927         /* Put the imbalances and nodes into an array, sort them and
1928          * iterate through candidates.  Usually the 1st one will be
1929          * used, so this doesn't cost much...
1930          */
1931         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1932         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1933         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1934         for (i=0; i<numnodes; i++) {
1935                 lips[i].imbalance = lcp2_imbalances[i];
1936                 lips[i].pnn = i;
1937                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1938         }
1939         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1940               lcp2_cmp_imbalance_pnn);
1941
1942         again = false;
1943         for (i=0; i<numnodes; i++) {
1944                 /* This means that all nodes had 0 or 1 addresses, so
1945                  * can't be imbalanced.
1946                  */
1947                 if (lips[i].imbalance == 0) {
1948                         break;
1949                 }
1950
1951                 if (lcp2_failback_candidate(ctdb,
1952                                             ipflags,
1953                                             all_ips,
1954                                             lips[i].pnn,
1955                                             lcp2_imbalances,
1956                                             rebalance_candidates)) {
1957                         again = true;
1958                         break;
1959                 }
1960         }
1961
1962         talloc_free(lips);
1963         if (again) {
1964                 goto try_again;
1965         }
1966 }
1967
1968 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1969                                     struct ctdb_ipflags *ipflags,
1970                                     struct ctdb_public_ip_list *all_ips)
1971 {
1972         struct ctdb_public_ip_list *tmp_ip;
1973
1974         /* verify that the assigned nodes can serve that public ip
1975            and set it to -1 if not
1976         */
1977         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1978                 if (tmp_ip->pnn == -1) {
1979                         continue;
1980                 }
1981                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1982                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1983                         /* this node can not serve this ip. */
1984                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1985                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1986                                            tmp_ip->pnn));
1987                         tmp_ip->pnn = -1;
1988                 }
1989         }
1990 }
1991
1992 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
1993                                        struct ctdb_ipflags *ipflags,
1994                                        struct ctdb_public_ip_list *all_ips)
1995 {
1996         struct ctdb_public_ip_list *tmp_ip;
1997         int i, numnodes;
1998
1999         numnodes = talloc_array_length(ipflags);
2000
2001         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2002        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2003         *  always be allocated the same way for a specific set of
2004         *  available/unavailable nodes.
2005         */
2006
2007         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2008                 tmp_ip->pnn = i % numnodes;
2009         }
2010
2011         /* IP failback doesn't make sense with deterministic
2012          * IPs, since the modulo step above implicitly fails
2013          * back IPs to their "home" node.
2014          */
2015         if (1 == ctdb->tunable.no_ip_failback) {
2016                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2017         }
2018
2019         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2020
2021         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2022
2023         /* No failback here! */
2024 }
2025
2026 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2027                                           struct ctdb_ipflags *ipflags,
2028                                           struct ctdb_public_ip_list *all_ips)
2029 {
2030         /* This should be pushed down into basic_failback. */
2031         struct ctdb_public_ip_list *tmp_ip;
2032         int num_ips = 0;
2033         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2034                 num_ips++;
2035         }
2036
2037         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2038
2039         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2040
2041         /* If we don't want IPs to fail back then don't rebalance IPs. */
2042         if (1 == ctdb->tunable.no_ip_failback) {
2043                 return;
2044         }
2045
2046         /* Now, try to make sure the ip adresses are evenly distributed
2047            across the nodes.
2048         */
2049         basic_failback(ctdb, ipflags, all_ips, num_ips);
2050 }
2051
2052 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2053                           struct ctdb_ipflags *ipflags,
2054                           struct ctdb_public_ip_list *all_ips,
2055                           uint32_t *force_rebalance_nodes)
2056 {
2057         uint32_t *lcp2_imbalances;
2058         bool *rebalance_candidates;
2059         int numnodes, num_rebalance_candidates, i;
2060
2061         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2062
2063         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2064
2065         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2066                   &lcp2_imbalances, &rebalance_candidates);
2067
2068         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2069
2070         /* If we don't want IPs to fail back then don't rebalance IPs. */
2071         if (1 == ctdb->tunable.no_ip_failback) {
2072                 goto finished;
2073         }
2074
2075         /* It is only worth continuing if we have suitable target
2076          * nodes to transfer IPs to.  This check is much cheaper than
2077          * continuing on...
2078          */
2079         numnodes = talloc_array_length(ipflags);
2080         num_rebalance_candidates = 0;
2081         for (i=0; i<numnodes; i++) {
2082                 if (rebalance_candidates[i]) {
2083                         num_rebalance_candidates++;
2084                 }
2085         }
2086         if (num_rebalance_candidates == 0) {
2087                 goto finished;
2088         }
2089
2090         /* Now, try to make sure the ip adresses are evenly distributed
2091            across the nodes.
2092         */
2093         lcp2_failback(ctdb, ipflags, all_ips,
2094                       lcp2_imbalances, rebalance_candidates);
2095
2096 finished:
2097         talloc_free(tmp_ctx);
2098 }
2099
2100 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2101 {
2102         int i;
2103
2104         for (i=0;i<nodemap->num;i++) {
2105                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2106                         /* Found one completely healthy node */
2107                         return false;
2108                 }
2109         }
2110
2111         return true;
2112 }
2113
2114 /* The calculation part of the IP allocation algorithm. */
2115 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2116                                    struct ctdb_ipflags *ipflags,
2117                                    struct ctdb_public_ip_list **all_ips_p,
2118                                    uint32_t *force_rebalance_nodes)
2119 {
2120         /* since nodes only know about those public addresses that
2121            can be served by that particular node, no single node has
2122            a full list of all public addresses that exist in the cluster.
2123            Walk over all node structures and create a merged list of
2124            all public addresses that exist in the cluster.
2125
2126            keep the tree of ips around as ctdb->ip_tree
2127         */
2128         *all_ips_p = create_merged_ip_list(ctdb);
2129
2130         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2131                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2132         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2133                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2134         } else {
2135                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2136         }
2137
2138         /* at this point ->pnn is the node which will own each IP
2139            or -1 if there is no node that can cover this ip
2140         */
2141
2142         return;
2143 }
2144
2145 struct get_tunable_callback_data {
2146         const char *tunable;
2147         uint32_t *out;
2148         bool fatal;
2149 };
2150
2151 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2152                                  int32_t res, TDB_DATA outdata,
2153                                  void *callback)
2154 {
2155         struct get_tunable_callback_data *cd =
2156                 (struct get_tunable_callback_data *)callback;
2157         int size;
2158
2159         if (res != 0) {
2160                 /* Already handled in fail callback */
2161                 return;
2162         }
2163
2164         if (outdata.dsize != sizeof(uint32_t)) {
2165                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2166                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2167                                  (int)outdata.dsize));
2168                 cd->fatal = true;
2169                 return;
2170         }
2171
2172         size = talloc_array_length(cd->out);
2173         if (pnn >= size) {
2174                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2175                                  cd->tunable, pnn, size));
2176                 return;
2177         }
2178
2179                 
2180         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2181 }
2182
2183 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2184                                        int32_t res, TDB_DATA outdata,
2185                                        void *callback)
2186 {
2187         struct get_tunable_callback_data *cd =
2188                 (struct get_tunable_callback_data *)callback;
2189
2190         switch (res) {
2191         case -ETIME:
2192                 DEBUG(DEBUG_ERR,
2193                       ("Timed out getting tunable \"%s\" from node %d\n",
2194                        cd->tunable, pnn));
2195                 cd->fatal = true;
2196                 break;
2197         case -EINVAL:
2198         case -1:
2199                 DEBUG(DEBUG_WARNING,
2200                       ("Tunable \"%s\" not implemented on node %d\n",
2201                        cd->tunable, pnn));
2202                 break;
2203         default:
2204                 DEBUG(DEBUG_ERR,
2205                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2206                        cd->tunable, pnn));
2207                 cd->fatal = true;
2208         }
2209 }
2210
2211 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2212                                         TALLOC_CTX *tmp_ctx,
2213                                         struct ctdb_node_map *nodemap,
2214                                         const char *tunable,
2215                                         uint32_t default_value)
2216 {
2217         TDB_DATA data;
2218         struct ctdb_control_get_tunable *t;
2219         uint32_t *nodes;
2220         uint32_t *tvals;
2221         struct get_tunable_callback_data callback_data;
2222         int i;
2223
2224         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2225         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2226         for (i=0; i<nodemap->num; i++) {
2227                 tvals[i] = default_value;
2228         }
2229                 
2230         callback_data.out = tvals;
2231         callback_data.tunable = tunable;
2232         callback_data.fatal = false;
2233
2234         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2235         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2236         t = (struct ctdb_control_get_tunable *)data.dptr;
2237         t->length = strlen(tunable)+1;
2238         memcpy(t->name, tunable, t->length);
2239         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2240         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2241                                       nodes, 0, TAKEOVER_TIMEOUT(),
2242                                       false, data,
2243                                       get_tunable_callback,
2244                                       get_tunable_fail_callback,
2245                                       &callback_data) != 0) {
2246                 if (callback_data.fatal) {
2247                         talloc_free(tvals);
2248                         tvals = NULL;
2249                 }
2250         }
2251         talloc_free(nodes);
2252         talloc_free(data.dptr);
2253
2254         return tvals;
2255 }
2256
2257 struct get_runstate_callback_data {
2258         enum ctdb_runstate *out;
2259         bool fatal;
2260 };
2261
2262 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2263                                   int32_t res, TDB_DATA outdata,
2264                                   void *callback_data)
2265 {
2266         struct get_runstate_callback_data *cd =
2267                 (struct get_runstate_callback_data *)callback_data;
2268         int size;
2269
2270         if (res != 0) {
2271                 /* Already handled in fail callback */
2272                 return;
2273         }
2274
2275         if (outdata.dsize != sizeof(uint32_t)) {
2276                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2277                                  pnn, (int)sizeof(uint32_t),
2278                                  (int)outdata.dsize));
2279                 cd->fatal = true;
2280                 return;
2281         }
2282
2283         size = talloc_array_length(cd->out);
2284         if (pnn >= size) {
2285                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2286                                  pnn, size));
2287                 return;
2288         }
2289
2290         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2291 }
2292
2293 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2294                                        int32_t res, TDB_DATA outdata,
2295                                        void *callback)
2296 {
2297         struct get_runstate_callback_data *cd =
2298                 (struct get_runstate_callback_data *)callback;
2299
2300         switch (res) {
2301         case -ETIME:
2302                 DEBUG(DEBUG_ERR,
2303                       ("Timed out getting runstate from node %d\n", pnn));
2304                 cd->fatal = true;
2305                 break;
2306         default:
2307                 DEBUG(DEBUG_WARNING,
2308                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2309                        pnn));
2310         }
2311 }
2312
2313 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2314                                                     TALLOC_CTX *tmp_ctx,
2315                                                     struct ctdb_node_map *nodemap,
2316                                                     enum ctdb_runstate default_value)
2317 {
2318         uint32_t *nodes;
2319         enum ctdb_runstate *rs;
2320         struct get_runstate_callback_data callback_data;
2321         int i;
2322
2323         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2324         CTDB_NO_MEMORY_NULL(ctdb, rs);
2325         for (i=0; i<nodemap->num; i++) {
2326                 rs[i] = default_value;
2327         }
2328
2329         callback_data.out = rs;
2330         callback_data.fatal = false;
2331
2332         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2333         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2334                                       nodes, 0, TAKEOVER_TIMEOUT(),
2335                                       true, tdb_null,
2336                                       get_runstate_callback,
2337                                       get_runstate_fail_callback,
2338                                       &callback_data) != 0) {
2339                 if (callback_data.fatal) {
2340                         free(rs);
2341                         rs = NULL;
2342                 }
2343         }
2344         talloc_free(nodes);
2345
2346         return rs;
2347 }
2348
2349 /* Set internal flags for IP allocation:
2350  *   Clear ip flags
2351  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2352  *   Set NOIPHOST ip flag for each INACTIVE node
2353  *   if all nodes are disabled:
2354  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2355  *   else
2356  *     Set NOIPHOST ip flags for disabled nodes
2357  */
2358 static struct ctdb_ipflags *
2359 set_ipflags_internal(struct ctdb_context *ctdb,
2360                      TALLOC_CTX *tmp_ctx,
2361                      struct ctdb_node_map *nodemap,
2362                      uint32_t *tval_noiptakeover,
2363                      uint32_t *tval_noiphostonalldisabled,
2364                      enum ctdb_runstate *runstate)
2365 {
2366         int i;
2367         struct ctdb_ipflags *ipflags;
2368
2369         /* Clear IP flags - implicit due to talloc_zero */
2370         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2371         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2372
2373         for (i=0;i<nodemap->num;i++) {
2374                 /* Can not take IPs on node with NoIPTakeover set */
2375                 if (tval_noiptakeover[i] != 0) {
2376                         ipflags[i].noiptakeover = true;
2377                 }
2378
2379                 /* Can not host IPs on node not in RUNNING state */
2380                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2381                         ipflags[i].noiphost = true;
2382                         continue;
2383                 }
2384                 /* Can not host IPs on INACTIVE node */
2385                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2386                         ipflags[i].noiphost = true;
2387                 }
2388                 /* Remember the runstate */
2389                 ipflags[i].runstate = runstate[i];
2390         }
2391
2392         if (all_nodes_are_disabled(nodemap)) {
2393                 /* If all nodes are disabled, can not host IPs on node
2394                  * with NoIPHostOnAllDisabled set
2395                  */
2396                 for (i=0;i<nodemap->num;i++) {
2397                         if (tval_noiphostonalldisabled[i] != 0) {
2398                                 ipflags[i].noiphost = true;
2399                         }
2400                 }
2401         } else {
2402                 /* If some nodes are not disabled, then can not host
2403                  * IPs on DISABLED node
2404                  */
2405                 for (i=0;i<nodemap->num;i++) {
2406                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2407                                 ipflags[i].noiphost = true;
2408                         }
2409                 }
2410         }
2411
2412         return ipflags;
2413 }
2414
2415 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2416                                         TALLOC_CTX *tmp_ctx,
2417                                         struct ctdb_node_map *nodemap)
2418 {
2419         uint32_t *tval_noiptakeover;
2420         uint32_t *tval_noiphostonalldisabled;
2421         struct ctdb_ipflags *ipflags;
2422         enum ctdb_runstate *runstate;
2423
2424
2425         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2426                                                    "NoIPTakeover", 0);
2427         if (tval_noiptakeover == NULL) {
2428                 return NULL;
2429         }
2430
2431         tval_noiphostonalldisabled =
2432                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2433                                        "NoIPHostOnAllDisabled", 0);
2434         if (tval_noiphostonalldisabled == NULL) {
2435                 /* Caller frees tmp_ctx */
2436                 return NULL;
2437         }
2438
2439         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2440          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2441          * reasonable behaviour on a mixed cluster during upgrade.
2442          */
2443         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2444                                            CTDB_RUNSTATE_RUNNING);
2445         if (runstate == NULL) {
2446                 /* Caller frees tmp_ctx */
2447                 return NULL;
2448         }
2449
2450         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2451                                        tval_noiptakeover,
2452                                        tval_noiphostonalldisabled,
2453                                        runstate);
2454
2455         talloc_free(tval_noiptakeover);
2456         talloc_free(tval_noiphostonalldisabled);
2457         talloc_free(runstate);
2458
2459         return ipflags;
2460 }
2461
2462 struct iprealloc_callback_data {
2463         bool *retry_nodes;
2464         int retry_count;
2465         client_async_callback fail_callback;
2466         void *fail_callback_data;
2467         struct ctdb_node_map *nodemap;
2468 };
2469
2470 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2471                                         int32_t res, TDB_DATA outdata,
2472                                         void *callback)
2473 {
2474         int numnodes;
2475         struct iprealloc_callback_data *cd =
2476                 (struct iprealloc_callback_data *)callback;
2477
2478         numnodes = talloc_array_length(cd->retry_nodes);
2479         if (pnn > numnodes) {
2480                 DEBUG(DEBUG_ERR,
2481                       ("ipreallocated failure from node %d, "
2482                        "but only %d nodes in nodemap\n",
2483                        pnn, numnodes));
2484                 return;
2485         }
2486
2487         /* Can't run the "ipreallocated" event on a INACTIVE node */
2488         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2489                 DEBUG(DEBUG_WARNING,
2490                       ("ipreallocated failed on inactive node %d, ignoring\n",
2491                        pnn));
2492                 return;
2493         }
2494
2495         switch (res) {
2496         case -ETIME:
2497                 /* If the control timed out then that's a real error,
2498                  * so call the real fail callback
2499                  */
2500                 if (cd->fail_callback) {
2501                         cd->fail_callback(ctdb, pnn, res, outdata,
2502                                           cd->fail_callback_data);
2503                 } else {
2504                         DEBUG(DEBUG_WARNING,
2505                               ("iprealloc timed out but no callback registered\n"));
2506                 }
2507                 break;
2508         default:
2509                 /* If not a timeout then either the ipreallocated
2510                  * eventscript (or some setup) failed.  This might
2511                  * have failed because the IPREALLOCATED control isn't
2512                  * implemented - right now there is no way of knowing
2513                  * because the error codes are all folded down to -1.
2514                  * Consider retrying using EVENTSCRIPT control...
2515                  */
2516                 DEBUG(DEBUG_WARNING,
2517                       ("ipreallocated failure from node %d, flagging retry\n",
2518                        pnn));
2519                 cd->retry_nodes[pnn] = true;
2520                 cd->retry_count++;
2521         }
2522 }
2523
2524 struct takeover_callback_data {
2525         bool *node_failed;
2526         client_async_callback fail_callback;
2527         void *fail_callback_data;
2528         struct ctdb_node_map *nodemap;
2529 };
2530
2531 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2532                                        uint32_t node_pnn, int32_t res,
2533                                        TDB_DATA outdata, void *callback_data)
2534 {
2535         struct takeover_callback_data *cd =
2536                 talloc_get_type_abort(callback_data,
2537                                       struct takeover_callback_data);
2538         int i;
2539
2540         for (i = 0; i < cd->nodemap->num; i++) {
2541                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2542                         break;
2543                 }
2544         }
2545
2546         if (i == cd->nodemap->num) {
2547                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2548                 return;
2549         }
2550
2551         if (!cd->node_failed[i]) {
2552                 cd->node_failed[i] = true;
2553                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2554                                   cd->fail_callback_data);
2555         }
2556 }
2557
2558 /*
2559   make any IP alias changes for public addresses that are necessary 
2560  */
2561 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2562                       uint32_t *force_rebalance_nodes,
2563                       client_async_callback fail_callback, void *callback_data)
2564 {
2565         int i, j, ret;
2566         struct ctdb_public_ip ip;
2567         uint32_t *nodes;
2568         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2569         TDB_DATA data;
2570         struct timeval timeout;
2571         struct client_async_data *async_data;
2572         struct ctdb_client_control_state *state;
2573         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2574         struct ctdb_ipflags *ipflags;
2575         struct takeover_callback_data *takeover_data;
2576         struct iprealloc_callback_data iprealloc_data;
2577         bool *retry_data;
2578         bool can_host_ips;
2579
2580         /*
2581          * ip failover is completely disabled, just send out the 
2582          * ipreallocated event.
2583          */
2584         if (ctdb->tunable.disable_ip_failover != 0) {
2585                 goto ipreallocated;
2586         }
2587
2588         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2589         if (ipflags == NULL) {
2590                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2591                 talloc_free(tmp_ctx);
2592                 return -1;
2593         }
2594
2595         /* Short-circuit IP allocation if no nodes are in the RUNNING
2596          * runstate yet, since no nodes will be able to host IPs */
2597         can_host_ips = false;
2598         for (i=0; i<nodemap->num; i++) {
2599                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2600                         can_host_ips = true;
2601                 }
2602         }
2603         if (!can_host_ips) {
2604                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2605                 return 0;
2606         }
2607
2608         /* Do the IP reassignment calculations */
2609         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2610
2611         /* Now tell all nodes to release any public IPs should not
2612          * host.  This will be a NOOP on nodes that don't currently
2613          * hold the given IP.
2614          */
2615         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2616         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2617
2618         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2619                                                        bool, nodemap->num);
2620         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2621         takeover_data->fail_callback = fail_callback;
2622         takeover_data->fail_callback_data = callback_data;
2623         takeover_data->nodemap = nodemap;
2624
2625         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2626         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2627
2628         async_data->fail_callback = takeover_run_fail_callback;
2629         async_data->callback_data = takeover_data;
2630
2631         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2632
2633         /* Send a RELEASE_IP to all nodes that should not be hosting
2634          * each IP.  For each IP, all but one of these will be
2635          * redundant.  However, the redundant ones are used to tell
2636          * nodes which node should be hosting the IP so that commands
2637          * like "ctdb ip" can display a particular nodes idea of who
2638          * is hosting what. */
2639         for (i=0;i<nodemap->num;i++) {
2640                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2641                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2642                         continue;
2643                 }
2644
2645                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2646                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2647                                 /* This node should be serving this
2648                                    vnn so dont tell it to release the ip
2649                                 */
2650                                 continue;
2651                         }
2652                         ip.pnn  = tmp_ip->pnn;
2653                         ip.addr = tmp_ip->addr;
2654
2655                         timeout = TAKEOVER_TIMEOUT();
2656                         data.dsize = sizeof(ip);
2657                         data.dptr  = (uint8_t *)&ip;
2658                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2659                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2660                                                   data, async_data,
2661                                                   &timeout, NULL);
2662                         if (state == NULL) {
2663                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2664                                 talloc_free(tmp_ctx);
2665                                 return -1;
2666                         }
2667
2668                         ctdb_client_async_add(async_data, state);
2669                 }
2670         }
2671         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2672                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2673                 talloc_free(tmp_ctx);
2674                 return -1;
2675         }
2676         talloc_free(async_data);
2677
2678
2679         /* For each IP, send a TAKOVER_IP to the node that should be
2680          * hosting it.  Many of these will often be redundant (since
2681          * the allocation won't have changed) but they can be useful
2682          * to recover from inconsistencies. */
2683         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2684         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2685
2686         async_data->fail_callback = fail_callback;
2687         async_data->callback_data = callback_data;
2688
2689         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2690                 if (tmp_ip->pnn == -1) {
2691                         /* this IP won't be taken over */
2692                         continue;
2693                 }
2694
2695                 ip.pnn  = tmp_ip->pnn;
2696                 ip.addr = tmp_ip->addr;
2697
2698                 timeout = TAKEOVER_TIMEOUT();
2699                 data.dsize = sizeof(ip);
2700                 data.dptr  = (uint8_t *)&ip;
2701                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2702                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2703                                           data, async_data, &timeout, NULL);
2704                 if (state == NULL) {
2705                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2706                         talloc_free(tmp_ctx);
2707                         return -1;
2708                 }
2709
2710                 ctdb_client_async_add(async_data, state);
2711         }
2712         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2713                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2714                 talloc_free(tmp_ctx);
2715                 return -1;
2716         }
2717
2718 ipreallocated:
2719         /*
2720          * Tell all nodes to run eventscripts to process the
2721          * "ipreallocated" event.  This can do a lot of things,
2722          * including restarting services to reconfigure them if public
2723          * IPs have moved.  Once upon a time this event only used to
2724          * update natgw.
2725          */
2726         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2727         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2728         iprealloc_data.retry_nodes = retry_data;
2729         iprealloc_data.retry_count = 0;
2730         iprealloc_data.fail_callback = fail_callback;
2731         iprealloc_data.fail_callback_data = callback_data;
2732         iprealloc_data.nodemap = nodemap;
2733
2734         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2735         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2736                                         nodes, 0, TAKEOVER_TIMEOUT(),
2737                                         false, tdb_null,
2738                                         NULL, iprealloc_fail_callback,
2739                                         &iprealloc_data);
2740         if (ret != 0) {
2741                 /* If the control failed then we should retry to any
2742                  * nodes flagged by iprealloc_fail_callback using the
2743                  * EVENTSCRIPT control.  This is a best-effort at
2744                  * backward compatiblity when running a mixed cluster
2745                  * where some nodes have not yet been upgraded to
2746                  * support the IPREALLOCATED control.
2747                  */
2748                 DEBUG(DEBUG_WARNING,
2749                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2750
2751                 nodes = talloc_array(tmp_ctx, uint32_t,
2752                                      iprealloc_data.retry_count);
2753                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2754
2755                 j = 0;
2756                 for (i=0; i<nodemap->num; i++) {
2757                         if (iprealloc_data.retry_nodes[i]) {
2758                                 nodes[j] = i;
2759                                 j++;
2760                         }
2761                 }
2762
2763                 data.dptr  = discard_const("ipreallocated");
2764                 data.dsize = strlen((char *)data.dptr) + 1; 
2765                 ret = ctdb_client_async_control(ctdb,
2766                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2767                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2768                                                 false, data,
2769                                                 NULL, fail_callback,
2770                                                 callback_data);
2771                 if (ret != 0) {
2772                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2773                 }
2774         }
2775
2776         talloc_free(tmp_ctx);
2777         return ret;
2778 }
2779
2780
2781 /*
2782   destroy a ctdb_client_ip structure
2783  */
2784 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2785 {
2786         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2787                 ctdb_addr_to_str(&ip->addr),
2788                 ntohs(ip->addr.ip.sin_port),
2789                 ip->client_id));
2790
2791         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2792         return 0;
2793 }
2794
2795 /*
2796   called by a client to inform us of a TCP connection that it is managing
2797   that should tickled with an ACK when IP takeover is done
2798  */
2799 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2800                                 TDB_DATA indata)
2801 {
2802         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2803         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2804         struct ctdb_tcp_list *tcp;
2805         struct ctdb_tcp_connection t;
2806         int ret;
2807         TDB_DATA data;
2808         struct ctdb_client_ip *ip;
2809         struct ctdb_vnn *vnn;
2810         ctdb_sock_addr addr;
2811
2812         /* If we don't have public IPs, tickles are useless */
2813         if (ctdb->vnn == NULL) {
2814                 return 0;
2815         }
2816
2817         tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2818
2819         addr = tcp_sock->src;
2820         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2821         addr = tcp_sock->dest;
2822         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2823
2824         ZERO_STRUCT(addr);
2825         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2826         vnn = find_public_ip_vnn(ctdb, &addr);
2827         if (vnn == NULL) {
2828                 switch (addr.sa.sa_family) {
2829                 case AF_INET:
2830                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2831                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2832                                         ctdb_addr_to_str(&addr)));
2833                         }
2834                         break;
2835                 case AF_INET6:
2836                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2837                                 ctdb_addr_to_str(&addr)));
2838                         break;
2839                 default:
2840                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2841                 }
2842
2843                 return 0;
2844         }
2845
2846         if (vnn->pnn != ctdb->pnn) {
2847                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2848                         ctdb_addr_to_str(&addr),
2849                         client_id, client->pid));
2850                 /* failing this call will tell smbd to die */
2851                 return -1;
2852         }
2853
2854         ip = talloc(client, struct ctdb_client_ip);
2855         CTDB_NO_MEMORY(ctdb, ip);
2856
2857         ip->ctdb      = ctdb;
2858         ip->addr      = addr;
2859         ip->client_id = client_id;
2860         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2861         DLIST_ADD(ctdb->client_ip_list, ip);
2862
2863         tcp = talloc(client, struct ctdb_tcp_list);
2864         CTDB_NO_MEMORY(ctdb, tcp);
2865
2866         tcp->connection.src_addr = tcp_sock->src;
2867         tcp->connection.dst_addr = tcp_sock->dest;
2868
2869         DLIST_ADD(client->tcp_list, tcp);
2870
2871         t.src_addr = tcp_sock->src;
2872         t.dst_addr = tcp_sock->dest;
2873
2874         data.dptr = (uint8_t *)&t;
2875         data.dsize = sizeof(t);
2876
2877         switch (addr.sa.sa_family) {
2878         case AF_INET:
2879                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2880                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2881                         ctdb_addr_to_str(&tcp_sock->src),
2882                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2883                 break;
2884         case AF_INET6:
2885                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2886                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2887                         ctdb_addr_to_str(&tcp_sock->src),
2888                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2889                 break;
2890         default:
2891                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2892         }
2893
2894
2895         /* tell all nodes about this tcp connection */
2896         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2897                                        CTDB_CONTROL_TCP_ADD,
2898                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2899         if (ret != 0) {
2900                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2901                 return -1;
2902         }
2903
2904         return 0;
2905 }
2906
2907 /*
2908   find a tcp address on a list
2909  */
2910 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2911                                            struct ctdb_tcp_connection *tcp)
2912 {
2913         int i;
2914
2915         if (array == NULL) {
2916                 return NULL;
2917         }
2918
2919         for (i=0;i<array->num;i++) {
2920                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2921                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2922                         return &array->connections[i];
2923                 }
2924         }
2925         return NULL;
2926 }
2927
2928
2929
2930 /*
2931   called by a daemon to inform us of a TCP connection that one of its
2932   clients managing that should tickled with an ACK when IP takeover is
2933   done
2934  */
2935 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2936 {
2937         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2938         struct ctdb_tcp_array *tcparray;
2939         struct ctdb_tcp_connection tcp;
2940         struct ctdb_vnn *vnn;
2941
2942         /* If we don't have public IPs, tickles are useless */
2943         if (ctdb->vnn == NULL) {
2944                 return 0;
2945         }
2946
2947         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2948         if (vnn == NULL) {
2949                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2950                         ctdb_addr_to_str(&p->dst_addr)));
2951
2952                 return -1;
2953         }
2954
2955
2956         tcparray = vnn->tcp_array;
2957
2958         /* If this is the first tickle */
2959         if (tcparray == NULL) {
2960                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2961                 CTDB_NO_MEMORY(ctdb, tcparray);
2962                 vnn->tcp_array = tcparray;
2963
2964                 tcparray->num = 0;
2965                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2966                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2967
2968                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2969                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2970                 tcparray->num++;
2971
2972                 if (tcp_update_needed) {
2973                         vnn->tcp_update_needed = true;
2974                 }
2975                 return 0;
2976         }
2977
2978
2979         /* Do we already have this tickle ?*/
2980         tcp.src_addr = p->src_addr;
2981         tcp.dst_addr = p->dst_addr;
2982         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2983                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2984                         ctdb_addr_to_str(&tcp.dst_addr),
2985                         ntohs(tcp.dst_addr.ip.sin_port),
2986                         vnn->pnn));
2987                 return 0;
2988         }
2989
2990         /* A new tickle, we must add it to the array */
2991         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2992                                         struct ctdb_tcp_connection,
2993                                         tcparray->num+1);
2994         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2995
2996         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2997         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2998         tcparray->num++;
2999
3000         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3001                 ctdb_addr_to_str(&tcp.dst_addr),
3002                 ntohs(tcp.dst_addr.ip.sin_port),
3003                 vnn->pnn));
3004
3005         if (tcp_update_needed) {
3006                 vnn->tcp_update_needed = true;
3007         }
3008
3009         return 0;
3010 }
3011
3012
3013 /*
3014   called by a daemon to inform us of a TCP connection that one of its
3015   clients managing that should tickled with an ACK when IP takeover is
3016   done
3017  */
3018 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3019 {
3020         struct ctdb_tcp_connection *tcpp;
3021         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3022
3023         if (vnn == NULL) {
3024                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3025                         ctdb_addr_to_str(&conn->dst_addr)));
3026                 return;
3027         }
3028
3029         /* if the array is empty we cant remove it
3030            and we dont need to do anything
3031          */
3032         if (vnn->tcp_array == NULL) {
3033                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3034                         ctdb_addr_to_str(&conn->dst_addr),
3035                         ntohs(conn->dst_addr.ip.sin_port)));
3036                 return;
3037         }
3038
3039
3040         /* See if we know this connection
3041            if we dont know this connection  then we dont need to do anything
3042          */
3043         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3044         if (tcpp == NULL) {
3045                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3046                         ctdb_addr_to_str(&conn->dst_addr),
3047                         ntohs(conn->dst_addr.ip.sin_port)));
3048                 return;
3049         }
3050
3051
3052         /* We need to remove this entry from the array.
3053            Instead of allocating a new array and copying data to it
3054            we cheat and just copy the last entry in the existing array
3055            to the entry that is to be removed and just shring the 
3056            ->num field
3057          */
3058         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3059         vnn->tcp_array->num--;
3060
3061         /* If we deleted the last entry we also need to remove the entire array
3062          */
3063         if (vnn->tcp_array->num == 0) {
3064                 talloc_free(vnn->tcp_array);
3065                 vnn->tcp_array = NULL;
3066         }               
3067
3068         vnn->tcp_update_needed = true;
3069
3070         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3071                 ctdb_addr_to_str(&conn->src_addr),
3072                 ntohs(conn->src_addr.ip.sin_port)));
3073 }
3074
3075
3076 /*
3077   called by a daemon to inform us of a TCP connection that one of its
3078   clients used are no longer needed in the tickle database
3079  */
3080 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3081 {
3082         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3083
3084         /* If we don't have public IPs, tickles are useless */
3085         if (ctdb->vnn == NULL) {
3086                 return 0;
3087         }
3088
3089         ctdb_remove_tcp_connection(ctdb, conn);
3090
3091         return 0;
3092 }
3093
3094
3095 /*
3096   Called when another daemon starts - causes all tickles for all
3097   public addresses we are serving to be sent to the new node on the
3098   next check.  This actually causes the next scheduled call to
3099   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3100   doesn't require careful error handling.
3101  */
3102 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3103 {
3104         struct ctdb_vnn *vnn;
3105
3106         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3107                            (unsigned long) pnn));
3108
3109         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3110                 vnn->tcp_update_needed = true;
3111         }
3112
3113         return 0;
3114 }
3115
3116
3117 /*
3118   called when a client structure goes away - hook to remove
3119   elements from the tcp_list in all daemons
3120  */
3121 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3122 {
3123         while (client->tcp_list) {
3124                 struct ctdb_tcp_list *tcp = client->tcp_list;
3125                 DLIST_REMOVE(client->tcp_list, tcp);
3126                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3127         }
3128 }
3129
3130
3131 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3132 {
3133         struct ctdb_vnn *vnn;
3134         int count = 0;
3135
3136         if (ctdb->tunable.disable_ip_failover == 1) {
3137                 return;
3138         }
3139
3140         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3141                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3142                         ctdb_vnn_unassign_iface(ctdb, vnn);
3143                         continue;
3144                 }
3145                 if (!vnn->iface) {
3146                         continue;
3147                 }
3148
3149                 /* Don't allow multiple releases at once.  Some code,
3150                  * particularly ctdb_tickle_sentenced_connections() is
3151                  * not re-entrant */
3152                 if (vnn->update_in_flight) {
3153                         DEBUG(DEBUG_WARNING,
3154                               (__location__
3155                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3156                                     ctdb_addr_to_str(&vnn->public_address),
3157                                     vnn->public_netmask_bits,
3158                                     ctdb_vnn_iface_string(vnn)));
3159                         continue;
3160                 }
3161                 vnn->update_in_flight = true;
3162
3163                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3164                                     ctdb_addr_to_str(&vnn->public_address),
3165                                     vnn->public_netmask_bits,
3166                                     ctdb_vnn_iface_string(vnn)));
3167
3168                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3169                                   ctdb_vnn_iface_string(vnn),
3170                                   ctdb_addr_to_str(&vnn->public_address),
3171                                   vnn->public_netmask_bits);
3172                 release_kill_clients(ctdb, &vnn->public_address);
3173                 ctdb_vnn_unassign_iface(ctdb, vnn);
3174                 vnn->update_in_flight = false;
3175                 count++;
3176         }
3177
3178         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3179 }
3180
3181
3182 /*
3183   get list of public IPs
3184  */
3185 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3186                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3187 {
3188         int i, num, len;
3189         struct ctdb_all_public_ips *ips;
3190         struct ctdb_vnn *vnn;
3191         bool only_available = false;
3192
3193         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3194                 only_available = true;
3195         }
3196
3197         /* count how many public ip structures we have */
3198         num = 0;
3199         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3200                 num++;
3201         }
3202
3203         len = offsetof(struct ctdb_all_public_ips, ips) + 
3204                 num*sizeof(struct ctdb_public_ip);
3205         ips = talloc_zero_size(outdata, len);
3206         CTDB_NO_MEMORY(ctdb, ips);
3207
3208         i = 0;
3209         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3210                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3211                         continue;
3212                 }
3213                 ips->ips[i].pnn  = vnn->pnn;
3214                 ips->ips[i].addr = vnn->public_address;
3215                 i++;
3216         }
3217         ips->num = i;
3218         len = offsetof(struct ctdb_all_public_ips, ips) +
3219                 i*sizeof(struct ctdb_public_ip);
3220
3221         outdata->dsize = len;
3222         outdata->dptr  = (uint8_t *)ips;
3223
3224         return 0;
3225 }
3226
3227
3228 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3229                                         struct ctdb_req_control *c,
3230                                         TDB_DATA indata,
3231                                         TDB_DATA *outdata)
3232 {
3233         int i, num, len;
3234         ctdb_sock_addr *addr;
3235         struct ctdb_control_public_ip_info *info;
3236         struct ctdb_vnn *vnn;
3237
3238         addr = (ctdb_sock_addr *)indata.dptr;
3239
3240         vnn = find_public_ip_vnn(ctdb, addr);
3241         if (vnn == NULL) {
3242                 /* if it is not a public ip   it could be our 'single ip' */
3243                 if (ctdb->single_ip_vnn) {
3244                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3245                                 vnn = ctdb->single_ip_vnn;
3246                         }
3247                 }
3248         }
3249         if (vnn == NULL) {
3250                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3251                                  "'%s'not a public address\n",
3252                                  ctdb_addr_to_str(addr)));
3253                 return -1;
3254         }
3255
3256         /* count how many public ip structures we have */
3257         num = 0;
3258         for (;vnn->ifaces[num];) {
3259                 num++;
3260         }
3261
3262         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3263                 num*sizeof(struct ctdb_control_iface_info);
3264         info = talloc_zero_size(outdata, len);
3265         CTDB_NO_MEMORY(ctdb, info);
3266
3267         info->ip.addr = vnn->public_address;
3268         info->ip.pnn = vnn->pnn;
3269         info->active_idx = 0xFFFFFFFF;
3270
3271         for (i=0; vnn->ifaces[i]; i++) {
3272                 struct ctdb_iface *cur;
3273
3274                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3275                 if (cur == NULL) {
3276                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3277                                            vnn->ifaces[i]));
3278                         return -1;
3279                 }
3280                 if (vnn->iface == cur) {
3281                         info->active_idx = i;
3282                 }
3283                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3284                 info->ifaces[i].link_state = cur->link_up;
3285                 info->ifaces[i].references = cur->references;
3286         }
3287         info->num = i;
3288         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3289                 i*sizeof(struct ctdb_control_iface_info);
3290
3291         outdata->dsize = len;
3292         outdata->dptr  = (uint8_t *)info;
3293
3294         return 0;
3295 }
3296
3297 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3298                                 struct ctdb_req_control *c,
3299                                 TDB_DATA *outdata)
3300 {
3301         int i, num, len;
3302         struct ctdb_control_get_ifaces *ifaces;
3303         struct ctdb_iface *cur;
3304
3305         /* count how many public ip structures we have */
3306         num = 0;
3307         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3308                 num++;
3309         }
3310
3311         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3312                 num*sizeof(struct ctdb_control_iface_info);
3313         ifaces = talloc_zero_size(outdata, len);
3314         CTDB_NO_MEMORY(ctdb, ifaces);
3315
3316         i = 0;
3317         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3318                 strcpy(ifaces->ifaces[i].name, cur->name);
3319                 ifaces->ifaces[i].link_state = cur->link_up;
3320                 ifaces->ifaces[i].references = cur->references;
3321                 i++;
3322         }
3323         ifaces->num = i;
3324         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3325                 i*sizeof(struct ctdb_control_iface_info);
3326
3327         outdata->dsize = len;
3328         outdata->dptr  = (uint8_t *)ifaces;
3329
3330         return 0;
3331 }
3332
3333 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3334                                     struct ctdb_req_control *c,
3335                                     TDB_DATA indata)
3336 {
3337         struct ctdb_control_iface_info *info;
3338         struct ctdb_iface *iface;
3339         bool link_up = false;
3340
3341         info = (struct ctdb_control_iface_info *)indata.dptr;
3342
3343         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3344                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3345                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3346                                   len, len, info->name));
3347                 return -1;
3348         }
3349
3350         switch (info->link_state) {
3351         case 0:
3352                 link_up = false;
3353                 break;
3354         case 1:
3355                 link_up = true;
3356                 break;
3357         default:
3358                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3359                                   (unsigned int)info->link_state));
3360                 return -1;
3361         }
3362
3363         if (info->references != 0) {
3364                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3365                                   (unsigned int)info->references));
3366                 return -1;
3367         }
3368
3369         iface = ctdb_find_iface(ctdb, info->name);
3370         if (iface == NULL) {
3371                 return -1;
3372         }
3373
3374         if (link_up == iface->link_up) {
3375                 return 0;
3376         }
3377
3378         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3379               ("iface[%s] has changed it's link status %s => %s\n",
3380                iface->name,
3381                iface->link_up?"up":"down",
3382                link_up?"up":"down"));
3383
3384         iface->link_up = link_up;
3385         return 0;
3386 }
3387
3388
3389 /* 
3390    structure containing the listening socket and the list of tcp connections
3391    that the ctdb daemon is to kill
3392 */
3393 struct ctdb_kill_tcp {
3394         struct ctdb_vnn *vnn;
3395         struct ctdb_context *ctdb;
3396         int capture_fd;
3397         struct fd_event *fde;
3398         trbt_tree_t *connections;
3399         void *private_data;
3400 };
3401
3402 /*
3403   a tcp connection that is to be killed
3404  */
3405 struct ctdb_killtcp_con {
3406         ctdb_sock_addr src_addr;
3407         ctdb_sock_addr dst_addr;
3408         int count;
3409         struct ctdb_kill_tcp *killtcp;
3410 };
3411
3412 /* this function is used to create a key to represent this socketpair
3413    in the killtcp tree.
3414    this key is used to insert and lookup matching socketpairs that are
3415    to be tickled and RST
3416 */
3417 #define KILLTCP_KEYLEN  10
3418 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3419 {
3420         static uint32_t key[KILLTCP_KEYLEN];
3421
3422         bzero(key, sizeof(key));
3423
3424         if (src->sa.sa_family != dst->sa.sa_family) {
3425                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3426                 return key;
3427         }
3428         
3429         switch (src->sa.sa_family) {
3430         case AF_INET:
3431                 key[0]  = dst->ip.sin_addr.s_addr;
3432                 key[1]  = src->ip.sin_addr.s_addr;
3433                 key[2]  = dst->ip.sin_port;
3434                 key[3]  = src->ip.sin_port;
3435                 break;
3436         case AF_INET6: {
3437                 uint32_t *dst6_addr32 =
3438                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3439                 uint32_t *src6_addr32 =
3440                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3441                 key[0]  = dst6_addr32[3];
3442                 key[1]  = src6_addr32[3];
3443                 key[2]  = dst6_addr32[2];
3444                 key[3]  = src6_addr32[2];
3445                 key[4]  = dst6_addr32[1];
3446                 key[5]  = src6_addr32[1];
3447                 key[6]  = dst6_addr32[0];
3448                 key[7]  = src6_addr32[0];
3449                 key[8]  = dst->ip6.sin6_port;
3450                 key[9]  = src->ip6.sin6_port;
3451                 break;
3452         }
3453         default:
3454                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3455                 return key;
3456         }
3457
3458         return key;
3459 }
3460
3461 /*
3462   called when we get a read event on the raw socket
3463  */
3464 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3465                                 uint16_t flags, void *private_data)
3466 {
3467         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3468         struct ctdb_killtcp_con *con;
3469         ctdb_sock_addr src, dst;
3470         uint32_t ack_seq, seq;
3471
3472         if (!(flags & EVENT_FD_READ)) {
3473                 return;
3474         }
3475
3476         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3477                                 killtcp->private_data,
3478                                 &src, &dst,
3479                                 &ack_seq, &seq) != 0) {
3480                 /* probably a non-tcp ACK packet */
3481                 return;
3482         }
3483
3484         /* check if we have this guy in our list of connections
3485            to kill
3486         */
3487         con = trbt_lookuparray32(killtcp->connections, 
3488                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3489         if (con == NULL) {
3490                 /* no this was some other packet we can just ignore */
3491                 return;
3492         }
3493
3494         /* This one has been tickled !
3495            now reset him and remove him from the list.
3496          */
3497         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3498                 ntohs(con->dst_addr.ip.sin_port),
3499                 ctdb_addr_to_str(&con->src_addr),
3500                 ntohs(con->src_addr.ip.sin_port)));
3501
3502         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3503         talloc_free(con);
3504 }
3505
3506
3507 /* when traversing the list of all tcp connections to send tickle acks to
3508    (so that we can capture the ack coming back and kill the connection
3509     by a RST)
3510    this callback is called for each connection we are currently trying to kill
3511 */
3512 static int tickle_connection_traverse(void *param, void *data)
3513 {
3514         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3515
3516         /* have tried too many times, just give up */
3517         if (con->count >= 5) {
3518                 /* can't delete in traverse: reparent to delete_cons */
3519                 talloc_steal(param, con);
3520                 return 0;
3521         }
3522
3523         /* othervise, try tickling it again */
3524         con->count++;
3525         ctdb_sys_send_tcp(
3526                 (ctdb_sock_addr *)&con->dst_addr,
3527                 (ctdb_sock_addr *)&con->src_addr,
3528                 0, 0, 0);
3529         return 0;
3530 }
3531
3532
3533 /* 
3534    called every second until all sentenced connections have been reset
3535  */
3536 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3537                                               struct timeval t, void *private_data)
3538 {
3539         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3540         void *delete_cons = talloc_new(NULL);
3541
3542         /* loop over all connections sending tickle ACKs */
3543         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3544
3545         /* now we've finished traverse, it's safe to do deletion. */
3546         talloc_free(delete_cons);
3547
3548         /* If there are no more connections to kill we can remove the
3549            entire killtcp structure
3550          */
3551         if ( (killtcp->connections == NULL) || 
3552              (killtcp->connections->root == NULL) ) {
3553                 talloc_free(killtcp);
3554                 return;
3555         }
3556
3557         /* try tickling them again in a seconds time
3558          */
3559         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3560                         ctdb_tickle_sentenced_connections, killtcp);
3561 }
3562
3563 /*
3564   destroy the killtcp structure
3565  */
3566 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3567 {
3568         struct ctdb_vnn *tmpvnn;
3569
3570         /* verify that this vnn is still active */
3571         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3572                 if (tmpvnn == killtcp->vnn) {
3573                         break;
3574                 }
3575         }
3576
3577         if (tmpvnn == NULL) {
3578                 return 0;
3579         }
3580
3581         if (killtcp->vnn->killtcp != killtcp) {
3582                 return 0;
3583         }
3584
3585         killtcp->vnn->killtcp = NULL;
3586
3587         return 0;
3588 }
3589
3590
/* Insert callback for the killtcp tree: unconditionally replace any
   existing connection structure with the new one.

   The old entry, if there was one, is not freed here; according to
   the original note it is talloc_stolen by the same tree node and is
   deleted together with the new data.

   parm is the new entry, data the existing one (ignored).
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
3602
3603 /*
3604   add a tcp socket to the list of connections we want to RST
3605  */
3606 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3607                                        ctdb_sock_addr *s,
3608                                        ctdb_sock_addr *d)
3609 {
3610         ctdb_sock_addr src, dst;
3611         struct ctdb_kill_tcp *killtcp;
3612         struct ctdb_killtcp_con *con;
3613         struct ctdb_vnn *vnn;
3614
3615         ctdb_canonicalize_ip(s, &src);
3616         ctdb_canonicalize_ip(d, &dst);
3617
3618         vnn = find_public_ip_vnn(ctdb, &dst);
3619         if (vnn == NULL) {
3620                 vnn = find_public_ip_vnn(ctdb, &src);
3621         }
3622         if (vnn == NULL) {
3623                 /* if it is not a public ip   it could be our 'single ip' */
3624                 if (ctdb->single_ip_vnn) {
3625                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3626                                 vnn = ctdb->single_ip_vnn;
3627                         }
3628                 }
3629         }
3630         if (vnn == NULL) {
3631                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3632                 return -1;
3633         }
3634
3635         killtcp = vnn->killtcp;
3636         
3637         /* If this is the first connection to kill we must allocate
3638            a new structure
3639          */
3640         if (killtcp == NULL) {
3641                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3642                 CTDB_NO_MEMORY(ctdb, killtcp);
3643
3644                 killtcp->vnn         = vnn;
3645                 killtcp->ctdb        = ctdb;
3646                 killtcp->capture_fd  = -1;
3647                 killtcp->connections = trbt_create(killtcp, 0);
3648
3649                 vnn->killtcp         = killtcp;
3650                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3651         }
3652
3653
3654
3655         /* create a structure that describes this connection we want to
3656            RST and store it in killtcp->connections
3657         */
3658         con = talloc(killtcp, struct ctdb_killtcp_con);
3659         CTDB_NO_MEMORY(ctdb, con);
3660         con->src_addr = src;
3661         con->dst_addr = dst;
3662         con->count    = 0;
3663         con->killtcp  = killtcp;
3664
3665
3666         trbt_insertarray32_callback(killtcp->connections,
3667                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3668                         add_killtcp_callback, con);
3669
3670         /* 
3671            If we dont have a socket to listen on yet we must create it
3672          */
3673         if (killtcp->capture_fd == -1) {
3674                 const char *iface = ctdb_vnn_iface_string(vnn);
3675                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3676                 if (killtcp->capture_fd == -1) {
3677                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3678                                           "socket on iface '%s' for killtcp (%s)\n",
3679                                           iface, strerror(errno)));
3680                         goto failed;
3681                 }
3682         }
3683
3684
3685         if (killtcp->fde == NULL) {
3686                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3687                                             EVENT_FD_READ,
3688                                             capture_tcp_handler, killtcp);
3689                 tevent_fd_set_auto_close(killtcp->fde);
3690
3691                 /* We also need to set up some events to tickle all these connections
3692                    until they are all reset
3693                 */
3694                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3695                                 ctdb_tickle_sentenced_connections, killtcp);
3696         }
3697
3698         /* tickle him once now */
3699         ctdb_sys_send_tcp(
3700                 &con->dst_addr,
3701                 &con->src_addr,
3702                 0, 0, 0);
3703
3704         return 0;
3705
3706 failed:
3707         talloc_free(vnn->killtcp);
3708         vnn->killtcp = NULL;
3709         return -1;
3710 }
3711
3712 /*
3713   kill a TCP connection.
3714  */
3715 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3716 {
3717         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3718
3719         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3720 }
3721
3722 /*
3723   called by a daemon to inform us of the entire list of TCP tickles for
3724   a particular public address.
3725   this control should only be sent by the node that is currently serving
3726   that public address.
3727  */
3728 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3729 {
3730         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3731         struct ctdb_tcp_array *tcparray;
3732         struct ctdb_vnn *vnn;
3733
3734         /* We must at least have tickles.num or else we cant verify the size
3735            of the received data blob
3736          */
3737         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3738                                         tickles.connections)) {
3739                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3740                 return -1;
3741         }
3742
3743         /* verify that the size of data matches what we expect */
3744         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3745                                 tickles.connections)
3746                          + sizeof(struct ctdb_tcp_connection)
3747                                  * list->tickles.num) {
3748                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3749                 return -1;
3750         }
3751
3752         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3753                            ctdb_addr_to_str(&list->addr)));
3754
3755         vnn = find_public_ip_vnn(ctdb, &list->addr);
3756         if (vnn == NULL) {
3757                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3758                         ctdb_addr_to_str(&list->addr)));
3759
3760                 return 1;
3761         }
3762
3763         /* remove any old ticklelist we might have */
3764         talloc_free(vnn->tcp_array);
3765         vnn->tcp_array = NULL;
3766
3767         tcparray = talloc(vnn, struct ctdb_tcp_array);
3768         CTDB_NO_MEMORY(ctdb, tcparray);
3769
3770         tcparray->num = list->tickles.num;
3771
3772         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3773         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3774
3775         memcpy(tcparray->connections, &list->tickles.connections[0],
3776                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3777
3778         /* We now have a new fresh tickle list array for this vnn */
3779         vnn->tcp_array = tcparray;
3780
3781         return 0;
3782 }
3783
3784 /*
3785   called to return the full list of tickles for the puclic address associated 
3786   with the provided vnn
3787  */
3788 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3789 {
3790         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3791         struct ctdb_control_tcp_tickle_list *list;
3792         struct ctdb_tcp_array *tcparray;
3793         int num;
3794         struct ctdb_vnn *vnn;
3795
3796         vnn = find_public_ip_vnn(ctdb, addr);
3797         if (vnn == NULL) {
3798                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3799                         ctdb_addr_to_str(addr)));
3800
3801                 return 1;
3802         }
3803
3804         tcparray = vnn->tcp_array;
3805         if (tcparray) {
3806                 num = tcparray->num;
3807         } else {
3808                 num = 0;
3809         }
3810
3811         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3812                                 tickles.connections)
3813                         + sizeof(struct ctdb_tcp_connection) * num;
3814
3815         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3816         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3817         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3818
3819         list->addr = *addr;
3820         list->tickles.num = num;
3821         if (num) {
3822                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3823                         sizeof(struct ctdb_tcp_connection) * num);
3824         }
3825
3826         return 0;
3827 }
3828
3829
3830 /*
3831   set the list of all tcp tickles for a public address
3832  */
3833 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3834                                             ctdb_sock_addr *addr,
3835                                             struct ctdb_tcp_array *tcparray)
3836 {
3837         int ret, num;
3838         TDB_DATA data;
3839         struct ctdb_control_tcp_tickle_list *list;
3840
3841         if (tcparray) {
3842                 num = tcparray->num;
3843         } else {
3844                 num = 0;
3845         }
3846
3847         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3848                                 tickles.connections) +
3849                         sizeof(struct ctdb_tcp_connection) * num;
3850         data.dptr = talloc_size(ctdb, data.dsize);
3851         CTDB_NO_MEMORY(ctdb, data.dptr);
3852
3853         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3854         list->addr = *addr;
3855         list->tickles.num = num;
3856         if (tcparray) {
3857                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3858         }
3859
3860         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3861                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3862                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3863         if (ret != 0) {
3864                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3865                 return -1;
3866         }
3867
3868         talloc_free(data.dptr);
3869
3870         return ret;
3871 }
3872
3873
3874 /*
3875   perform tickle updates if required
3876  */
3877 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3878                                 struct timed_event *te, 
3879                                 struct timeval t, void *private_data)
3880 {
3881         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3882         int ret;
3883         struct ctdb_vnn *vnn;
3884
3885         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3886                 /* we only send out updates for public addresses that 
3887                    we have taken over
3888                  */
3889                 if (ctdb->pnn != vnn->pnn) {
3890                         continue;
3891                 }
3892                 /* We only send out the updates if we need to */
3893                 if (!vnn->tcp_update_needed) {
3894                         continue;
3895                 }
3896                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3897                                                        &vnn->public_address,
3898                                                        vnn->tcp_array);
3899                 if (ret != 0) {
3900                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3901                                 ctdb_addr_to_str(&vnn->public_address)));
3902                 } else {
3903                         DEBUG(DEBUG_INFO,
3904                               ("Sent tickle update for public address %s\n",
3905                                ctdb_addr_to_str(&vnn->public_address)));
3906                         vnn->tcp_update_needed = false;
3907                 }
3908         }
3909
3910         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3911                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3912                              ctdb_update_tcp_tickles, ctdb);
3913 }               
3914         
3915
3916 /*
3917   start periodic update of tcp tickles
3918  */
3919 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3920 {
3921         ctdb->tickle_update_context = talloc_new(ctdb);
3922
3923         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3924                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3925                              ctdb_update_tcp_tickles, ctdb);
3926 }
3927
3928
3929
3930
3931 struct control_gratious_arp {
3932         struct ctdb_context *ctdb;
3933         ctdb_sock_addr addr;
3934         const char *iface;
3935         int count;
3936 };
3937
3938 /*
3939   send a control_gratuitous arp
3940  */
3941 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3942                                   struct timeval t, void *private_data)
3943 {
3944         int ret;
3945         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3946                                                         struct control_gratious_arp);
3947
3948         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3949         if (ret != 0) {
3950                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3951                                  arp->iface, strerror(errno)));
3952         }
3953
3954
3955         arp->count++;
3956         if (arp->count == CTDB_ARP_REPEAT) {
3957                 talloc_free(arp);
3958                 return;
3959         }
3960
3961         event_add_timed(arp->ctdb->ev, arp, 
3962                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3963                         send_gratious_arp, arp);
3964 }
3965
3966
3967 /*
3968   send a gratious arp 
3969  */
3970 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3971 {
3972         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3973         struct control_gratious_arp *arp;
3974
3975         /* verify the size of indata */
3976         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3977                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3978                                  (unsigned)indata.dsize, 
3979                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3980                 return -1;
3981         }
3982         if (indata.dsize != 
3983                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3984                 + gratious_arp->len ) ){
3985
3986                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3987                         "but should be %u bytes\n", 
3988                          (unsigned)indata.dsize, 
3989                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3990                 return -1;
3991         }
3992
3993
3994         arp = talloc(ctdb, struct control_gratious_arp);
3995         CTDB_NO_MEMORY(ctdb, arp);
3996
3997         arp->ctdb  = ctdb;
3998         arp->addr   = gratious_arp->addr;
3999         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4000         CTDB_NO_MEMORY(ctdb, arp->iface);
4001         arp->count = 0;
4002         
4003         event_add_timed(arp->ctdb->ev, arp, 
4004                         timeval_zero(), send_gratious_arp, arp);
4005
4006         return 0;
4007 }
4008
4009 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4010 {
4011         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4012         int ret;
4013
4014         /* verify the size of indata */
4015         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4016                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4017                 return -1;
4018         }
4019         if (indata.dsize != 
4020                 ( offsetof(struct ctdb_control_ip_iface, iface)
4021                 + pub->len ) ){
4022
4023                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4024                         "but should be %u bytes\n", 
4025                          (unsigned)indata.dsize, 
4026                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4027                 return -1;
4028         }
4029
4030         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4031
4032         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4033
4034         if (ret != 0) {
4035                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4036                 return -1;
4037         }
4038
4039         return 0;
4040 }
4041
/*
  State handed to delete_ip_callback.
 */
struct delete_ip_callback_state {
	/* the delete-address control request that is answered by the callback */
	struct ctdb_req_control *c;
};
4045
4046 /*
4047   called when releaseip event finishes for del_public_address
4048  */
4049 static void delete_ip_callback(struct ctdb_context *ctdb,
4050                                int32_t status, TDB_DATA data,
4051                                const char *errormsg,
4052                                void *private_data)
4053 {
4054         struct delete_ip_callback_state *state =
4055                 talloc_get_type(private_data, struct delete_ip_callback_state);
4056
4057         /* If release failed then fail. */
4058         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4059         talloc_free(private_data);
4060 }
4061
4062 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4063                                         struct ctdb_req_control *c,
4064                                         TDB_DATA indata, bool *async_reply)
4065 {
4066         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4067         struct ctdb_vnn *vnn;
4068
4069         /* verify the size of indata */
4070         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4071                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4072                 return -1;
4073         }
4074         if (indata.dsize != 
4075                 ( offsetof(struct ctdb_control_ip_iface, iface)
4076                 + pub->len ) ){
4077
4078                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4079                         "but should be %u bytes\n", 
4080                          (unsigned)indata.dsize, 
4081                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4082                 return -1;
4083         }
4084
4085         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4086
4087         /* walk over all public addresses until we find a match */
4088         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4089                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4090                         if (vnn->pnn == ctdb->pnn) {
4091                                 struct delete_ip_callback_state *state;
4092                                 struct ctdb_public_ip *ip;
4093                                 TDB_DATA data;
4094                                 int ret;
4095
4096                                 vnn->delete_pending = true;
4097
4098                                 state = talloc(ctdb,
4099                                                struct delete_ip_callback_state);
4100                                 CTDB_NO_MEMORY(ctdb, state);
4101                                 state->c = c;
4102
4103                                 ip = talloc(state, struct ctdb_public_ip);
4104                                 if (ip == NULL) {
4105                                         DEBUG(DEBUG_ERR,
4106                                               (__location__ " Out of memory\n"));
4107                                         talloc_free(state);
4108                                         return -1;
4109                                 }
4110                                 ip->pnn = -1;
4111                                 ip->addr = pub->addr;
4112
4113                                 data.dsize = sizeof(struct ctdb_public_ip);
4114                                 data.dptr = (unsigned char *)ip;
4115
4116                                 ret = ctdb_daemon_send_control(ctdb,
4117                                                                ctdb_get_pnn(ctdb),
4118                                                                0,
4119                                                                CTDB_CONTROL_RELEASE_IP,
4120                                                                0, 0,
4121                                                                data,
4122                                                                delete_ip_callback,
4123                                                                state);
4124                                 if (ret == -1) {
4125                                         DEBUG(DEBUG_ERR,
4126                                               (__location__ "Unable to send "
4127                                                "CTDB_CONTROL_RELEASE_IP\n"));
4128                                         talloc_free(state);
4129                                         return -1;
4130                                 }
4131
4132                                 state->c = talloc_steal(state, c);
4133                                 *async_reply = true;
4134                         } else {
4135                                 /* This IP is not hosted on the
4136                                  * current node so just delete it
4137                                  * now. */
4138                                 do_delete_ip(ctdb, vnn);
4139                         }
4140
4141                         return 0;
4142                 }
4143         }
4144
4145         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4146                          ctdb_addr_to_str(&pub->addr)));
4147         return -1;
4148 }
4149
4150
/*
  State handed to ctdb_ipreallocated_callback.
 */
struct ipreallocated_callback_state {
	/* the control request answered once the event has finished */
	struct ctdb_req_control *c;
};
4154
4155 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4156                                         int status, void *p)
4157 {
4158         struct ipreallocated_callback_state *state =
4159                 talloc_get_type(p, struct ipreallocated_callback_state);
4160
4161         if (status != 0) {
4162                 DEBUG(DEBUG_ERR,
4163                       (" \"ipreallocated\" event script failed (status %d)\n",
4164                        status));
4165                 if (status == -ETIME) {
4166                         ctdb_ban_self(ctdb);
4167                 }
4168         }
4169
4170         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4171         talloc_free(state);
4172 }
4173
4174 /* A control to run the ipreallocated event */
4175 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4176                                    struct ctdb_req_control *c,
4177                                    bool *async_reply)
4178 {
4179         int ret;
4180         struct ipreallocated_callback_state *state;
4181
4182         state = talloc(ctdb, struct ipreallocated_callback_state);
4183         CTDB_NO_MEMORY(ctdb, state);
4184
4185         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4186
4187         ret = ctdb_event_script_callback(ctdb, state,
4188                                          ctdb_ipreallocated_callback, state,
4189                                          CTDB_EVENT_IPREALLOCATED,
4190                                          "%s", "");
4191
4192         if (ret != 0) {
4193                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4194                 talloc_free(state);
4195                 return -1;
4196         }
4197
4198         /* tell the control that we will be reply asynchronously */
4199         state->c    = talloc_steal(state, c);
4200         *async_reply = true;
4201
4202         return 0;
4203 }
4204
4205
4206 /* This function is called from the recovery daemon to verify that a remote
4207    node has the expected ip allocation.
4208    This is verified against ctdb->ip_tree
4209 */
4210 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4211                                 struct ctdb_all_public_ips *ips,
4212                                 uint32_t pnn)
4213 {
4214         struct ctdb_public_ip_list *tmp_ip; 
4215         int i;
4216
4217         if (ctdb->ip_tree == NULL) {
4218                 /* dont know the expected allocation yet, assume remote node
4219                    is correct. */
4220                 return 0;
4221         }
4222
4223         if (ips == NULL) {
4224                 return 0;
4225         }
4226
4227         for (i=0; i<ips->num; i++) {
4228                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4229                 if (tmp_ip == NULL) {
4230                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4231                         return -1;
4232                 }
4233
4234                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4235                         continue;
4236                 }
4237
4238                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4239                         DEBUG(DEBUG_ERR,
4240                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4241                                pnn,
4242                                ctdb_addr_to_str(&ips->ips[i].addr),
4243                                ips->ips[i].pnn, tmp_ip->pnn));
4244                         return -1;
4245                 }
4246         }
4247
4248         return 0;
4249 }
4250
4251 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4252 {
4253         struct ctdb_public_ip_list *tmp_ip;
4254
4255         /* IP tree is never built if DisableIPFailover is set */
4256         if (ctdb->tunable.disable_ip_failover != 0) {
4257                 return 0;
4258         }
4259
4260         if (ctdb->ip_tree == NULL) {
4261                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4262                 return -1;
4263         }
4264
4265         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4266         if (tmp_ip == NULL) {
4267                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4268                 return -1;
4269         }
4270
4271         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4272         tmp_ip->pnn = ip->pnn;
4273
4274         return 0;
4275 }
4276
4277 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4278 {
4279         TALLOC_FREE(ctdb->ip_tree);
4280 }
4281
/*
  Bookkeeping for one in-progress "reload public IPs" request.
 */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* control request to answer when the handle is destroyed, or NULL */
	struct ctdb_req_control *c;
	/* status sent in that reply */
	int status;
	/* pipe between daemon and child: [0] is read by the daemon,
	   [1] is written by the child */
	int fd[2];
	/* pid of the forked child */
	pid_t child;
	/* fd event watching fd[0] */
	struct fd_event *fde;
};
4290
4291 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4292 {
4293         if (h == h->ctdb->reload_ips) {
4294                 h->ctdb->reload_ips = NULL;
4295         }
4296         if (h->c != NULL) {
4297                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4298                 h->c = NULL;
4299         }
4300         ctdb_kill(h->ctdb, h->child, SIGKILL);
4301         return 0;
4302 }
4303
4304 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4305                                 struct timed_event *te,
4306                                 struct timeval t, void *private_data)
4307 {
4308         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4309
4310         talloc_free(h);
4311 }       
4312
4313 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4314                              uint16_t flags, void *private_data)
4315 {
4316         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4317
4318         char res;
4319         int ret;
4320
4321         ret = sys_read(h->fd[0], &res, 1);
4322         if (ret < 1 || res != 0) {
4323                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4324                 res = 1;
4325         }
4326         h->status = res;
4327
4328         talloc_free(h);
4329 }
4330
4331 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4332 {
4333         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4334         struct ctdb_all_public_ips *ips;
4335         struct ctdb_vnn *vnn;
4336         struct client_async_data *async_data;
4337         struct timeval timeout;
4338         TDB_DATA data;
4339         struct ctdb_client_control_state *state;
4340         bool first_add;
4341         int i, ret;
4342
4343         CTDB_NO_MEMORY(ctdb, mem_ctx);
4344
4345         /* Read IPs from local node */
4346         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4347                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4348         if (ret != 0) {
4349                 DEBUG(DEBUG_ERR,
4350                       ("Unable to fetch public IPs from local node\n"));
4351                 talloc_free(mem_ctx);
4352                 return -1;
4353         }
4354
4355         /* Read IPs file - this is safe since this is a child process */
4356         ctdb->vnn = NULL;
4357         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4358                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4359                 talloc_free(mem_ctx);
4360                 return -1;
4361         }
4362
4363         async_data = talloc_zero(mem_ctx, struct client_async_data);
4364         CTDB_NO_MEMORY(ctdb, async_data);
4365
4366         /* Compare IPs between node and file for IPs to be deleted */
4367         for (i = 0; i < ips->num; i++) {
4368                 /* */
4369                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4370                         if (ctdb_same_ip(&vnn->public_address,
4371                                          &ips->ips[i].addr)) {
4372                                 /* IP is still in file */
4373                                 break;
4374                         }
4375                 }
4376
4377                 if (vnn == NULL) {
4378                         /* Delete IP ips->ips[i] */
4379                         struct ctdb_control_ip_iface *pub;
4380
4381                         DEBUG(DEBUG_NOTICE,
4382                               ("IP %s no longer configured, deleting it\n",
4383                                ctdb_addr_to_str(&ips->ips[i].addr)));
4384
4385                         pub = talloc_zero(mem_ctx,
4386                                           struct ctdb_control_ip_iface);
4387                         CTDB_NO_MEMORY(ctdb, pub);
4388
4389                         pub->addr  = ips->ips[i].addr;
4390                         pub->mask  = 0;
4391                         pub->len   = 0;
4392
4393                         timeout = TAKEOVER_TIMEOUT();
4394
4395                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4396                                               iface) + pub->len;
4397                         data.dptr = (uint8_t *)pub;
4398
4399                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4400                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4401                                                   0, data, async_data,
4402                                                   &timeout, NULL);
4403                         if (state == NULL) {
4404                                 DEBUG(DEBUG_ERR,
4405                                       (__location__
4406                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4407                                 goto failed;
4408                         }
4409
4410                         ctdb_client_async_add(async_data, state);
4411                 }
4412         }
4413
4414         /* Compare IPs between node and file for IPs to be added */
4415         first_add = true;
4416         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4417                 for (i = 0; i < ips->num; i++) {
4418                         if (ctdb_same_ip(&vnn->public_address,
4419                                          &ips->ips[i].addr)) {
4420                                 /* IP already on node */
4421                                 break;
4422                         }
4423                 }
4424                 if (i == ips->num) {
4425                         /* Add IP ips->ips[i] */
4426                         struct ctdb_control_ip_iface *pub;
4427                         const char *ifaces = NULL;
4428                         uint32_t len;
4429                         int iface = 0;
4430
4431                         DEBUG(DEBUG_NOTICE,
4432                               ("New IP %s configured, adding it\n",
4433                                ctdb_addr_to_str(&vnn->public_address)));
4434                         if (first_add) {
4435                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4436
4437                                 data.dsize = sizeof(pnn);
4438                                 data.dptr  = (uint8_t *)&pnn;
4439
4440                                 ret = ctdb_client_send_message(
4441                                         ctdb,
4442                                         CTDB_BROADCAST_CONNECTED,
4443                                         CTDB_SRVID_REBALANCE_NODE,
4444                                         data);
4445                                 if (ret != 0) {
4446                                         DEBUG(DEBUG_WARNING,
4447                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4448                                 }
4449
4450                                 first_add = false;
4451                         }
4452
4453                         ifaces = vnn->ifaces[0];
4454                         iface = 1;
4455                         while (vnn->ifaces[iface] != NULL) {
4456                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4457                                                          vnn->ifaces[iface]);
4458                                 iface++;
4459                         }
4460
4461                         len   = strlen(ifaces) + 1;
4462                         pub = talloc_zero_size(mem_ctx,
4463                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4464                         CTDB_NO_MEMORY(ctdb, pub);
4465
4466                         pub->addr  = vnn->public_address;
4467                         pub->mask  = vnn->public_netmask_bits;
4468                         pub->len   = len;
4469                         memcpy(&pub->iface[0], ifaces, pub->len);
4470
4471                         timeout = TAKEOVER_TIMEOUT();
4472
4473                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4474                                               iface) + pub->len;
4475                         data.dptr = (uint8_t *)pub;
4476
4477                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4478                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4479                                                   0, data, async_data,
4480                                                   &timeout, NULL);
4481                         if (state == NULL) {
4482                                 DEBUG(DEBUG_ERR,
4483                                       (__location__
4484                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4485                                 goto failed;
4486                         }
4487
4488                         ctdb_client_async_add(async_data, state);
4489                 }
4490         }
4491
4492         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4493                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4494                 goto failed;
4495         }
4496
4497         talloc_free(mem_ctx);
4498         return 0;
4499
4500 failed:
4501         talloc_free(mem_ctx);
4502         return -1;
4503 }
4504
4505 /* This control is sent to force the node to re-read the public addresses file
4506    and drop any addresses we should nnot longer host, and add new addresses
4507    that we are now able to host
4508 */
4509 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4510 {
4511         struct ctdb_reloadips_handle *h;
4512         pid_t parent = getpid();
4513
4514         if (ctdb->reload_ips != NULL) {
4515                 talloc_free(ctdb->reload_ips);
4516                 ctdb->reload_ips = NULL;
4517         }
4518
4519         h = talloc(ctdb, struct ctdb_reloadips_handle);
4520         CTDB_NO_MEMORY(ctdb, h);
4521         h->ctdb     = ctdb;
4522         h->c        = NULL;
4523         h->status   = -1;
4524         
4525         if (pipe(h->fd) == -1) {
4526                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4527                 talloc_free(h);
4528                 return -1;
4529         }
4530
4531         h->child = ctdb_fork(ctdb);
4532         if (h->child == (pid_t)-1) {
4533                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4534                 close(h->fd[0]);
4535                 close(h->fd[1]);
4536                 talloc_free(h);
4537                 return -1;
4538         }
4539
4540         /* child process */
4541         if (h->child == 0) {
4542                 signed char res = 0;
4543
4544                 close(h->fd[0]);
4545                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4546
4547                 ctdb_set_process_name("ctdb_reloadips");
4548                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4549                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4550                         res = -1;
4551                 } else {
4552                         res = ctdb_reloadips_child(ctdb);
4553                         if (res != 0) {
4554                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4555                         }
4556                 }
4557
4558                 sys_write(h->fd[1], &res, 1);
4559                 /* make sure we die when our parent dies */
4560                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4561                         sleep(5);
4562                 }
4563                 _exit(0);
4564         }
4565
4566         h->c             = talloc_steal(h, c);
4567
4568         close(h->fd[1]);
4569         set_close_on_exec(h->fd[0]);
4570
4571         talloc_set_destructor(h, ctdb_reloadips_destructor);
4572
4573
4574         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4575                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4576                         (void *)h);
4577         tevent_fd_set_auto_close(h->fde);
4578
4579         event_add_timed(ctdb->ev, h,
4580                         timeval_current_ofs(120, 0),
4581                         ctdb_reloadips_timeout_event, h);
4582
4583         /* we reply later */
4584         *async_reply = true;
4585         return 0;
4586 }