ctdb: Use prctl_set_comment from lib/util
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/*
 * Timeout value for takeover operations, built from the
 * takeover_timeout tunable.  Expands to an expression that refers to
 * a variable named 'ctdb', which must be in scope at the point of use.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP scheduling: CTDB_ARP_INTERVAL is the first argument
 * given to timeval_current_ofs() when re-arming the ARP timer, and
 * CTDB_ARP_REPEAT is the number of rounds sent before stopping.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
49
/*
 * Flags consulted by the IP allocation algorithms.
 */
struct ctdb_ipflags {
        bool noiptakeover;      /* presumably: must not take over IPs - confirm */
        bool noiphost;          /* presumably: must not host IPs at all - confirm */
};
55
/*
 * One network interface known to the daemon.  Entries are kept on the
 * doubly linked list rooted at ctdb->ifaces.
 */
struct ctdb_interface {
        struct ctdb_interface *prev;    /* list linkage */
        struct ctdb_interface *next;    /* list linkage */
        const char *name;               /* interface name, compared with strcmp() */
        bool link_up;                   /* only link-up interfaces are chosen for IPs */
        uint32_t references;            /* number of VNNs currently assigned here */
};
62
63 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
64 {
65         if (vnn->iface) {
66                 return vnn->iface->name;
67         }
68
69         return "__none__";
70 }
71
72 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
73 {
74         struct ctdb_interface *i;
75
76         /* Verify that we don't have an entry for this ip yet */
77         for (i=ctdb->ifaces;i;i=i->next) {
78                 if (strcmp(i->name, iface) == 0) {
79                         return 0;
80                 }
81         }
82
83         /* create a new structure for this interface */
84         i = talloc_zero(ctdb, struct ctdb_interface);
85         CTDB_NO_MEMORY_FATAL(ctdb, i);
86         i->name = talloc_strdup(i, iface);
87         CTDB_NO_MEMORY(ctdb, i->name);
88
89         i->link_up = true;
90
91         DLIST_ADD(ctdb->ifaces, i);
92
93         return 0;
94 }
95
96 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
97                                         const char *name)
98 {
99         int n;
100
101         for (n = 0; vnn->ifaces[n] != NULL; n++) {
102                 if (strcmp(name, vnn->ifaces[n]) == 0) {
103                         return true;
104                 }
105         }
106
107         return false;
108 }
109
110 /* If any interfaces now have no possible IPs then delete them.  This
111  * implementation is naive (i.e. simple) rather than clever
112  * (i.e. complex).  Given that this is run on delip and that operation
113  * is rare, this doesn't need to be efficient - it needs to be
114  * foolproof.  One alternative is reference counting, where the logic
115  * is distributed and can, therefore, be broken in multiple places.
116  * Another alternative is to build a red-black tree of interfaces that
117  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
118  * once) and then walking ctdb->ifaces once and deleting those not in
119  * the tree.  Let's go to one of those if the naive implementation
120  * causes problems...  :-)
121  */
122 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
123                                         struct ctdb_vnn *vnn)
124 {
125         struct ctdb_interface *i, *next;
126
127         /* For each interface, check if there's an IP using it. */
128         for (i = ctdb->ifaces; i != NULL; i = next) {
129                 struct ctdb_vnn *tv;
130                 bool found;
131                 next = i->next;
132
133                 /* Only consider interfaces named in the given VNN. */
134                 if (!vnn_has_interface_with_name(vnn, i->name)) {
135                         continue;
136                 }
137
138                 /* Is the "single IP" on this interface? */
139                 if ((ctdb->single_ip_vnn != NULL) &&
140                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
141                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
142                         /* Found, next interface please... */
143                         continue;
144                 }
145                 /* Search for a vnn with this interface. */
146                 found = false;
147                 for (tv=ctdb->vnn; tv; tv=tv->next) {
148                         if (vnn_has_interface_with_name(tv, i->name)) {
149                                 found = true;
150                                 break;
151                         }
152                 }
153
154                 if (!found) {
155                         /* None of the VNNs are using this interface. */
156                         DLIST_REMOVE(ctdb->ifaces, i);
157                         talloc_free(i);
158                 }
159         }
160 }
161
162
163 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
164                                               const char *iface)
165 {
166         struct ctdb_interface *i;
167
168         for (i=ctdb->ifaces;i;i=i->next) {
169                 if (strcmp(i->name, iface) == 0) {
170                         return i;
171                 }
172         }
173
174         return NULL;
175 }
176
177 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178                                                   struct ctdb_vnn *vnn)
179 {
180         int i;
181         struct ctdb_interface *cur = NULL;
182         struct ctdb_interface *best = NULL;
183
184         for (i=0; vnn->ifaces[i]; i++) {
185
186                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
187                 if (cur == NULL) {
188                         continue;
189                 }
190
191                 if (!cur->link_up) {
192                         continue;
193                 }
194
195                 if (best == NULL) {
196                         best = cur;
197                         continue;
198                 }
199
200                 if (cur->references < best->references) {
201                         best = cur;
202                         continue;
203                 }
204         }
205
206         return best;
207 }
208
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210                                      struct ctdb_vnn *vnn)
211 {
212         struct ctdb_interface *best = NULL;
213
214         if (vnn->iface) {
215                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216                                    "still assigned to iface '%s'\n",
217                                    ctdb_addr_to_str(&vnn->public_address),
218                                    ctdb_vnn_iface_string(vnn)));
219                 return 0;
220         }
221
222         best = ctdb_vnn_best_iface(ctdb, vnn);
223         if (best == NULL) {
224                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225                                   "cannot assign to iface any iface\n",
226                                   ctdb_addr_to_str(&vnn->public_address)));
227                 return -1;
228         }
229
230         vnn->iface = best;
231         best->references++;
232         vnn->pnn = ctdb->pnn;
233
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now assigned to iface '%s' refs[%d]\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            best->references));
239         return 0;
240 }
241
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243                                     struct ctdb_vnn *vnn)
244 {
245         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246                            "now unassigned (old iface '%s' refs[%d])\n",
247                            ctdb_addr_to_str(&vnn->public_address),
248                            ctdb_vnn_iface_string(vnn),
249                            vnn->iface?vnn->iface->references:0));
250         if (vnn->iface) {
251                 vnn->iface->references--;
252         }
253         vnn->iface = NULL;
254         if (vnn->pnn == ctdb->pnn) {
255                 vnn->pnn = -1;
256         }
257 }
258
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260                                struct ctdb_vnn *vnn)
261 {
262         int i;
263
264         /* Nodes that are not RUNNING can not host IPs */
265         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
266                 return false;
267         }
268
269         if (vnn->delete_pending) {
270                 return false;
271         }
272
273         if (vnn->iface && vnn->iface->link_up) {
274                 return true;
275         }
276
277         for (i=0; vnn->ifaces[i]; i++) {
278                 struct ctdb_interface *cur;
279
280                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
281                 if (cur == NULL) {
282                         continue;
283                 }
284
285                 if (cur->link_up) {
286                         return true;
287                 }
288         }
289
290         return false;
291 }
292
293 struct ctdb_takeover_arp {
294         struct ctdb_context *ctdb;
295         uint32_t count;
296         ctdb_sock_addr addr;
297         struct ctdb_tcp_array *tcparray;
298         struct ctdb_vnn *vnn;
299 };
300
301
302 /*
303   lists of tcp endpoints
304  */
305 struct ctdb_tcp_list {
306         struct ctdb_tcp_list *prev, *next;
307         struct ctdb_connection connection;
308 };
309
310 /*
311   list of clients to kill on IP release
312  */
313 struct ctdb_client_ip {
314         struct ctdb_client_ip *prev, *next;
315         struct ctdb_context *ctdb;
316         ctdb_sock_addr addr;
317         uint32_t client_id;
318 };
319
320
321 /*
322   send a gratuitous arp
323  */
324 static void ctdb_control_send_arp(struct tevent_context *ev,
325                                   struct tevent_timer *te,
326                                   struct timeval t, void *private_data)
327 {
328         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
329                                                         struct ctdb_takeover_arp);
330         int i, ret;
331         struct ctdb_tcp_array *tcparray;
332         const char *iface = ctdb_vnn_iface_string(arp->vnn);
333
334         ret = ctdb_sys_send_arp(&arp->addr, iface);
335         if (ret != 0) {
336                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
337                                   iface, strerror(errno)));
338         }
339
340         tcparray = arp->tcparray;
341         if (tcparray) {
342                 for (i=0;i<tcparray->num;i++) {
343                         struct ctdb_connection *tcon;
344
345                         tcon = &tcparray->connections[i];
346                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
347                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
348                                 ctdb_addr_to_str(&tcon->src),
349                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
350                         ret = ctdb_sys_send_tcp(
351                                 &tcon->src,
352                                 &tcon->dst,
353                                 0, 0, 0);
354                         if (ret != 0) {
355                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
356                                         ctdb_addr_to_str(&tcon->src)));
357                         }
358                 }
359         }
360
361         arp->count++;
362
363         if (arp->count == CTDB_ARP_REPEAT) {
364                 talloc_free(arp);
365                 return;
366         }
367
368         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
369                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
370                          ctdb_control_send_arp, arp);
371 }
372
373 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
374                                        struct ctdb_vnn *vnn)
375 {
376         struct ctdb_takeover_arp *arp;
377         struct ctdb_tcp_array *tcparray;
378
379         if (!vnn->takeover_ctx) {
380                 vnn->takeover_ctx = talloc_new(vnn);
381                 if (!vnn->takeover_ctx) {
382                         return -1;
383                 }
384         }
385
386         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
387         if (!arp) {
388                 return -1;
389         }
390
391         arp->ctdb = ctdb;
392         arp->addr = vnn->public_address;
393         arp->vnn  = vnn;
394
395         tcparray = vnn->tcp_array;
396         if (tcparray) {
397                 /* add all of the known tcp connections for this IP to the
398                    list of tcp connections to send tickle acks for */
399                 arp->tcparray = talloc_steal(arp, tcparray);
400
401                 vnn->tcp_array = NULL;
402                 vnn->tcp_update_needed = true;
403         }
404
405         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
406                          timeval_zero(), ctdb_control_send_arp, arp);
407
408         return 0;
409 }
410
411 struct takeover_callback_state {
412         struct ctdb_req_control_old *c;
413         ctdb_sock_addr *addr;
414         struct ctdb_vnn *vnn;
415 };
416
/*
 * State handed to ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
        struct ctdb_req_control_old *c;         /* control to reply to */
        struct ctdb_vnn *vnn;                   /* VNN being taken over */
};
421
422 /*
423   called when takeip event finishes
424  */
425 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
426                                     void *private_data)
427 {
428         struct ctdb_do_takeip_state *state =
429                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
430         int32_t ret;
431         TDB_DATA data;
432
433         if (status != 0) {
434                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
435         
436                 if (status == -ETIME) {
437                         ctdb_ban_self(ctdb);
438                 }
439                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
440                                  ctdb_addr_to_str(&state->vnn->public_address),
441                                  ctdb_vnn_iface_string(state->vnn)));
442                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
443
444                 node->flags |= NODE_FLAGS_UNHEALTHY;
445                 talloc_free(state);
446                 return;
447         }
448
449         if (ctdb->do_checkpublicip) {
450
451         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
452         if (ret != 0) {
453                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
454                 talloc_free(state);
455                 return;
456         }
457
458         }
459
460         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
461         data.dsize = strlen((char *)data.dptr) + 1;
462         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
463
464         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
465
466
467         /* the control succeeded */
468         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
469         talloc_free(state);
470         return;
471 }
472
473 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
474 {
475         state->vnn->update_in_flight = false;
476         return 0;
477 }
478
479 /*
480   take over an ip address
481  */
482 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
483                               struct ctdb_req_control_old *c,
484                               struct ctdb_vnn *vnn)
485 {
486         int ret;
487         struct ctdb_do_takeip_state *state;
488
489         if (vnn->update_in_flight) {
490                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
491                                     "update for this IP already in flight\n",
492                                     ctdb_addr_to_str(&vnn->public_address),
493                                     vnn->public_netmask_bits));
494                 return -1;
495         }
496
497         ret = ctdb_vnn_assign_iface(ctdb, vnn);
498         if (ret != 0) {
499                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
500                                  "assign a usable interface\n",
501                                  ctdb_addr_to_str(&vnn->public_address),
502                                  vnn->public_netmask_bits));
503                 return -1;
504         }
505
506         state = talloc(vnn, struct ctdb_do_takeip_state);
507         CTDB_NO_MEMORY(ctdb, state);
508
509         state->c = talloc_steal(ctdb, c);
510         state->vnn   = vnn;
511
512         vnn->update_in_flight = true;
513         talloc_set_destructor(state, ctdb_takeip_destructor);
514
515         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
516                             ctdb_addr_to_str(&vnn->public_address),
517                             vnn->public_netmask_bits,
518                             ctdb_vnn_iface_string(vnn)));
519
520         ret = ctdb_event_script_callback(ctdb,
521                                          state,
522                                          ctdb_do_takeip_callback,
523                                          state,
524                                          CTDB_EVENT_TAKE_IP,
525                                          "%s %s %u",
526                                          ctdb_vnn_iface_string(vnn),
527                                          ctdb_addr_to_str(&vnn->public_address),
528                                          vnn->public_netmask_bits);
529
530         if (ret != 0) {
531                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
532                         ctdb_addr_to_str(&vnn->public_address),
533                         ctdb_vnn_iface_string(vnn)));
534                 talloc_free(state);
535                 return -1;
536         }
537
538         return 0;
539 }
540
/*
 * State handed to ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
        struct ctdb_req_control_old *c;         /* control to reply to */
        struct ctdb_interface *old;             /* interface the IP is moving away from */
        struct ctdb_vnn *vnn;                   /* VNN being moved */
};
546
547 /*
548   called when updateip event finishes
549  */
550 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
551                                       void *private_data)
552 {
553         struct ctdb_do_updateip_state *state =
554                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
555         int32_t ret;
556
557         if (status != 0) {
558                 if (status == -ETIME) {
559                         ctdb_ban_self(ctdb);
560                 }
561                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
562                         ctdb_addr_to_str(&state->vnn->public_address),
563                         state->old->name,
564                         ctdb_vnn_iface_string(state->vnn)));
565
566                 /*
567                  * All we can do is reset the old interface
568                  * and let the next run fix it
569                  */
570                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
571                 state->vnn->iface = state->old;
572                 state->vnn->iface->references++;
573
574                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
575                 talloc_free(state);
576                 return;
577         }
578
579         if (ctdb->do_checkpublicip) {
580
581         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
582         if (ret != 0) {
583                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
584                 talloc_free(state);
585                 return;
586         }
587
588         }
589
590         /* the control succeeded */
591         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
592         talloc_free(state);
593         return;
594 }
595
596 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
597 {
598         state->vnn->update_in_flight = false;
599         return 0;
600 }
601
602 /*
603   update (move) an ip address
604  */
605 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
606                                 struct ctdb_req_control_old *c,
607                                 struct ctdb_vnn *vnn)
608 {
609         int ret;
610         struct ctdb_do_updateip_state *state;
611         struct ctdb_interface *old = vnn->iface;
612         const char *new_name;
613
614         if (vnn->update_in_flight) {
615                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
616                                     "update for this IP already in flight\n",
617                                     ctdb_addr_to_str(&vnn->public_address),
618                                     vnn->public_netmask_bits));
619                 return -1;
620         }
621
622         ctdb_vnn_unassign_iface(ctdb, vnn);
623         ret = ctdb_vnn_assign_iface(ctdb, vnn);
624         if (ret != 0) {
625                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
626                                  "assin a usable interface (old iface '%s')\n",
627                                  ctdb_addr_to_str(&vnn->public_address),
628                                  vnn->public_netmask_bits,
629                                  old->name));
630                 return -1;
631         }
632
633         new_name = ctdb_vnn_iface_string(vnn);
634         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
635                 /* A benign update from one interface onto itself.
636                  * no need to run the eventscripts in this case, just return
637                  * success.
638                  */
639                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
640                 return 0;
641         }
642
643         state = talloc(vnn, struct ctdb_do_updateip_state);
644         CTDB_NO_MEMORY(ctdb, state);
645
646         state->c = talloc_steal(ctdb, c);
647         state->old = old;
648         state->vnn = vnn;
649
650         vnn->update_in_flight = true;
651         talloc_set_destructor(state, ctdb_updateip_destructor);
652
653         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
654                             "interface %s to %s\n",
655                             ctdb_addr_to_str(&vnn->public_address),
656                             vnn->public_netmask_bits,
657                             old->name,
658                             new_name));
659
660         ret = ctdb_event_script_callback(ctdb,
661                                          state,
662                                          ctdb_do_updateip_callback,
663                                          state,
664                                          CTDB_EVENT_UPDATE_IP,
665                                          "%s %s %s %u",
666                                          state->old->name,
667                                          new_name,
668                                          ctdb_addr_to_str(&vnn->public_address),
669                                          vnn->public_netmask_bits);
670         if (ret != 0) {
671                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
672                                  ctdb_addr_to_str(&vnn->public_address),
673                                  old->name, new_name));
674                 talloc_free(state);
675                 return -1;
676         }
677
678         return 0;
679 }
680
681 /*
682   Find the vnn of the node that has a public ip address
683   returns -1 if the address is not known as a public address
684  */
685 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
686 {
687         struct ctdb_vnn *vnn;
688
689         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
690                 if (ctdb_same_ip(&vnn->public_address, addr)) {
691                         return vnn;
692                 }
693         }
694
695         return NULL;
696 }
697
698 /*
699   take over an ip address
700  */
701 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
702                                  struct ctdb_req_control_old *c,
703                                  TDB_DATA indata,
704                                  bool *async_reply)
705 {
706         int ret;
707         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
708         struct ctdb_vnn *vnn;
709         bool have_ip = false;
710         bool do_updateip = false;
711         bool do_takeip = false;
712         struct ctdb_interface *best_iface = NULL;
713
714         if (pip->pnn != ctdb->pnn) {
715                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
716                                  "with pnn %d, but we're node %d\n",
717                                  ctdb_addr_to_str(&pip->addr),
718                                  pip->pnn, ctdb->pnn));
719                 return -1;
720         }
721
722         /* update out vnn list */
723         vnn = find_public_ip_vnn(ctdb, &pip->addr);
724         if (vnn == NULL) {
725                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
726                         ctdb_addr_to_str(&pip->addr)));
727                 return 0;
728         }
729
730         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
731                 have_ip = ctdb_sys_have_ip(&pip->addr);
732         }
733         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
734         if (best_iface == NULL) {
735                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
736                                  "a usable interface (old %s, have_ip %d)\n",
737                                  ctdb_addr_to_str(&vnn->public_address),
738                                  vnn->public_netmask_bits,
739                                  ctdb_vnn_iface_string(vnn),
740                                  have_ip));
741                 return -1;
742         }
743
744         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
745                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
746                 have_ip = false;
747         }
748
749
750         if (vnn->iface == NULL && have_ip) {
751                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
753                                  ctdb_addr_to_str(&vnn->public_address)));
754                 return 0;
755         }
756
757         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
758                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
759                                   "and we have it on iface[%s], but it was assigned to node %d"
760                                   "and we are node %d, banning ourself\n",
761                                  ctdb_addr_to_str(&vnn->public_address),
762                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
763                 ctdb_ban_self(ctdb);
764                 return -1;
765         }
766
767         if (vnn->pnn == -1 && have_ip) {
768                 vnn->pnn = ctdb->pnn;
769                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
770                                   "and we already have it on iface[%s], update local daemon\n",
771                                  ctdb_addr_to_str(&vnn->public_address),
772                                   ctdb_vnn_iface_string(vnn)));
773                 return 0;
774         }
775
776         if (vnn->iface) {
777                 if (vnn->iface != best_iface) {
778                         if (!vnn->iface->link_up) {
779                                 do_updateip = true;
780                         } else if (vnn->iface->references > (best_iface->references + 1)) {
781                                 /* only move when the rebalance gains something */
782                                         do_updateip = true;
783                         }
784                 }
785         }
786
787         if (!have_ip) {
788                 if (do_updateip) {
789                         ctdb_vnn_unassign_iface(ctdb, vnn);
790                         do_updateip = false;
791                 }
792                 do_takeip = true;
793         }
794
795         if (do_takeip) {
796                 ret = ctdb_do_takeip(ctdb, c, vnn);
797                 if (ret != 0) {
798                         return -1;
799                 }
800         } else if (do_updateip) {
801                 ret = ctdb_do_updateip(ctdb, c, vnn);
802                 if (ret != 0) {
803                         return -1;
804                 }
805         } else {
806                 /*
807                  * The interface is up and the kernel known the ip
808                  * => do nothing
809                  */
810                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
811                         ctdb_addr_to_str(&pip->addr),
812                         vnn->public_netmask_bits,
813                         ctdb_vnn_iface_string(vnn)));
814                 return 0;
815         }
816
817         /* tell ctdb_control.c that we will be replying asynchronously */
818         *async_reply = true;
819
820         return 0;
821 }
822
823 /*
824   kill any clients that are registered with a IP that is being released
825  */
826 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
827 {
828         struct ctdb_client_ip *ip;
829
830         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
831                 ctdb_addr_to_str(addr)));
832
833         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
834                 ctdb_sock_addr tmp_addr;
835
836                 tmp_addr = ip->addr;
837                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
838                         ip->client_id,
839                         ctdb_addr_to_str(&ip->addr)));
840
841                 if (ctdb_same_ip(&tmp_addr, addr)) {
842                         struct ctdb_client *client = reqid_find(ctdb->idr,
843                                                                 ip->client_id,
844                                                                 struct ctdb_client);
845                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
846                                 ip->client_id,
847                                 ctdb_addr_to_str(&ip->addr),
848                                 client->pid));
849
850                         if (client->pid != 0) {
851                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
852                                         (unsigned)client->pid,
853                                         ctdb_addr_to_str(addr),
854                                         ip->client_id));
855                                 kill(client->pid, SIGKILL);
856                         }
857                 }
858         }
859 }
860
861 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
862 {
863         DLIST_REMOVE(ctdb->vnn, vnn);
864         ctdb_vnn_unassign_iface(ctdb, vnn);
865         ctdb_remove_orphaned_ifaces(ctdb, vnn);
866         talloc_free(vnn);
867 }
868
869 /*
870   called when releaseip event finishes
871  */
872 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
873                                 void *private_data)
874 {
875         struct takeover_callback_state *state = 
876                 talloc_get_type(private_data, struct takeover_callback_state);
877         TDB_DATA data;
878
879         if (status == -ETIME) {
880                 ctdb_ban_self(ctdb);
881         }
882
883         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
884                 if  (ctdb_sys_have_ip(state->addr)) {
885                         DEBUG(DEBUG_ERR,
886                               ("IP %s still hosted during release IP callback, failing\n",
887                                ctdb_addr_to_str(state->addr)));
888                         ctdb_request_control_reply(ctdb, state->c,
889                                                    NULL, -1, NULL);
890                         talloc_free(state);
891                         return;
892                 }
893         }
894
895         /* send a message to all clients of this node telling them
896            that the cluster has been reconfigured and they should
897            release any sockets on this IP */
898         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
899         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
900         data.dsize = strlen((char *)data.dptr)+1;
901
902         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
903
904         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
905
906         /* kill clients that have registered with this IP */
907         release_kill_clients(ctdb, state->addr);
908
909         ctdb_vnn_unassign_iface(ctdb, state->vnn);
910
911         /* Process the IP if it has been marked for deletion */
912         if (state->vnn->delete_pending) {
913                 do_delete_ip(ctdb, state->vnn);
914                 state->vnn = NULL;
915         }
916
917         /* the control succeeded */
918         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
919         talloc_free(state);
920 }
921
922 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
923 {
924         if (state->vnn != NULL) {
925                 state->vnn->update_in_flight = false;
926         }
927         return 0;
928 }
929
930 /*
931   release an ip address
932  */
933 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
934                                 struct ctdb_req_control_old *c,
935                                 TDB_DATA indata, 
936                                 bool *async_reply)
937 {
938         int ret;
939         struct takeover_callback_state *state;
940         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
941         struct ctdb_vnn *vnn;
942         char *iface;
943
944         /* update our vnn list */
945         vnn = find_public_ip_vnn(ctdb, &pip->addr);
946         if (vnn == NULL) {
947                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
948                         ctdb_addr_to_str(&pip->addr)));
949                 return 0;
950         }
951         vnn->pnn = pip->pnn;
952
953         /* stop any previous arps */
954         talloc_free(vnn->takeover_ctx);
955         vnn->takeover_ctx = NULL;
956
957         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
958          * lazy multicast to drop an IP from any node that isn't the
959          * intended new node.  The following causes makes ctdbd ignore
960          * a release for any address it doesn't host.
961          */
962         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
963                 if (!ctdb_sys_have_ip(&pip->addr)) {
964                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
965                                 ctdb_addr_to_str(&pip->addr),
966                                 vnn->public_netmask_bits,
967                                 ctdb_vnn_iface_string(vnn)));
968                         ctdb_vnn_unassign_iface(ctdb, vnn);
969                         return 0;
970                 }
971         } else {
972                 if (vnn->iface == NULL) {
973                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
974                                            ctdb_addr_to_str(&pip->addr),
975                                            vnn->public_netmask_bits));
976                         return 0;
977                 }
978         }
979
980         /* There is a potential race between take_ip and us because we
981          * update the VNN via a callback that run when the
982          * eventscripts have been run.  Avoid the race by allowing one
983          * update to be in flight at a time.
984          */
985         if (vnn->update_in_flight) {
986                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
987                                     "update for this IP already in flight\n",
988                                     ctdb_addr_to_str(&vnn->public_address),
989                                     vnn->public_netmask_bits));
990                 return -1;
991         }
992
993         iface = strdup(ctdb_vnn_iface_string(vnn));
994
995         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
996                 ctdb_addr_to_str(&pip->addr),
997                 vnn->public_netmask_bits,
998                 iface,
999                 pip->pnn));
1000
1001         state = talloc(ctdb, struct takeover_callback_state);
1002         if (state == NULL) {
1003                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1004                                __FILE__, __LINE__);
1005                 free(iface);
1006                 return -1;
1007         }
1008
1009         state->c = talloc_steal(state, c);
1010         state->addr = talloc(state, ctdb_sock_addr);       
1011         if (state->addr == NULL) {
1012                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1013                                __FILE__, __LINE__);
1014                 free(iface);
1015                 talloc_free(state);
1016                 return -1;
1017         }
1018         *state->addr = pip->addr;
1019         state->vnn   = vnn;
1020
1021         vnn->update_in_flight = true;
1022         talloc_set_destructor(state, ctdb_releaseip_destructor);
1023
1024         ret = ctdb_event_script_callback(ctdb, 
1025                                          state, release_ip_callback, state,
1026                                          CTDB_EVENT_RELEASE_IP,
1027                                          "%s %s %u",
1028                                          iface,
1029                                          ctdb_addr_to_str(&pip->addr),
1030                                          vnn->public_netmask_bits);
1031         free(iface);
1032         if (ret != 0) {
1033                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1034                         ctdb_addr_to_str(&pip->addr),
1035                         ctdb_vnn_iface_string(vnn)));
1036                 talloc_free(state);
1037                 return -1;
1038         }
1039
1040         /* tell the control that we will be reply asynchronously */
1041         *async_reply = true;
1042         return 0;
1043 }
1044
1045 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1046                                    ctdb_sock_addr *addr,
1047                                    unsigned mask, const char *ifaces,
1048                                    bool check_address)
1049 {
1050         struct ctdb_vnn      *vnn;
1051         uint32_t num = 0;
1052         char *tmp;
1053         const char *iface;
1054         int i;
1055         int ret;
1056
1057         tmp = strdup(ifaces);
1058         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1059                 if (!ctdb_sys_check_iface_exists(iface)) {
1060                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1061                         free(tmp);
1062                         return -1;
1063                 }
1064         }
1065         free(tmp);
1066
1067         /* Verify that we don't have an entry for this ip yet */
1068         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1069                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1070                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1071                                 ctdb_addr_to_str(addr)));
1072                         return -1;
1073                 }               
1074         }
1075
1076         /* create a new vnn structure for this ip address */
1077         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1078         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1079         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1080         tmp = talloc_strdup(vnn, ifaces);
1081         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1082         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1083                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1084                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1085                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1086                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1087                 num++;
1088         }
1089         talloc_free(tmp);
1090         vnn->ifaces[num] = NULL;
1091         vnn->public_address      = *addr;
1092         vnn->public_netmask_bits = mask;
1093         vnn->pnn                 = -1;
1094         if (check_address) {
1095                 if (ctdb_sys_have_ip(addr)) {
1096                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1097                         vnn->pnn = ctdb->pnn;
1098                 }
1099         }
1100
1101         for (i=0; vnn->ifaces[i]; i++) {
1102                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1103                 if (ret != 0) {
1104                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1105                                            "for public_address[%s]\n",
1106                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1107                         talloc_free(vnn);
1108                         return -1;
1109                 }
1110         }
1111
1112         DLIST_ADD(ctdb->vnn, vnn);
1113
1114         return 0;
1115 }
1116
1117 /*
1118   setup the public address lists from a file
1119 */
1120 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1121 {
1122         char **lines;
1123         int nlines;
1124         int i;
1125
1126         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1127         if (lines == NULL) {
1128                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1129                 return -1;
1130         }
1131         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1132                 nlines--;
1133         }
1134
1135         for (i=0;i<nlines;i++) {
1136                 unsigned mask;
1137                 ctdb_sock_addr addr;
1138                 const char *addrstr;
1139                 const char *ifaces;
1140                 char *tok, *line;
1141
1142                 line = lines[i];
1143                 while ((*line == ' ') || (*line == '\t')) {
1144                         line++;
1145                 }
1146                 if (*line == '#') {
1147                         continue;
1148                 }
1149                 if (strcmp(line, "") == 0) {
1150                         continue;
1151                 }
1152                 tok = strtok(line, " \t");
1153                 addrstr = tok;
1154                 tok = strtok(NULL, " \t");
1155                 if (tok == NULL) {
1156                         if (NULL == ctdb->default_public_interface) {
1157                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1158                                          i+1));
1159                                 talloc_free(lines);
1160                                 return -1;
1161                         }
1162                         ifaces = ctdb->default_public_interface;
1163                 } else {
1164                         ifaces = tok;
1165                 }
1166
1167                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1168                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1169                         talloc_free(lines);
1170                         return -1;
1171                 }
1172                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1173                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1174                         talloc_free(lines);
1175                         return -1;
1176                 }
1177         }
1178
1179
1180         talloc_free(lines);
1181         return 0;
1182 }
1183
1184 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1185                               const char *iface,
1186                               const char *ip)
1187 {
1188         struct ctdb_vnn *svnn;
1189         struct ctdb_interface *cur = NULL;
1190         bool ok;
1191         int ret;
1192
1193         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1194         CTDB_NO_MEMORY(ctdb, svnn);
1195
1196         svnn->ifaces = talloc_array(svnn, const char *, 2);
1197         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1198         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1199         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1200         svnn->ifaces[1] = NULL;
1201
1202         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1203         if (!ok) {
1204                 talloc_free(svnn);
1205                 return -1;
1206         }
1207
1208         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1209         if (ret != 0) {
1210                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1211                                    "for single_ip[%s]\n",
1212                                    svnn->ifaces[0],
1213                                    ctdb_addr_to_str(&svnn->public_address)));
1214                 talloc_free(svnn);
1215                 return -1;
1216         }
1217
1218         /* assume the single public ip interface is initially "good" */
1219         cur = ctdb_find_iface(ctdb, iface);
1220         if (cur == NULL) {
1221                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1222                 return -1;
1223         }
1224         cur->link_up = true;
1225
1226         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1227         if (ret != 0) {
1228                 talloc_free(svnn);
1229                 return -1;
1230         }
1231
1232         ctdb->single_ip_vnn = svnn;
1233         return 0;
1234 }
1235
1236 struct public_ip_list {
1237         struct public_ip_list *next;
1238         uint32_t pnn;
1239         ctdb_sock_addr addr;
1240 };
1241
1242 /* Given a physical node, return the number of
1243    public addresses that is currently assigned to this node.
1244 */
1245 static int node_ip_coverage(struct ctdb_context *ctdb, int32_t pnn,
1246                             struct public_ip_list *ips)
1247 {
1248         int num=0;
1249
1250         for (;ips;ips=ips->next) {
1251                 if (ips->pnn == pnn) {
1252                         num++;
1253                 }
1254         }
1255         return num;
1256 }
1257
1258
1259 /* Can the given node host the given IP: is the public IP known to the
1260  * node and is NOIPHOST unset?
1261 */
1262 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1263                              struct ctdb_ipflags ipflags,
1264                              struct public_ip_list *ip)
1265 {
1266         struct ctdb_public_ip_list_old *public_ips;
1267         int i;
1268
1269         if (ipflags.noiphost) {
1270                 return false;
1271         }
1272
1273         public_ips = ctdb->nodes[pnn]->available_public_ips;
1274
1275         if (public_ips == NULL) {
1276                 return false;
1277         }
1278
1279         for (i=0; i<public_ips->num; i++) {
1280                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1281                         /* yes, this node can serve this public ip */
1282                         return true;
1283                 }
1284         }
1285
1286         return false;
1287 }
1288
1289 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1290                                  struct ctdb_ipflags ipflags,
1291                                  struct public_ip_list *ip)
1292 {
1293         if (ipflags.noiptakeover) {
1294                 return false;
1295         }
1296
1297         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1298 }
1299
1300 /* search the node lists list for a node to takeover this ip.
1301    pick the node that currently are serving the least number of ips
1302    so that the ips get spread out evenly.
1303 */
1304 static int find_takeover_node(struct ctdb_context *ctdb,
1305                               struct ctdb_ipflags *ipflags,
1306                               struct public_ip_list *ip,
1307                               struct public_ip_list *all_ips)
1308 {
1309         int pnn, min=0, num;
1310         int i, numnodes;
1311
1312         numnodes = talloc_array_length(ipflags);
1313         pnn    = -1;
1314         for (i=0; i<numnodes; i++) {
1315                 /* verify that this node can serve this ip */
1316                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1317                         /* no it couldnt   so skip to the next node */
1318                         continue;
1319                 }
1320
1321                 num = node_ip_coverage(ctdb, i, all_ips);
1322                 /* was this the first node we checked ? */
1323                 if (pnn == -1) {
1324                         pnn = i;
1325                         min  = num;
1326                 } else {
1327                         if (num < min) {
1328                                 pnn = i;
1329                                 min  = num;
1330                         }
1331                 }
1332         }       
1333         if (pnn == -1) {
1334                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1335                         ctdb_addr_to_str(&ip->addr)));
1336
1337                 return -1;
1338         }
1339
1340         ip->pnn = pnn;
1341         return 0;
1342 }
1343
1344 #define IP_KEYLEN       4
1345 static uint32_t *ip_key(ctdb_sock_addr *ip)
1346 {
1347         static uint32_t key[IP_KEYLEN];
1348
1349         bzero(key, sizeof(key));
1350
1351         switch (ip->sa.sa_family) {
1352         case AF_INET:
1353                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1354                 break;
1355         case AF_INET6: {
1356                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1357                 key[0]  = htonl(s6_a32[0]);
1358                 key[1]  = htonl(s6_a32[1]);
1359                 key[2]  = htonl(s6_a32[2]);
1360                 key[3]  = htonl(s6_a32[3]);
1361                 break;
1362         }
1363         default:
1364                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1365                 return key;
1366         }
1367
1368         return key;
1369 }
1370
1371 static void *add_ip_callback(void *parm, void *data)
1372 {
1373         struct public_ip_list *this_ip = parm;
1374         struct public_ip_list *prev_ip = data;
1375
1376         if (prev_ip == NULL) {
1377                 return parm;
1378         }
1379         if (this_ip->pnn == -1) {
1380                 this_ip->pnn = prev_ip->pnn;
1381         }
1382
1383         return parm;
1384 }
1385
1386 static int getips_count_callback(void *param, void *data)
1387 {
1388         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1389         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1390
1391         new_ip->next = *ip_list;
1392         *ip_list     = new_ip;
1393         return 0;
1394 }
1395
1396 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1397                                        struct ctdb_public_ip_list_old *ips,
1398                                        uint32_t pnn);
1399
1400 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1401                                          struct ctdb_node_map_old *nodemap)
1402 {
1403         int j;
1404         int ret;
1405
1406         if (ctdb->num_nodes != nodemap->num) {
1407                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1408                                   ctdb->num_nodes, nodemap->num));
1409                 return -1;
1410         }
1411
1412         for (j=0; j<nodemap->num; j++) {
1413                 /* For readability */
1414                 struct ctdb_node *node = ctdb->nodes[j];
1415
1416                 /* release any existing data */
1417                 TALLOC_FREE(node->known_public_ips);
1418                 TALLOC_FREE(node->available_public_ips);
1419
1420                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1421                         continue;
1422                 }
1423
1424                 /* Retrieve the list of known public IPs from the node */
1425                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1426                                         TAKEOVER_TIMEOUT(),
1427                                         node->pnn,
1428                                         ctdb->nodes,
1429                                         0,
1430                                         &node->known_public_ips);
1431                 if (ret != 0) {
1432                         DEBUG(DEBUG_ERR,
1433                               ("Failed to read known public IPs from node: %u\n",
1434                                node->pnn));
1435                         return -1;
1436                 }
1437
1438                 if (ctdb->do_checkpublicip) {
1439                         verify_remote_ip_allocation(ctdb,
1440                                                     node->known_public_ips,
1441                                                     node->pnn);
1442                 }
1443
1444                 /* Retrieve the list of available public IPs from the node */
1445                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1446                                         TAKEOVER_TIMEOUT(),
1447                                         node->pnn,
1448                                         ctdb->nodes,
1449                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1450                                         &node->available_public_ips);
1451                 if (ret != 0) {
1452                         DEBUG(DEBUG_ERR,
1453                               ("Failed to read available public IPs from node: %u\n",
1454                                node->pnn));
1455                         return -1;
1456                 }
1457         }
1458
1459         return 0;
1460 }
1461
1462 static struct public_ip_list *
1463 create_merged_ip_list(struct ctdb_context *ctdb)
1464 {
1465         int i, j;
1466         struct public_ip_list *ip_list;
1467         struct ctdb_public_ip_list_old *public_ips;
1468
1469         if (ctdb->ip_tree != NULL) {
1470                 talloc_free(ctdb->ip_tree);
1471                 ctdb->ip_tree = NULL;
1472         }
1473         ctdb->ip_tree = trbt_create(ctdb, 0);
1474
1475         for (i=0;i<ctdb->num_nodes;i++) {
1476                 public_ips = ctdb->nodes[i]->known_public_ips;
1477
1478                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1479                         continue;
1480                 }
1481
1482                 /* there were no public ips for this node */
1483                 if (public_ips == NULL) {
1484                         continue;
1485                 }               
1486
1487                 for (j=0;j<public_ips->num;j++) {
1488                         struct public_ip_list *tmp_ip;
1489
1490                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1491                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1492                         /* Do not use information about IP addresses hosted
1493                          * on other nodes, it may not be accurate */
1494                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1495                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1496                         } else {
1497                                 tmp_ip->pnn = -1;
1498                         }
1499                         tmp_ip->addr = public_ips->ips[j].addr;
1500                         tmp_ip->next = NULL;
1501
1502                         trbt_insertarray32_callback(ctdb->ip_tree,
1503                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1504                                 add_ip_callback,
1505                                 tmp_ip);
1506                 }
1507         }
1508
1509         ip_list = NULL;
1510         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1511
1512         return ip_list;
1513 }
1514
1515 /* 
1516  * This is the length of the longtest common prefix between the IPs.
1517  * It is calculated by XOR-ing the 2 IPs together and counting the
1518  * number of leading zeroes.  The implementation means that all
1519  * addresses end up being 128 bits long.
1520  *
1521  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1522  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1523  * lots of nodes and IP addresses?
1524  */
1525 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1526 {
1527         uint32_t ip1_k[IP_KEYLEN];
1528         uint32_t *t;
1529         int i;
1530         uint32_t x;
1531
1532         uint32_t distance = 0;
1533
1534         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1535         t = ip_key(ip2);
1536         for (i=0; i<IP_KEYLEN; i++) {
1537                 x = ip1_k[i] ^ t[i];
1538                 if (x == 0) {
1539                         distance += 32;
1540                 } else {
1541                         /* Count number of leading zeroes. 
1542                          * FIXME? This could be optimised...
1543                          */
1544                         while ((x & (1 << 31)) == 0) {
1545                                 x <<= 1;
1546                                 distance += 1;
1547                         }
1548                 }
1549         }
1550
1551         return distance;
1552 }
1553
1554 /* Calculate the IP distance for the given IP relative to IPs on the
1555    given node.  The ips argument is generally the all_ips variable
1556    used in the main part of the algorithm.
1557  */
1558 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1559                                   struct public_ip_list *ips,
1560                                   int pnn)
1561 {
1562         struct public_ip_list *t;
1563         uint32_t d;
1564
1565         uint32_t sum = 0;
1566
1567         for (t=ips; t != NULL; t=t->next) {
1568                 if (t->pnn != pnn) {
1569                         continue;
1570                 }
1571
1572                 /* Optimisation: We never calculate the distance
1573                  * between an address and itself.  This allows us to
1574                  * calculate the effect of removing an address from a
1575                  * node by simply calculating the distance between
1576                  * that address and all of the exitsing addresses.
1577                  * Moreover, we assume that we're only ever dealing
1578                  * with addresses from all_ips so we can identify an
1579                  * address via a pointer rather than doing a more
1580                  * expensive address comparison. */
1581                 if (&(t->addr) == ip) {
1582                         continue;
1583                 }
1584
1585                 d = ip_distance(ip, &(t->addr));
1586                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1587         }
1588
1589         return sum;
1590 }
1591
1592 /* Return the LCP2 imbalance metric for addresses currently assigned
1593    to the given node.
1594  */
1595 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1596 {
1597         struct public_ip_list *t;
1598
1599         uint32_t imbalance = 0;
1600
1601         for (t=all_ips; t!=NULL; t=t->next) {
1602                 if (t->pnn != pnn) {
1603                         continue;
1604                 }
1605                 /* Pass the rest of the IPs rather than the whole
1606                    all_ips input list.
1607                 */
1608                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1609         }
1610
1611         return imbalance;
1612 }
1613
1614 /* Allocate any unassigned IPs just by looping through the IPs and
1615  * finding the best node for each.
1616  */
1617 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1618                                       struct ctdb_ipflags *ipflags,
1619                                       struct public_ip_list *all_ips)
1620 {
1621         struct public_ip_list *tmp_ip;
1622
1623         /* loop over all ip's and find a physical node to cover for 
1624            each unassigned ip.
1625         */
1626         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1627                 if (tmp_ip->pnn == -1) {
1628                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1629                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1630                                         ctdb_addr_to_str(&tmp_ip->addr)));
1631                         }
1632                 }
1633         }
1634 }
1635
1636 /* Basic non-deterministic rebalancing algorithm.
1637  */
1638 static void basic_failback(struct ctdb_context *ctdb,
1639                            struct ctdb_ipflags *ipflags,
1640                            struct public_ip_list *all_ips,
1641                            int num_ips)
1642 {
1643         int i, numnodes;
1644         int maxnode, maxnum, minnode, minnum, num, retries;
1645         struct public_ip_list *tmp_ip;
1646
1647         numnodes = talloc_array_length(ipflags);
1648         retries = 0;
1649
1650 try_again:
1651         maxnum=0;
1652         minnum=0;
1653
1654         /* for each ip address, loop over all nodes that can serve
1655            this ip and make sure that the difference between the node
1656            serving the most and the node serving the least ip's are
1657            not greater than 1.
1658         */
1659         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1660                 if (tmp_ip->pnn == -1) {
1661                         continue;
1662                 }
1663
1664                 /* Get the highest and lowest number of ips's served by any 
1665                    valid node which can serve this ip.
1666                 */
1667                 maxnode = -1;
1668                 minnode = -1;
1669                 for (i=0; i<numnodes; i++) {
1670                         /* only check nodes that can actually serve this ip */
1671                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1672                                 /* no it couldnt   so skip to the next node */
1673                                 continue;
1674                         }
1675
1676                         num = node_ip_coverage(ctdb, i, all_ips);
1677                         if (maxnode == -1) {
1678                                 maxnode = i;
1679                                 maxnum  = num;
1680                         } else {
1681                                 if (num > maxnum) {
1682                                         maxnode = i;
1683                                         maxnum  = num;
1684                                 }
1685                         }
1686                         if (minnode == -1) {
1687                                 minnode = i;
1688                                 minnum  = num;
1689                         } else {
1690                                 if (num < minnum) {
1691                                         minnode = i;
1692                                         minnum  = num;
1693                                 }
1694                         }
1695                 }
1696                 if (maxnode == -1) {
1697                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1698                                 ctdb_addr_to_str(&tmp_ip->addr)));
1699
1700                         continue;
1701                 }
1702
1703                 /* if the spread between the smallest and largest coverage by
1704                    a node is >=2 we steal one of the ips from the node with
1705                    most coverage to even things out a bit.
1706                    try to do this a limited number of times since we dont
1707                    want to spend too much time balancing the ip coverage.
1708                 */
1709                 if ( (maxnum > minnum+1)
1710                      && (retries < (num_ips + 5)) ){
1711                         struct public_ip_list *tmp;
1712
1713                         /* Reassign one of maxnode's VNNs */
1714                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1715                                 if (tmp->pnn == maxnode) {
1716                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1717                                         retries++;
1718                                         goto try_again;;
1719                                 }
1720                         }
1721                 }
1722         }
1723 }
1724
1725 static void lcp2_init(struct ctdb_context *tmp_ctx,
1726                       struct ctdb_ipflags *ipflags,
1727                       struct public_ip_list *all_ips,
1728                       uint32_t *force_rebalance_nodes,
1729                       uint32_t **lcp2_imbalances,
1730                       bool **rebalance_candidates)
1731 {
1732         int i, numnodes;
1733         struct public_ip_list *tmp_ip;
1734
1735         numnodes = talloc_array_length(ipflags);
1736
1737         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1738         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1739         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1740         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1741
1742         for (i=0; i<numnodes; i++) {
1743                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1744                 /* First step: assume all nodes are candidates */
1745                 (*rebalance_candidates)[i] = true;
1746         }
1747
1748         /* 2nd step: if a node has IPs assigned then it must have been
1749          * healthy before, so we remove it from consideration.  This
1750          * is overkill but is all we have because we don't maintain
1751          * state between takeover runs.  An alternative would be to
1752          * keep state and invalidate it every time the recovery master
1753          * changes.
1754          */
1755         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1756                 if (tmp_ip->pnn != -1) {
1757                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1758                 }
1759         }
1760
1761         /* 3rd step: if a node is forced to re-balance then
1762            we allow failback onto the node */
1763         if (force_rebalance_nodes == NULL) {
1764                 return;
1765         }
1766         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1767                 uint32_t pnn = force_rebalance_nodes[i];
1768                 if (pnn >= numnodes) {
1769                         DEBUG(DEBUG_ERR,
1770                               (__location__ "unknown node %u\n", pnn));
1771                         continue;
1772                 }
1773
1774                 DEBUG(DEBUG_NOTICE,
1775                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1776                 (*rebalance_candidates)[pnn] = true;
1777         }
1778 }
1779
1780 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1781  * the IP/node combination that will cost the least.
1782  */
1783 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1784                                      struct ctdb_ipflags *ipflags,
1785                                      struct public_ip_list *all_ips,
1786                                      uint32_t *lcp2_imbalances)
1787 {
1788         struct public_ip_list *tmp_ip;
1789         int dstnode, numnodes;
1790
1791         int minnode;
1792         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1793         struct public_ip_list *minip;
1794
1795         bool should_loop = true;
1796         bool have_unassigned = true;
1797
1798         numnodes = talloc_array_length(ipflags);
1799
1800         while (have_unassigned && should_loop) {
1801                 should_loop = false;
1802
1803                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1804                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1805
1806                 minnode = -1;
1807                 mindsum = 0;
1808                 minip = NULL;
1809
1810                 /* loop over each unassigned ip. */
1811                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1812                         if (tmp_ip->pnn != -1) {
1813                                 continue;
1814                         }
1815
1816                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1817                                 /* only check nodes that can actually takeover this ip */
1818                                 if (!can_node_takeover_ip(ctdb, dstnode,
1819                                                           ipflags[dstnode],
1820                                                           tmp_ip)) {
1821                                         /* no it couldnt   so skip to the next node */
1822                                         continue;
1823                                 }
1824
1825                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1826                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1827                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1828                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1829                                                    dstnode,
1830                                                    dstimbl - lcp2_imbalances[dstnode]));
1831
1832
1833                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1834                                         minnode = dstnode;
1835                                         minimbl = dstimbl;
1836                                         mindsum = dstdsum;
1837                                         minip = tmp_ip;
1838                                         should_loop = true;
1839                                 }
1840                         }
1841                 }
1842
1843                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1844
1845                 /* If we found one then assign it to the given node. */
1846                 if (minnode != -1) {
1847                         minip->pnn = minnode;
1848                         lcp2_imbalances[minnode] = minimbl;
1849                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1850                                           ctdb_addr_to_str(&(minip->addr)),
1851                                           minnode,
1852                                           mindsum));
1853                 }
1854
1855                 /* There might be a better way but at least this is clear. */
1856                 have_unassigned = false;
1857                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1858                         if (tmp_ip->pnn == -1) {
1859                                 have_unassigned = true;
1860                         }
1861                 }
1862         }
1863
1864         /* We know if we have an unassigned addresses so we might as
1865          * well optimise.
1866          */
1867         if (have_unassigned) {
1868                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1869                         if (tmp_ip->pnn == -1) {
1870                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1871                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1872                         }
1873                 }
1874         }
1875 }
1876
1877 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1878  * to move IPs from, determines the best IP/destination node
1879  * combination to move from the source node.
1880  */
1881 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1882                                     struct ctdb_ipflags *ipflags,
1883                                     struct public_ip_list *all_ips,
1884                                     int srcnode,
1885                                     uint32_t *lcp2_imbalances,
1886                                     bool *rebalance_candidates)
1887 {
1888         int dstnode, mindstnode, numnodes;
1889         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1890         uint32_t minsrcimbl, mindstimbl;
1891         struct public_ip_list *minip;
1892         struct public_ip_list *tmp_ip;
1893
1894         /* Find an IP and destination node that best reduces imbalance. */
1895         srcimbl = 0;
1896         minip = NULL;
1897         minsrcimbl = 0;
1898         mindstnode = -1;
1899         mindstimbl = 0;
1900
1901         numnodes = talloc_array_length(ipflags);
1902
1903         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1904         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1905                            srcnode, lcp2_imbalances[srcnode]));
1906
1907         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1908                 /* Only consider addresses on srcnode. */
1909                 if (tmp_ip->pnn != srcnode) {
1910                         continue;
1911                 }
1912
1913                 /* What is this IP address costing the source node? */
1914                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1915                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1916
1917                 /* Consider this IP address would cost each potential
1918                  * destination node.  Destination nodes are limited to
1919                  * those that are newly healthy, since we don't want
1920                  * to do gratuitous failover of IPs just to make minor
1921                  * balance improvements.
1922                  */
1923                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1924                         if (!rebalance_candidates[dstnode]) {
1925                                 continue;
1926                         }
1927
1928                         /* only check nodes that can actually takeover this ip */
1929                         if (!can_node_takeover_ip(ctdb, dstnode,
1930                                                   ipflags[dstnode], tmp_ip)) {
1931                                 /* no it couldnt   so skip to the next node */
1932                                 continue;
1933                         }
1934
1935                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1936                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1937                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1938                                            srcnode, -srcdsum,
1939                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1940                                            dstnode, dstdsum));
1941
1942                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1943                             (dstdsum < srcdsum) &&                      \
1944                             ((mindstnode == -1) ||                              \
1945                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1946
1947                                 minip = tmp_ip;
1948                                 minsrcimbl = srcimbl;
1949                                 mindstnode = dstnode;
1950                                 mindstimbl = dstimbl;
1951                         }
1952                 }
1953         }
1954         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1955
1956         if (mindstnode != -1) {
1957                 /* We found a move that makes things better... */
1958                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1959                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1960                                   ctdb_addr_to_str(&(minip->addr)),
1961                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1962
1963
1964                 lcp2_imbalances[srcnode] = minsrcimbl;
1965                 lcp2_imbalances[mindstnode] = mindstimbl;
1966                 minip->pnn = mindstnode;
1967
1968                 return true;
1969         }
1970
1971         return false;
1972         
1973 }
1974
/* An imbalance value together with the node it belongs to, so that
 * nodes can be ordered by imbalance without losing their identity.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;     /* imbalance value of the node */
        int pnn;                /* node number */
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.
 *
 * Orders entries by descending imbalance: returns -1 when a has the
 * larger imbalance, 1 when b has the larger imbalance and 0 when the
 * two are equal.  The pnn field is not compared.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance < rhs->imbalance) {
                return 1;
        }
        if (lhs->imbalance > rhs->imbalance) {
                return -1;
        }
        return 0;
}
1993
1994 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1995  * node with the highest LCP2 imbalance, and then determines the best
1996  * IP/destination node combination to move from the source node.
1997  */
1998 static void lcp2_failback(struct ctdb_context *ctdb,
1999                           struct ctdb_ipflags *ipflags,
2000                           struct public_ip_list *all_ips,
2001                           uint32_t *lcp2_imbalances,
2002                           bool *rebalance_candidates)
2003 {
2004         int i, numnodes;
2005         struct lcp2_imbalance_pnn * lips;
2006         bool again;
2007
2008         numnodes = talloc_array_length(ipflags);
2009
2010 try_again:
2011         /* Put the imbalances and nodes into an array, sort them and
2012          * iterate through candidates.  Usually the 1st one will be
2013          * used, so this doesn't cost much...
2014          */
2015         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2016         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2017         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2018         for (i=0; i<numnodes; i++) {
2019                 lips[i].imbalance = lcp2_imbalances[i];
2020                 lips[i].pnn = i;
2021                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2022         }
2023         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2024               lcp2_cmp_imbalance_pnn);
2025
2026         again = false;
2027         for (i=0; i<numnodes; i++) {
2028                 /* This means that all nodes had 0 or 1 addresses, so
2029                  * can't be imbalanced.
2030                  */
2031                 if (lips[i].imbalance == 0) {
2032                         break;
2033                 }
2034
2035                 if (lcp2_failback_candidate(ctdb,
2036                                             ipflags,
2037                                             all_ips,
2038                                             lips[i].pnn,
2039                                             lcp2_imbalances,
2040                                             rebalance_candidates)) {
2041                         again = true;
2042                         break;
2043                 }
2044         }
2045
2046         talloc_free(lips);
2047         if (again) {
2048                 goto try_again;
2049         }
2050 }
2051
2052 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2053                                     struct ctdb_ipflags *ipflags,
2054                                     struct public_ip_list *all_ips)
2055 {
2056         struct public_ip_list *tmp_ip;
2057
2058         /* verify that the assigned nodes can serve that public ip
2059            and set it to -1 if not
2060         */
2061         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2062                 if (tmp_ip->pnn == -1) {
2063                         continue;
2064                 }
2065                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2066                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2067                         /* this node can not serve this ip. */
2068                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2069                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2070                                            tmp_ip->pnn));
2071                         tmp_ip->pnn = -1;
2072                 }
2073         }
2074 }
2075
2076 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2077                                        struct ctdb_ipflags *ipflags,
2078                                        struct public_ip_list *all_ips)
2079 {
2080         struct public_ip_list *tmp_ip;
2081         int i, numnodes;
2082
2083         numnodes = talloc_array_length(ipflags);
2084
2085         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2086        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2087         *  always be allocated the same way for a specific set of
2088         *  available/unavailable nodes.
2089         */
2090
2091         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2092                 tmp_ip->pnn = i % numnodes;
2093         }
2094
2095         /* IP failback doesn't make sense with deterministic
2096          * IPs, since the modulo step above implicitly fails
2097          * back IPs to their "home" node.
2098          */
2099         if (1 == ctdb->tunable.no_ip_failback) {
2100                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2101         }
2102
2103         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2104
2105         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2106
2107         /* No failback here! */
2108 }
2109
2110 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2111                                           struct ctdb_ipflags *ipflags,
2112                                           struct public_ip_list *all_ips)
2113 {
2114         /* This should be pushed down into basic_failback. */
2115         struct public_ip_list *tmp_ip;
2116         int num_ips = 0;
2117         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2118                 num_ips++;
2119         }
2120
2121         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2122
2123         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2124
2125         /* If we don't want IPs to fail back then don't rebalance IPs. */
2126         if (1 == ctdb->tunable.no_ip_failback) {
2127                 return;
2128         }
2129
2130         /* Now, try to make sure the ip adresses are evenly distributed
2131            across the nodes.
2132         */
2133         basic_failback(ctdb, ipflags, all_ips, num_ips);
2134 }
2135
2136 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2137                           struct ctdb_ipflags *ipflags,
2138                           struct public_ip_list *all_ips,
2139                           uint32_t *force_rebalance_nodes)
2140 {
2141         uint32_t *lcp2_imbalances;
2142         bool *rebalance_candidates;
2143         int numnodes, num_rebalance_candidates, i;
2144
2145         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2146
2147         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2148
2149         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2150                   &lcp2_imbalances, &rebalance_candidates);
2151
2152         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2153
2154         /* If we don't want IPs to fail back then don't rebalance IPs. */
2155         if (1 == ctdb->tunable.no_ip_failback) {
2156                 goto finished;
2157         }
2158
2159         /* It is only worth continuing if we have suitable target
2160          * nodes to transfer IPs to.  This check is much cheaper than
2161          * continuing on...
2162          */
2163         numnodes = talloc_array_length(ipflags);
2164         num_rebalance_candidates = 0;
2165         for (i=0; i<numnodes; i++) {
2166                 if (rebalance_candidates[i]) {
2167                         num_rebalance_candidates++;
2168                 }
2169         }
2170         if (num_rebalance_candidates == 0) {
2171                 goto finished;
2172         }
2173
2174         /* Now, try to make sure the ip adresses are evenly distributed
2175            across the nodes.
2176         */
2177         lcp2_failback(ctdb, ipflags, all_ips,
2178                       lcp2_imbalances, rebalance_candidates);
2179
2180 finished:
2181         talloc_free(tmp_ctx);
2182 }
2183
2184 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2185 {
2186         int i;
2187
2188         for (i=0;i<nodemap->num;i++) {
2189                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2190                         /* Found one completely healthy node */
2191                         return false;
2192                 }
2193         }
2194
2195         return true;
2196 }
2197
2198 /* The calculation part of the IP allocation algorithm. */
2199 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2200                                    struct ctdb_ipflags *ipflags,
2201                                    struct public_ip_list **all_ips_p,
2202                                    uint32_t *force_rebalance_nodes)
2203 {
2204         /* since nodes only know about those public addresses that
2205            can be served by that particular node, no single node has
2206            a full list of all public addresses that exist in the cluster.
2207            Walk over all node structures and create a merged list of
2208            all public addresses that exist in the cluster.
2209
2210            keep the tree of ips around as ctdb->ip_tree
2211         */
2212         *all_ips_p = create_merged_ip_list(ctdb);
2213
2214         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2215                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2216         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2217                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2218         } else {
2219                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2220         }
2221
2222         /* at this point ->pnn is the node which will own each IP
2223            or -1 if there is no node that can cover this ip
2224         */
2225
2226         return;
2227 }
2228
/* State shared between get_tunable_from_nodes() and the two callbacks
 * it passes to ctdb_client_async_control().
 */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages. */
        const char *tunable;
        /* talloc array of results, indexed by the replying node's pnn. */
        uint32_t *out;
        /* Set by a callback when the whole result must be discarded. */
        bool fatal;
};
2234
2235 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2236                                  int32_t res, TDB_DATA outdata,
2237                                  void *callback)
2238 {
2239         struct get_tunable_callback_data *cd =
2240                 (struct get_tunable_callback_data *)callback;
2241         int size;
2242
2243         if (res != 0) {
2244                 /* Already handled in fail callback */
2245                 return;
2246         }
2247
2248         if (outdata.dsize != sizeof(uint32_t)) {
2249                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2250                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2251                                  (int)outdata.dsize));
2252                 cd->fatal = true;
2253                 return;
2254         }
2255
2256         size = talloc_array_length(cd->out);
2257         if (pnn >= size) {
2258                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2259                                  cd->tunable, pnn, size));
2260                 return;
2261         }
2262
2263                 
2264         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2265 }
2266
2267 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2268                                        int32_t res, TDB_DATA outdata,
2269                                        void *callback)
2270 {
2271         struct get_tunable_callback_data *cd =
2272                 (struct get_tunable_callback_data *)callback;
2273
2274         switch (res) {
2275         case -ETIME:
2276                 DEBUG(DEBUG_ERR,
2277                       ("Timed out getting tunable \"%s\" from node %d\n",
2278                        cd->tunable, pnn));
2279                 cd->fatal = true;
2280                 break;
2281         case -EINVAL:
2282         case -1:
2283                 DEBUG(DEBUG_WARNING,
2284                       ("Tunable \"%s\" not implemented on node %d\n",
2285                        cd->tunable, pnn));
2286                 break;
2287         default:
2288                 DEBUG(DEBUG_ERR,
2289                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2290                        cd->tunable, pnn));
2291                 cd->fatal = true;
2292         }
2293 }
2294
2295 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2296                                         TALLOC_CTX *tmp_ctx,
2297                                         struct ctdb_node_map_old *nodemap,
2298                                         const char *tunable,
2299                                         uint32_t default_value)
2300 {
2301         TDB_DATA data;
2302         struct ctdb_control_get_tunable *t;
2303         uint32_t *nodes;
2304         uint32_t *tvals;
2305         struct get_tunable_callback_data callback_data;
2306         int i;
2307
2308         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2309         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2310         for (i=0; i<nodemap->num; i++) {
2311                 tvals[i] = default_value;
2312         }
2313                 
2314         callback_data.out = tvals;
2315         callback_data.tunable = tunable;
2316         callback_data.fatal = false;
2317
2318         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2319         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2320         t = (struct ctdb_control_get_tunable *)data.dptr;
2321         t->length = strlen(tunable)+1;
2322         memcpy(t->name, tunable, t->length);
2323         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2324         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2325                                       nodes, 0, TAKEOVER_TIMEOUT(),
2326                                       false, data,
2327                                       get_tunable_callback,
2328                                       get_tunable_fail_callback,
2329                                       &callback_data) != 0) {
2330                 if (callback_data.fatal) {
2331                         talloc_free(tvals);
2332                         tvals = NULL;
2333                 }
2334         }
2335         talloc_free(nodes);
2336         talloc_free(data.dptr);
2337
2338         return tvals;
2339 }
2340
2341 /* Set internal flags for IP allocation:
2342  *   Clear ip flags
2343  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2344  *   Set NOIPHOST ip flag for each INACTIVE node
2345  *   if all nodes are disabled:
2346  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2347  *   else
2348  *     Set NOIPHOST ip flags for disabled nodes
2349  */
2350 static struct ctdb_ipflags *
2351 set_ipflags_internal(struct ctdb_context *ctdb,
2352                      TALLOC_CTX *tmp_ctx,
2353                      struct ctdb_node_map_old *nodemap,
2354                      uint32_t *tval_noiptakeover,
2355                      uint32_t *tval_noiphostonalldisabled)
2356 {
2357         int i;
2358         struct ctdb_ipflags *ipflags;
2359
2360         /* Clear IP flags - implicit due to talloc_zero */
2361         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2362         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2363
2364         for (i=0;i<nodemap->num;i++) {
2365                 /* Can not take IPs on node with NoIPTakeover set */
2366                 if (tval_noiptakeover[i] != 0) {
2367                         ipflags[i].noiptakeover = true;
2368                 }
2369
2370                 /* Can not host IPs on INACTIVE node */
2371                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2372                         ipflags[i].noiphost = true;
2373                 }
2374         }
2375
2376         if (all_nodes_are_disabled(nodemap)) {
2377                 /* If all nodes are disabled, can not host IPs on node
2378                  * with NoIPHostOnAllDisabled set
2379                  */
2380                 for (i=0;i<nodemap->num;i++) {
2381                         if (tval_noiphostonalldisabled[i] != 0) {
2382                                 ipflags[i].noiphost = true;
2383                         }
2384                 }
2385         } else {
2386                 /* If some nodes are not disabled, then can not host
2387                  * IPs on DISABLED node
2388                  */
2389                 for (i=0;i<nodemap->num;i++) {
2390                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2391                                 ipflags[i].noiphost = true;
2392                         }
2393                 }
2394         }
2395
2396         return ipflags;
2397 }
2398
2399 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2400                                         TALLOC_CTX *tmp_ctx,
2401                                         struct ctdb_node_map_old *nodemap)
2402 {
2403         uint32_t *tval_noiptakeover;
2404         uint32_t *tval_noiphostonalldisabled;
2405         struct ctdb_ipflags *ipflags;
2406
2407
2408         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2409                                                    "NoIPTakeover", 0);
2410         if (tval_noiptakeover == NULL) {
2411                 return NULL;
2412         }
2413
2414         tval_noiphostonalldisabled =
2415                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2416                                        "NoIPHostOnAllDisabled", 0);
2417         if (tval_noiphostonalldisabled == NULL) {
2418                 /* Caller frees tmp_ctx */
2419                 return NULL;
2420         }
2421
2422         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2423                                        tval_noiptakeover,
2424                                        tval_noiphostonalldisabled);
2425
2426         talloc_free(tval_noiptakeover);
2427         talloc_free(tval_noiphostonalldisabled);
2428
2429         return ipflags;
2430 }
2431
2432 struct iprealloc_callback_data {
2433         bool *retry_nodes;
2434         int retry_count;
2435         client_async_callback fail_callback;
2436         void *fail_callback_data;
2437         struct ctdb_node_map_old *nodemap;
2438 };
2439
2440 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2441                                         int32_t res, TDB_DATA outdata,
2442                                         void *callback)
2443 {
2444         int numnodes;
2445         struct iprealloc_callback_data *cd =
2446                 (struct iprealloc_callback_data *)callback;
2447
2448         numnodes = talloc_array_length(cd->retry_nodes);
2449         if (pnn > numnodes) {
2450                 DEBUG(DEBUG_ERR,
2451                       ("ipreallocated failure from node %d, "
2452                        "but only %d nodes in nodemap\n",
2453                        pnn, numnodes));
2454                 return;
2455         }
2456
2457         /* Can't run the "ipreallocated" event on a INACTIVE node */
2458         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2459                 DEBUG(DEBUG_WARNING,
2460                       ("ipreallocated failed on inactive node %d, ignoring\n",
2461                        pnn));
2462                 return;
2463         }
2464
2465         switch (res) {
2466         case -ETIME:
2467                 /* If the control timed out then that's a real error,
2468                  * so call the real fail callback
2469                  */
2470                 if (cd->fail_callback) {
2471                         cd->fail_callback(ctdb, pnn, res, outdata,
2472                                           cd->fail_callback_data);
2473                 } else {
2474                         DEBUG(DEBUG_WARNING,
2475                               ("iprealloc timed out but no callback registered\n"));
2476                 }
2477                 break;
2478         default:
2479                 /* If not a timeout then either the ipreallocated
2480                  * eventscript (or some setup) failed.  This might
2481                  * have failed because the IPREALLOCATED control isn't
2482                  * implemented - right now there is no way of knowing
2483                  * because the error codes are all folded down to -1.
2484                  * Consider retrying using EVENTSCRIPT control...
2485                  */
2486                 DEBUG(DEBUG_WARNING,
2487                       ("ipreallocated failure from node %d, flagging retry\n",
2488                        pnn));
2489                 cd->retry_nodes[pnn] = true;
2490                 cd->retry_count++;
2491         }
2492 }
2493
2494 struct takeover_callback_data {
2495         bool *node_failed;
2496         client_async_callback fail_callback;
2497         void *fail_callback_data;
2498         struct ctdb_node_map_old *nodemap;
2499 };
2500
2501 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2502                                        uint32_t node_pnn, int32_t res,
2503                                        TDB_DATA outdata, void *callback_data)
2504 {
2505         struct takeover_callback_data *cd =
2506                 talloc_get_type_abort(callback_data,
2507                                       struct takeover_callback_data);
2508         int i;
2509
2510         for (i = 0; i < cd->nodemap->num; i++) {
2511                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2512                         break;
2513                 }
2514         }
2515
2516         if (i == cd->nodemap->num) {
2517                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2518                 return;
2519         }
2520
2521         if (!cd->node_failed[i]) {
2522                 cd->node_failed[i] = true;
2523                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2524                                   cd->fail_callback_data);
2525         }
2526 }
2527
2528 /*
2529   make any IP alias changes for public addresses that are necessary 
2530  */
2531 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2532                       uint32_t *force_rebalance_nodes,
2533                       client_async_callback fail_callback, void *callback_data)
2534 {
2535         int i, j, ret;
2536         struct ctdb_public_ip ip;
2537         uint32_t *nodes;
2538         struct public_ip_list *all_ips, *tmp_ip;
2539         TDB_DATA data;
2540         struct timeval timeout;
2541         struct client_async_data *async_data;
2542         struct ctdb_client_control_state *state;
2543         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2544         struct ctdb_ipflags *ipflags;
2545         struct takeover_callback_data *takeover_data;
2546         struct iprealloc_callback_data iprealloc_data;
2547         bool *retry_data;
2548         bool can_host_ips;
2549
2550         /*
2551          * ip failover is completely disabled, just send out the 
2552          * ipreallocated event.
2553          */
2554         if (ctdb->tunable.disable_ip_failover != 0) {
2555                 goto ipreallocated;
2556         }
2557
2558         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2559         if (ipflags == NULL) {
2560                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2561                 talloc_free(tmp_ctx);
2562                 return -1;
2563         }
2564
2565         /* Fetch known/available public IPs from each active node */
2566         ret = ctdb_reload_remote_public_ips(ctdb, nodemap);
2567         if (ret != 0) {
2568                 talloc_free(tmp_ctx);
2569                 return -1;
2570         }
2571
2572         /* Short-circuit IP allocation if no node has available IPs */
2573         can_host_ips = false;
2574         for (i=0; i < ctdb->num_nodes; i++) {
2575                 if (ctdb->nodes[i]->available_public_ips != NULL) {
2576                         can_host_ips = true;
2577                 }
2578         }
2579         if (!can_host_ips) {
2580                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2581                 return 0;
2582         }
2583
2584         /* Do the IP reassignment calculations */
2585         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2586
2587         /* Now tell all nodes to release any public IPs should not
2588          * host.  This will be a NOOP on nodes that don't currently
2589          * hold the given IP.
2590          */
2591         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2592         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2593
2594         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2595                                                        bool, nodemap->num);
2596         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2597         takeover_data->fail_callback = fail_callback;
2598         takeover_data->fail_callback_data = callback_data;
2599         takeover_data->nodemap = nodemap;
2600
2601         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2602         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2603
2604         async_data->fail_callback = takeover_run_fail_callback;
2605         async_data->callback_data = takeover_data;
2606
2607         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2608
2609         /* Send a RELEASE_IP to all nodes that should not be hosting
2610          * each IP.  For each IP, all but one of these will be
2611          * redundant.  However, the redundant ones are used to tell
2612          * nodes which node should be hosting the IP so that commands
2613          * like "ctdb ip" can display a particular nodes idea of who
2614          * is hosting what. */
2615         for (i=0;i<nodemap->num;i++) {
2616                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2617                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2618                         continue;
2619                 }
2620
2621                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2622                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2623                                 /* This node should be serving this
2624                                    vnn so don't tell it to release the ip
2625                                 */
2626                                 continue;
2627                         }
2628                         ip.pnn  = tmp_ip->pnn;
2629                         ip.addr = tmp_ip->addr;
2630
2631                         timeout = TAKEOVER_TIMEOUT();
2632                         data.dsize = sizeof(ip);
2633                         data.dptr  = (uint8_t *)&ip;
2634                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2635                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2636                                                   data, async_data,
2637                                                   &timeout, NULL);
2638                         if (state == NULL) {
2639                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2640                                 talloc_free(tmp_ctx);
2641                                 return -1;
2642                         }
2643
2644                         ctdb_client_async_add(async_data, state);
2645                 }
2646         }
2647         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2648                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2649                 talloc_free(tmp_ctx);
2650                 return -1;
2651         }
2652         talloc_free(async_data);
2653
2654
2655         /* For each IP, send a TAKOVER_IP to the node that should be
2656          * hosting it.  Many of these will often be redundant (since
2657          * the allocation won't have changed) but they can be useful
2658          * to recover from inconsistencies. */
2659         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2660         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2661
2662         async_data->fail_callback = fail_callback;
2663         async_data->callback_data = callback_data;
2664
2665         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2666                 if (tmp_ip->pnn == -1) {
2667                         /* this IP won't be taken over */
2668                         continue;
2669                 }
2670
2671                 ip.pnn  = tmp_ip->pnn;
2672                 ip.addr = tmp_ip->addr;
2673
2674                 timeout = TAKEOVER_TIMEOUT();
2675                 data.dsize = sizeof(ip);
2676                 data.dptr  = (uint8_t *)&ip;
2677                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2678                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2679                                           data, async_data, &timeout, NULL);
2680                 if (state == NULL) {
2681                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2682                         talloc_free(tmp_ctx);
2683                         return -1;
2684                 }
2685
2686                 ctdb_client_async_add(async_data, state);
2687         }
2688         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2689                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2690                 talloc_free(tmp_ctx);
2691                 return -1;
2692         }
2693
2694 ipreallocated:
2695         /*
2696          * Tell all nodes to run eventscripts to process the
2697          * "ipreallocated" event.  This can do a lot of things,
2698          * including restarting services to reconfigure them if public
2699          * IPs have moved.  Once upon a time this event only used to
2700          * update natgw.
2701          */
2702         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2703         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2704         iprealloc_data.retry_nodes = retry_data;
2705         iprealloc_data.retry_count = 0;
2706         iprealloc_data.fail_callback = fail_callback;
2707         iprealloc_data.fail_callback_data = callback_data;
2708         iprealloc_data.nodemap = nodemap;
2709
2710         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2711         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2712                                         nodes, 0, TAKEOVER_TIMEOUT(),
2713                                         false, tdb_null,
2714                                         NULL, iprealloc_fail_callback,
2715                                         &iprealloc_data);
2716         if (ret != 0) {
2717                 /* If the control failed then we should retry to any
2718                  * nodes flagged by iprealloc_fail_callback using the
2719                  * EVENTSCRIPT control.  This is a best-effort at
2720                  * backward compatiblity when running a mixed cluster
2721                  * where some nodes have not yet been upgraded to
2722                  * support the IPREALLOCATED control.
2723                  */
2724                 DEBUG(DEBUG_WARNING,
2725                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2726
2727                 nodes = talloc_array(tmp_ctx, uint32_t,
2728                                      iprealloc_data.retry_count);
2729                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2730
2731                 j = 0;
2732                 for (i=0; i<nodemap->num; i++) {
2733                         if (iprealloc_data.retry_nodes[i]) {
2734                                 nodes[j] = i;
2735                                 j++;
2736                         }
2737                 }
2738
2739                 data.dptr  = discard_const("ipreallocated");
2740                 data.dsize = strlen((char *)data.dptr) + 1; 
2741                 ret = ctdb_client_async_control(ctdb,
2742                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2743                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2744                                                 false, data,
2745                                                 NULL, fail_callback,
2746                                                 callback_data);
2747                 if (ret != 0) {
2748                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2749                 }
2750         }
2751
2752         talloc_free(tmp_ctx);
2753         return ret;
2754 }
2755
2756
2757 /*
2758   destroy a ctdb_client_ip structure
2759  */
2760 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2761 {
2762         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2763                 ctdb_addr_to_str(&ip->addr),
2764                 ntohs(ip->addr.ip.sin_port),
2765                 ip->client_id));
2766
2767         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2768         return 0;
2769 }
2770
2771 /*
2772   called by a client to inform us of a TCP connection that it is managing
2773   that should tickled with an ACK when IP takeover is done
2774  */
2775 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2776                                 TDB_DATA indata)
2777 {
2778         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2779         struct ctdb_connection *tcp_sock = NULL;
2780         struct ctdb_tcp_list *tcp;
2781         struct ctdb_connection t;
2782         int ret;
2783         TDB_DATA data;
2784         struct ctdb_client_ip *ip;
2785         struct ctdb_vnn *vnn;
2786         ctdb_sock_addr addr;
2787
2788         /* If we don't have public IPs, tickles are useless */
2789         if (ctdb->vnn == NULL) {
2790                 return 0;
2791         }
2792
2793         tcp_sock = (struct ctdb_connection *)indata.dptr;
2794
2795         addr = tcp_sock->src;
2796         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2797         addr = tcp_sock->dst;
2798         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2799
2800         ZERO_STRUCT(addr);
2801         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2802         vnn = find_public_ip_vnn(ctdb, &addr);
2803         if (vnn == NULL) {
2804                 switch (addr.sa.sa_family) {
2805                 case AF_INET:
2806                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2807                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2808                                         ctdb_addr_to_str(&addr)));
2809                         }
2810                         break;
2811                 case AF_INET6:
2812                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2813                                 ctdb_addr_to_str(&addr)));
2814                         break;
2815                 default:
2816                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2817                 }
2818
2819                 return 0;
2820         }
2821
2822         if (vnn->pnn != ctdb->pnn) {
2823                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2824                         ctdb_addr_to_str(&addr),
2825                         client_id, client->pid));
2826                 /* failing this call will tell smbd to die */
2827                 return -1;
2828         }
2829
2830         ip = talloc(client, struct ctdb_client_ip);
2831         CTDB_NO_MEMORY(ctdb, ip);
2832
2833         ip->ctdb      = ctdb;
2834         ip->addr      = addr;
2835         ip->client_id = client_id;
2836         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2837         DLIST_ADD(ctdb->client_ip_list, ip);
2838
2839         tcp = talloc(client, struct ctdb_tcp_list);
2840         CTDB_NO_MEMORY(ctdb, tcp);
2841
2842         tcp->connection.src = tcp_sock->src;
2843         tcp->connection.dst = tcp_sock->dst;
2844
2845         DLIST_ADD(client->tcp_list, tcp);
2846
2847         t.src = tcp_sock->src;
2848         t.dst = tcp_sock->dst;
2849
2850         data.dptr = (uint8_t *)&t;
2851         data.dsize = sizeof(t);
2852
2853         switch (addr.sa.sa_family) {
2854         case AF_INET:
2855                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2856                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2857                         ctdb_addr_to_str(&tcp_sock->src),
2858                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2859                 break;
2860         case AF_INET6:
2861                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2862                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2863                         ctdb_addr_to_str(&tcp_sock->src),
2864                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2865                 break;
2866         default:
2867                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2868         }
2869
2870
2871         /* tell all nodes about this tcp connection */
2872         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2873                                        CTDB_CONTROL_TCP_ADD,
2874                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2875         if (ret != 0) {
2876                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2877                 return -1;
2878         }
2879
2880         return 0;
2881 }
2882
2883 /*
2884   find a tcp address on a list
2885  */
2886 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2887                                            struct ctdb_connection *tcp)
2888 {
2889         int i;
2890
2891         if (array == NULL) {
2892                 return NULL;
2893         }
2894
2895         for (i=0;i<array->num;i++) {
2896                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2897                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2898                         return &array->connections[i];
2899                 }
2900         }
2901         return NULL;
2902 }
2903
2904
2905
2906 /*
2907   called by a daemon to inform us of a TCP connection that one of its
2908   clients managing that should tickled with an ACK when IP takeover is
2909   done
2910  */
2911 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2912 {
2913         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2914         struct ctdb_tcp_array *tcparray;
2915         struct ctdb_connection tcp;
2916         struct ctdb_vnn *vnn;
2917
2918         /* If we don't have public IPs, tickles are useless */
2919         if (ctdb->vnn == NULL) {
2920                 return 0;
2921         }
2922
2923         vnn = find_public_ip_vnn(ctdb, &p->dst);
2924         if (vnn == NULL) {
2925                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2926                         ctdb_addr_to_str(&p->dst)));
2927
2928                 return -1;
2929         }
2930
2931
2932         tcparray = vnn->tcp_array;
2933
2934         /* If this is the first tickle */
2935         if (tcparray == NULL) {
2936                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2937                 CTDB_NO_MEMORY(ctdb, tcparray);
2938                 vnn->tcp_array = tcparray;
2939
2940                 tcparray->num = 0;
2941                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2942                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2943
2944                 tcparray->connections[tcparray->num].src = p->src;
2945                 tcparray->connections[tcparray->num].dst = p->dst;
2946                 tcparray->num++;
2947
2948                 if (tcp_update_needed) {
2949                         vnn->tcp_update_needed = true;
2950                 }
2951                 return 0;
2952         }
2953
2954
2955         /* Do we already have this tickle ?*/
2956         tcp.src = p->src;
2957         tcp.dst = p->dst;
2958         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2959                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2960                         ctdb_addr_to_str(&tcp.dst),
2961                         ntohs(tcp.dst.ip.sin_port),
2962                         vnn->pnn));
2963                 return 0;
2964         }
2965
2966         /* A new tickle, we must add it to the array */
2967         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2968                                         struct ctdb_connection,
2969                                         tcparray->num+1);
2970         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2971
2972         tcparray->connections[tcparray->num].src = p->src;
2973         tcparray->connections[tcparray->num].dst = p->dst;
2974         tcparray->num++;
2975
2976         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2977                 ctdb_addr_to_str(&tcp.dst),
2978                 ntohs(tcp.dst.ip.sin_port),
2979                 vnn->pnn));
2980
2981         if (tcp_update_needed) {
2982                 vnn->tcp_update_needed = true;
2983         }
2984
2985         return 0;
2986 }
2987
2988
2989 /*
2990   called by a daemon to inform us of a TCP connection that one of its
2991   clients managing that should tickled with an ACK when IP takeover is
2992   done
2993  */
2994 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
2995 {
2996         struct ctdb_connection *tcpp;
2997         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
2998
2999         if (vnn == NULL) {
3000                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3001                         ctdb_addr_to_str(&conn->dst)));
3002                 return;
3003         }
3004
3005         /* if the array is empty we cant remove it
3006            and we don't need to do anything
3007          */
3008         if (vnn->tcp_array == NULL) {
3009                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3010                         ctdb_addr_to_str(&conn->dst),
3011                         ntohs(conn->dst.ip.sin_port)));
3012                 return;
3013         }
3014
3015
3016         /* See if we know this connection
3017            if we don't know this connection  then we dont need to do anything
3018          */
3019         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3020         if (tcpp == NULL) {
3021                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3022                         ctdb_addr_to_str(&conn->dst),
3023                         ntohs(conn->dst.ip.sin_port)));
3024                 return;
3025         }
3026
3027
3028         /* We need to remove this entry from the array.
3029            Instead of allocating a new array and copying data to it
3030            we cheat and just copy the last entry in the existing array
3031            to the entry that is to be removed and just shring the 
3032            ->num field
3033          */
3034         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3035         vnn->tcp_array->num--;
3036
3037         /* If we deleted the last entry we also need to remove the entire array
3038          */
3039         if (vnn->tcp_array->num == 0) {
3040                 talloc_free(vnn->tcp_array);
3041                 vnn->tcp_array = NULL;
3042         }               
3043
3044         vnn->tcp_update_needed = true;
3045
3046         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3047                 ctdb_addr_to_str(&conn->src),
3048                 ntohs(conn->src.ip.sin_port)));
3049 }
3050
3051
3052 /*
3053   called by a daemon to inform us of a TCP connection that one of its
3054   clients used are no longer needed in the tickle database
3055  */
3056 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3057 {
3058         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3059
3060         /* If we don't have public IPs, tickles are useless */
3061         if (ctdb->vnn == NULL) {
3062                 return 0;
3063         }
3064
3065         ctdb_remove_connection(ctdb, conn);
3066
3067         return 0;
3068 }
3069
3070
3071 /*
3072   Called when another daemon starts - causes all tickles for all
3073   public addresses we are serving to be sent to the new node on the
3074   next check.  This actually causes the next scheduled call to
3075   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3076   doesn't require careful error handling.
3077  */
3078 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3079 {
3080         struct ctdb_vnn *vnn;
3081
3082         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3083                            (unsigned long) pnn));
3084
3085         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3086                 vnn->tcp_update_needed = true;
3087         }
3088
3089         return 0;
3090 }
3091
3092
3093 /*
3094   called when a client structure goes away - hook to remove
3095   elements from the tcp_list in all daemons
3096  */
3097 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3098 {
3099         while (client->tcp_list) {
3100                 struct ctdb_tcp_list *tcp = client->tcp_list;
3101                 DLIST_REMOVE(client->tcp_list, tcp);
3102                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3103         }
3104 }
3105
3106
3107 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3108 {
3109         struct ctdb_vnn *vnn;
3110         int count = 0;
3111
3112         if (ctdb->tunable.disable_ip_failover == 1) {
3113                 return;
3114         }
3115
3116         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3117                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3118                         ctdb_vnn_unassign_iface(ctdb, vnn);
3119                         continue;
3120                 }
3121                 if (!vnn->iface) {
3122                         continue;
3123                 }
3124
3125                 /* Don't allow multiple releases at once.  Some code,
3126                  * particularly ctdb_tickle_sentenced_connections() is
3127                  * not re-entrant */
3128                 if (vnn->update_in_flight) {
3129                         DEBUG(DEBUG_WARNING,
3130                               (__location__
3131                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3132                                     ctdb_addr_to_str(&vnn->public_address),
3133                                     vnn->public_netmask_bits,
3134                                     ctdb_vnn_iface_string(vnn)));
3135                         continue;
3136                 }
3137                 vnn->update_in_flight = true;
3138
3139                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3140                                     ctdb_addr_to_str(&vnn->public_address),
3141                                     vnn->public_netmask_bits,
3142                                     ctdb_vnn_iface_string(vnn)));
3143
3144                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3145                                   ctdb_vnn_iface_string(vnn),
3146                                   ctdb_addr_to_str(&vnn->public_address),
3147                                   vnn->public_netmask_bits);
3148                 release_kill_clients(ctdb, &vnn->public_address);
3149                 ctdb_vnn_unassign_iface(ctdb, vnn);
3150                 vnn->update_in_flight = false;
3151                 count++;
3152         }
3153
3154         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3155 }
3156
3157
3158 /*
3159   get list of public IPs
3160  */
3161 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3162                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3163 {
3164         int i, num, len;
3165         struct ctdb_public_ip_list_old *ips;
3166         struct ctdb_vnn *vnn;
3167         bool only_available = false;
3168
3169         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3170                 only_available = true;
3171         }
3172
3173         /* count how many public ip structures we have */
3174         num = 0;
3175         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3176                 num++;
3177         }
3178
3179         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3180                 num*sizeof(struct ctdb_public_ip);
3181         ips = talloc_zero_size(outdata, len);
3182         CTDB_NO_MEMORY(ctdb, ips);
3183
3184         i = 0;
3185         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3186                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3187                         continue;
3188                 }
3189                 ips->ips[i].pnn  = vnn->pnn;
3190                 ips->ips[i].addr = vnn->public_address;
3191                 i++;
3192         }
3193         ips->num = i;
3194         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3195                 i*sizeof(struct ctdb_public_ip);
3196
3197         outdata->dsize = len;
3198         outdata->dptr  = (uint8_t *)ips;
3199
3200         return 0;
3201 }
3202
3203
3204 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3205                                         struct ctdb_req_control_old *c,
3206                                         TDB_DATA indata,
3207                                         TDB_DATA *outdata)
3208 {
3209         int i, num, len;
3210         ctdb_sock_addr *addr;
3211         struct ctdb_public_ip_info_old *info;
3212         struct ctdb_vnn *vnn;
3213
3214         addr = (ctdb_sock_addr *)indata.dptr;
3215
3216         vnn = find_public_ip_vnn(ctdb, addr);
3217         if (vnn == NULL) {
3218                 /* if it is not a public ip   it could be our 'single ip' */
3219                 if (ctdb->single_ip_vnn) {
3220                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3221                                 vnn = ctdb->single_ip_vnn;
3222                         }
3223                 }
3224         }
3225         if (vnn == NULL) {
3226                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3227                                  "'%s'not a public address\n",
3228                                  ctdb_addr_to_str(addr)));
3229                 return -1;
3230         }
3231
3232         /* count how many public ip structures we have */
3233         num = 0;
3234         for (;vnn->ifaces[num];) {
3235                 num++;
3236         }
3237
3238         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3239                 num*sizeof(struct ctdb_iface);
3240         info = talloc_zero_size(outdata, len);
3241         CTDB_NO_MEMORY(ctdb, info);
3242
3243         info->ip.addr = vnn->public_address;
3244         info->ip.pnn = vnn->pnn;
3245         info->active_idx = 0xFFFFFFFF;
3246
3247         for (i=0; vnn->ifaces[i]; i++) {
3248                 struct ctdb_interface *cur;
3249
3250                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3251                 if (cur == NULL) {
3252                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3253                                            vnn->ifaces[i]));
3254                         return -1;
3255                 }
3256                 if (vnn->iface == cur) {
3257                         info->active_idx = i;
3258                 }
3259                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3260                 info->ifaces[i].link_state = cur->link_up;
3261                 info->ifaces[i].references = cur->references;
3262         }
3263         info->num = i;
3264         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3265                 i*sizeof(struct ctdb_iface);
3266
3267         outdata->dsize = len;
3268         outdata->dptr  = (uint8_t *)info;
3269
3270         return 0;
3271 }
3272
3273 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3274                                 struct ctdb_req_control_old *c,
3275                                 TDB_DATA *outdata)
3276 {
3277         int i, num, len;
3278         struct ctdb_iface_list_old *ifaces;
3279         struct ctdb_interface *cur;
3280
3281         /* count how many public ip structures we have */
3282         num = 0;
3283         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3284                 num++;
3285         }
3286
3287         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3288                 num*sizeof(struct ctdb_iface);
3289         ifaces = talloc_zero_size(outdata, len);
3290         CTDB_NO_MEMORY(ctdb, ifaces);
3291
3292         i = 0;
3293         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3294                 strcpy(ifaces->ifaces[i].name, cur->name);
3295                 ifaces->ifaces[i].link_state = cur->link_up;
3296                 ifaces->ifaces[i].references = cur->references;
3297                 i++;
3298         }
3299         ifaces->num = i;
3300         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3301                 i*sizeof(struct ctdb_iface);
3302
3303         outdata->dsize = len;
3304         outdata->dptr  = (uint8_t *)ifaces;
3305
3306         return 0;
3307 }
3308
3309 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3310                                     struct ctdb_req_control_old *c,
3311                                     TDB_DATA indata)
3312 {
3313         struct ctdb_iface *info;
3314         struct ctdb_interface *iface;
3315         bool link_up = false;
3316
3317         info = (struct ctdb_iface *)indata.dptr;
3318
3319         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3320                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3321                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3322                                   len, len, info->name));
3323                 return -1;
3324         }
3325
3326         switch (info->link_state) {
3327         case 0:
3328                 link_up = false;
3329                 break;
3330         case 1:
3331                 link_up = true;
3332                 break;
3333         default:
3334                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3335                                   (unsigned int)info->link_state));
3336                 return -1;
3337         }
3338
3339         if (info->references != 0) {
3340                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3341                                   (unsigned int)info->references));
3342                 return -1;
3343         }
3344
3345         iface = ctdb_find_iface(ctdb, info->name);
3346         if (iface == NULL) {
3347                 return -1;
3348         }
3349
3350         if (link_up == iface->link_up) {
3351                 return 0;
3352         }
3353
3354         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3355               ("iface[%s] has changed it's link status %s => %s\n",
3356                iface->name,
3357                iface->link_up?"up":"down",
3358                link_up?"up":"down"));
3359
3360         iface->link_up = link_up;
3361         return 0;
3362 }
3363
3364
/*
   Per-vnn state for killing TCP connections: the capture socket the
   daemon listens on and the set of connections that are to be reset.
   Allocated as a talloc child of the vnn by
   ctdb_killtcp_add_connection().
*/
struct ctdb_kill_tcp {
        /* the vnn this state hangs off (vnn->killtcp points back here) */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(), -1 until opened */
        int capture_fd;
        /* read event on capture_fd, dispatching to capture_tcp_handler() */
        struct tevent_fd *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque value filled in by ctdb_sys_open_capture_socket() and
           handed back to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3377
/*
  a tcp connection that is to be killed
  (one entry in ctdb_kill_tcp.connections)
 */
struct ctdb_killtcp_con {
        /* the two (canonicalized) endpoints of the connection */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of tickles sent from the periodic timer; the entry is
           dropped once this reaches 5 */
        int count;
        /* owning kill state */
        struct ctdb_kill_tcp *killtcp;
};
3387
3388 /* this function is used to create a key to represent this socketpair
3389    in the killtcp tree.
3390    this key is used to insert and lookup matching socketpairs that are
3391    to be tickled and RST
3392 */
3393 #define KILLTCP_KEYLEN  10
3394 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3395 {
3396         static uint32_t key[KILLTCP_KEYLEN];
3397
3398         bzero(key, sizeof(key));
3399
3400         if (src->sa.sa_family != dst->sa.sa_family) {
3401                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3402                 return key;
3403         }
3404         
3405         switch (src->sa.sa_family) {
3406         case AF_INET:
3407                 key[0]  = dst->ip.sin_addr.s_addr;
3408                 key[1]  = src->ip.sin_addr.s_addr;
3409                 key[2]  = dst->ip.sin_port;
3410                 key[3]  = src->ip.sin_port;
3411                 break;
3412         case AF_INET6: {
3413                 uint32_t *dst6_addr32 =
3414                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3415                 uint32_t *src6_addr32 =
3416                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3417                 key[0]  = dst6_addr32[3];
3418                 key[1]  = src6_addr32[3];
3419                 key[2]  = dst6_addr32[2];
3420                 key[3]  = src6_addr32[2];
3421                 key[4]  = dst6_addr32[1];
3422                 key[5]  = src6_addr32[1];
3423                 key[6]  = dst6_addr32[0];
3424                 key[7]  = src6_addr32[0];
3425                 key[8]  = dst->ip6.sin6_port;
3426                 key[9]  = src->ip6.sin6_port;
3427                 break;
3428         }
3429         default:
3430                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3431                 return key;
3432         }
3433
3434         return key;
3435 }
3436
3437 /*
3438   called when we get a read event on the raw socket
3439  */
3440 static void capture_tcp_handler(struct tevent_context *ev,
3441                                 struct tevent_fd *fde,
3442                                 uint16_t flags, void *private_data)
3443 {
3444         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3445         struct ctdb_killtcp_con *con;
3446         ctdb_sock_addr src, dst;
3447         uint32_t ack_seq, seq;
3448
3449         if (!(flags & TEVENT_FD_READ)) {
3450                 return;
3451         }
3452
3453         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3454                                 killtcp->private_data,
3455                                 &src, &dst,
3456                                 &ack_seq, &seq) != 0) {
3457                 /* probably a non-tcp ACK packet */
3458                 return;
3459         }
3460
3461         /* check if we have this guy in our list of connections
3462            to kill
3463         */
3464         con = trbt_lookuparray32(killtcp->connections, 
3465                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3466         if (con == NULL) {
3467                 /* no this was some other packet we can just ignore */
3468                 return;
3469         }
3470
3471         /* This one has been tickled !
3472            now reset him and remove him from the list.
3473          */
3474         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3475                 ntohs(con->dst_addr.ip.sin_port),
3476                 ctdb_addr_to_str(&con->src_addr),
3477                 ntohs(con->src_addr.ip.sin_port)));
3478
3479         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3480         talloc_free(con);
3481 }
3482
3483
3484 /* when traversing the list of all tcp connections to send tickle acks to
3485    (so that we can capture the ack coming back and kill the connection
3486     by a RST)
3487    this callback is called for each connection we are currently trying to kill
3488 */
3489 static int tickle_connection_traverse(void *param, void *data)
3490 {
3491         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3492
3493         /* have tried too many times, just give up */
3494         if (con->count >= 5) {
3495                 /* can't delete in traverse: reparent to delete_cons */
3496                 talloc_steal(param, con);
3497                 return 0;
3498         }
3499
3500         /* othervise, try tickling it again */
3501         con->count++;
3502         ctdb_sys_send_tcp(
3503                 (ctdb_sock_addr *)&con->dst_addr,
3504                 (ctdb_sock_addr *)&con->src_addr,
3505                 0, 0, 0);
3506         return 0;
3507 }
3508
3509
3510 /* 
3511    called every second until all sentenced connections have been reset
3512  */
3513 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3514                                               struct tevent_timer *te,
3515                                               struct timeval t, void *private_data)
3516 {
3517         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3518         void *delete_cons = talloc_new(NULL);
3519
3520         /* loop over all connections sending tickle ACKs */
3521         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3522
3523         /* now we've finished traverse, it's safe to do deletion. */
3524         talloc_free(delete_cons);
3525
3526         /* If there are no more connections to kill we can remove the
3527            entire killtcp structure
3528          */
3529         if ( (killtcp->connections == NULL) || 
3530              (killtcp->connections->root == NULL) ) {
3531                 talloc_free(killtcp);
3532                 return;
3533         }
3534
3535         /* try tickling them again in a seconds time
3536          */
3537         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3538                          timeval_current_ofs(1, 0),
3539                          ctdb_tickle_sentenced_connections, killtcp);
3540 }
3541
3542 /*
3543   destroy the killtcp structure
3544  */
3545 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3546 {
3547         struct ctdb_vnn *tmpvnn;
3548
3549         /* verify that this vnn is still active */
3550         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3551                 if (tmpvnn == killtcp->vnn) {
3552                         break;
3553                 }
3554         }
3555
3556         if (tmpvnn == NULL) {
3557                 return 0;
3558         }
3559
3560         if (killtcp->vnn->killtcp != killtcp) {
3561                 return 0;
3562         }
3563
3564         killtcp->vnn->killtcp = NULL;
3565
3566         return 0;
3567 }
3568
3569
/* Insert callback for the killtcp tree: the new connection structure
   (parm) always wins over whatever was stored before (data).

   The old structure, if any, is deliberately not freed here: it is
   talloc_stolen by the same tree node and goes away together with the
   new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
3581
/*
  add a tcp socket to the list of connections we want to RST

  s and d are the two endpoints of the connection; both are
  canonicalized before use.  One of them (dst is tried first, then src)
  must be a public address of this node, or dst must be the 'single ip'.

  On the first connection for a vnn this allocates the per-vnn
  ctdb_kill_tcp state, opens the capture socket and starts the
  once-a-second tickle timer.  The connection is then recorded in the
  tree and tickled once immediately.

  Returns 0 on success, -1 if the address is not ours, on allocation
  failure (via CTDB_NO_MEMORY) or if the capture socket cannot be
  opened (in which case the vnn's kill state is freed).
 */
static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
                                       ctdb_sock_addr *s,
                                       ctdb_sock_addr *d)
{
        ctdb_sock_addr src, dst;
        struct ctdb_kill_tcp *killtcp;
        struct ctdb_killtcp_con *con;
        struct ctdb_vnn *vnn;

        ctdb_canonicalize_ip(s, &src);
        ctdb_canonicalize_ip(d, &dst);

        /* find the vnn this connection belongs to: either end may be
           the public address */
        vnn = find_public_ip_vnn(ctdb, &dst);
        if (vnn == NULL) {
                vnn = find_public_ip_vnn(ctdb, &src);
        }
        if (vnn == NULL) {
                /* if it is not a public ip   it could be our 'single ip' */
                if (ctdb->single_ip_vnn) {
                        if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
                                vnn = ctdb->single_ip_vnn;
                        }
                }
        }
        if (vnn == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
                return -1;
        }

        killtcp = vnn->killtcp;
        
        /* If this is the first connection to kill we must allocate
           a new structure
         */
        if (killtcp == NULL) {
                killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
                CTDB_NO_MEMORY(ctdb, killtcp);

                killtcp->vnn         = vnn;
                killtcp->ctdb        = ctdb;
                /* no capture socket yet, it is opened further down */
                killtcp->capture_fd  = -1;
                killtcp->connections = trbt_create(killtcp, 0);

                vnn->killtcp         = killtcp;
                /* the destructor resets vnn->killtcp when this goes away */
                talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
        }



        /* create a structure that describes this connection we want to
           RST and store it in killtcp->connections
        */
        con = talloc(killtcp, struct ctdb_killtcp_con);
        CTDB_NO_MEMORY(ctdb, con);
        con->src_addr = src;
        con->dst_addr = dst;
        con->count    = 0;
        con->killtcp  = killtcp;


        /* Note the argument order: killtcp_key() takes (src, dst) but
           is given (dst, src) here, while capture_tcp_handler() builds
           its lookup key from the captured packet's (src, dst).
           NOTE(review): presumably because the packet we capture is
           the reply travelling in the opposite direction - confirm */
        trbt_insertarray32_callback(killtcp->connections,
                        KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
                        add_killtcp_callback, con);

        /* 
           If we don't have a socket to listen on yet we must create it
         */
        if (killtcp->capture_fd == -1) {
                const char *iface = ctdb_vnn_iface_string(vnn);
                killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
                if (killtcp->capture_fd == -1) {
                        DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
                                          "socket on iface '%s' for killtcp (%s)\n",
                                          iface, strerror(errno)));
                        goto failed;
                }
        }


        if (killtcp->fde == NULL) {
                killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
                                             killtcp->capture_fd,
                                             TEVENT_FD_READ,
                                             capture_tcp_handler, killtcp);
                /* the capture socket is closed together with the fd event */
                tevent_fd_set_auto_close(killtcp->fde);

                /* We also need to set up some events to tickle all these connections
                   until they are all reset
                */
                tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
                                 ctdb_tickle_sentenced_connections, killtcp);
        }

        /* tickle him once now */
        ctdb_sys_send_tcp(
                &con->dst_addr,
                &con->src_addr,
                0, 0, 0);

        return 0;

failed:
        /* frees the kill state including the connection just added,
           which is a talloc child of it */
        talloc_free(vnn->killtcp);
        vnn->killtcp = NULL;
        return -1;
}
3691
3692 /*
3693   kill a TCP connection.
3694  */
3695 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3696 {
3697         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3698
3699         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3700 }
3701
3702 /*
3703   called by a daemon to inform us of the entire list of TCP tickles for
3704   a particular public address.
3705   this control should only be sent by the node that is currently serving
3706   that public address.
3707  */
3708 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3709 {
3710         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3711         struct ctdb_tcp_array *tcparray;
3712         struct ctdb_vnn *vnn;
3713
3714         /* We must at least have tickles.num or else we cant verify the size
3715            of the received data blob
3716          */
3717         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3718                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3719                 return -1;
3720         }
3721
3722         /* verify that the size of data matches what we expect */
3723         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3724                          + sizeof(struct ctdb_connection) * list->num) {
3725                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3726                 return -1;
3727         }
3728
3729         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3730                            ctdb_addr_to_str(&list->addr)));
3731
3732         vnn = find_public_ip_vnn(ctdb, &list->addr);
3733         if (vnn == NULL) {
3734                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3735                         ctdb_addr_to_str(&list->addr)));
3736
3737                 return 1;
3738         }
3739
3740         /* remove any old ticklelist we might have */
3741         talloc_free(vnn->tcp_array);
3742         vnn->tcp_array = NULL;
3743
3744         tcparray = talloc(vnn, struct ctdb_tcp_array);
3745         CTDB_NO_MEMORY(ctdb, tcparray);
3746
3747         tcparray->num = list->num;
3748
3749         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3750         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3751
3752         memcpy(tcparray->connections, &list->connections[0],
3753                sizeof(struct ctdb_connection)*tcparray->num);
3754
3755         /* We now have a new fresh tickle list array for this vnn */
3756         vnn->tcp_array = tcparray;
3757
3758         return 0;
3759 }
3760
3761 /*
3762   called to return the full list of tickles for the puclic address associated 
3763   with the provided vnn
3764  */
3765 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3766 {
3767         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3768         struct ctdb_tickle_list_old *list;
3769         struct ctdb_tcp_array *tcparray;
3770         int num;
3771         struct ctdb_vnn *vnn;
3772
3773         vnn = find_public_ip_vnn(ctdb, addr);
3774         if (vnn == NULL) {
3775                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3776                         ctdb_addr_to_str(addr)));
3777
3778                 return 1;
3779         }
3780
3781         tcparray = vnn->tcp_array;
3782         if (tcparray) {
3783                 num = tcparray->num;
3784         } else {
3785                 num = 0;
3786         }
3787
3788         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3789                         + sizeof(struct ctdb_connection) * num;
3790
3791         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3792         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3793         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3794
3795         list->addr = *addr;
3796         list->num = num;
3797         if (num) {
3798                 memcpy(&list->connections[0], tcparray->connections,
3799                         sizeof(struct ctdb_connection) * num);
3800         }
3801
3802         return 0;
3803 }
3804
3805
3806 /*
3807   set the list of all tcp tickles for a public address
3808  */
3809 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3810                                             ctdb_sock_addr *addr,
3811                                             struct ctdb_tcp_array *tcparray)
3812 {
3813         int ret, num;
3814         TDB_DATA data;
3815         struct ctdb_tickle_list_old *list;
3816
3817         if (tcparray) {
3818                 num = tcparray->num;
3819         } else {
3820                 num = 0;
3821         }
3822
3823         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3824                         sizeof(struct ctdb_connection) * num;
3825         data.dptr = talloc_size(ctdb, data.dsize);
3826         CTDB_NO_MEMORY(ctdb, data.dptr);
3827
3828         list = (struct ctdb_tickle_list_old *)data.dptr;
3829         list->addr = *addr;
3830         list->num = num;
3831         if (tcparray) {
3832                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3833         }
3834
3835         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3836                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3837                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3838         if (ret != 0) {
3839                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3840                 return -1;
3841         }
3842
3843         talloc_free(data.dptr);
3844
3845         return ret;
3846 }
3847
3848
3849 /*
3850   perform tickle updates if required
3851  */
3852 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3853                                     struct tevent_timer *te,
3854                                     struct timeval t, void *private_data)
3855 {
3856         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3857         int ret;
3858         struct ctdb_vnn *vnn;
3859
3860         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3861                 /* we only send out updates for public addresses that 
3862                    we have taken over
3863                  */
3864                 if (ctdb->pnn != vnn->pnn) {
3865                         continue;
3866                 }
3867                 /* We only send out the updates if we need to */
3868                 if (!vnn->tcp_update_needed) {
3869                         continue;
3870                 }
3871                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3872                                                        &vnn->public_address,
3873                                                        vnn->tcp_array);
3874                 if (ret != 0) {
3875                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3876                                 ctdb_addr_to_str(&vnn->public_address)));
3877                 } else {
3878                         DEBUG(DEBUG_INFO,
3879                               ("Sent tickle update for public address %s\n",
3880                                ctdb_addr_to_str(&vnn->public_address)));
3881                         vnn->tcp_update_needed = false;
3882                 }
3883         }
3884
3885         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3886                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3887                          ctdb_update_tcp_tickles, ctdb);
3888 }
3889
3890 /*
3891   start periodic update of tcp tickles
3892  */
3893 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3894 {
3895         ctdb->tickle_update_context = talloc_new(ctdb);
3896
3897         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3898                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3899                          ctdb_update_tcp_tickles, ctdb);
3900 }
3901
3902
3903
3904
/* state for one series of gratuitous ARPs requested via
   ctdb_control_send_gratious_arp() */
struct control_gratious_arp {
        struct ctdb_context *ctdb;
        /* address to announce */
        ctdb_sock_addr addr;
        /* name of the interface to send on (talloc child of this struct) */
        const char *iface;
        /* number of ARPs sent so far; the series ends at CTDB_ARP_REPEAT */
        int count;
};
3911
3912 /*
3913   send a control_gratuitous arp
3914  */
3915 static void send_gratious_arp(struct tevent_context *ev,
3916                               struct tevent_timer *te,
3917                               struct timeval t, void *private_data)
3918 {
3919         int ret;
3920         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3921                                                         struct control_gratious_arp);
3922
3923         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3924         if (ret != 0) {
3925                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3926                                  arp->iface, strerror(errno)));
3927         }
3928
3929
3930         arp->count++;
3931         if (arp->count == CTDB_ARP_REPEAT) {
3932                 talloc_free(arp);
3933                 return;
3934         }
3935
3936         tevent_add_timer(arp->ctdb->ev, arp,
3937                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3938                          send_gratious_arp, arp);
3939 }
3940
3941
3942 /*
3943   send a gratious arp 
3944  */
3945 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3946 {
3947         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3948         struct control_gratious_arp *arp;
3949
3950         /* verify the size of indata */
3951         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3952                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3953                                  (unsigned)indata.dsize, 
3954                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3955                 return -1;
3956         }
3957         if (indata.dsize != 
3958                 ( offsetof(struct ctdb_addr_info_old, iface)
3959                 + gratious_arp->len ) ){
3960
3961                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3962                         "but should be %u bytes\n", 
3963                          (unsigned)indata.dsize, 
3964                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
3965                 return -1;
3966         }
3967
3968
3969         arp = talloc(ctdb, struct control_gratious_arp);
3970         CTDB_NO_MEMORY(ctdb, arp);
3971
3972         arp->ctdb  = ctdb;
3973         arp->addr   = gratious_arp->addr;
3974         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3975         CTDB_NO_MEMORY(ctdb, arp->iface);
3976         arp->count = 0;
3977
3978         tevent_add_timer(arp->ctdb->ev, arp,
3979                          timeval_zero(), send_gratious_arp, arp);
3980
3981         return 0;
3982 }
3983
3984 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3985 {
3986         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3987         int ret;
3988
3989         /* verify the size of indata */
3990         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3991                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3992                 return -1;
3993         }
3994         if (indata.dsize != 
3995                 ( offsetof(struct ctdb_addr_info_old, iface)
3996                 + pub->len ) ){
3997
3998                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3999                         "but should be %u bytes\n", 
4000                          (unsigned)indata.dsize, 
4001                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4002                 return -1;
4003         }
4004
4005         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4006
4007         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4008
4009         if (ret != 0) {
4010                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4011                 return -1;
4012         }
4013
4014         return 0;
4015 }
4016
/* Private data passed to delete_ip_callback() */
struct delete_ip_callback_state {
	/* the control request that is answered from the callback */
	struct ctdb_req_control_old *c;
};
4020
4021 /*
4022   called when releaseip event finishes for del_public_address
4023  */
4024 static void delete_ip_callback(struct ctdb_context *ctdb,
4025                                int32_t status, TDB_DATA data,
4026                                const char *errormsg,
4027                                void *private_data)
4028 {
4029         struct delete_ip_callback_state *state =
4030                 talloc_get_type(private_data, struct delete_ip_callback_state);
4031
4032         /* If release failed then fail. */
4033         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4034         talloc_free(private_data);
4035 }
4036
4037 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4038                                         struct ctdb_req_control_old *c,
4039                                         TDB_DATA indata, bool *async_reply)
4040 {
4041         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4042         struct ctdb_vnn *vnn;
4043
4044         /* verify the size of indata */
4045         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4046                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4047                 return -1;
4048         }
4049         if (indata.dsize != 
4050                 ( offsetof(struct ctdb_addr_info_old, iface)
4051                 + pub->len ) ){
4052
4053                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4054                         "but should be %u bytes\n", 
4055                          (unsigned)indata.dsize, 
4056                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4057                 return -1;
4058         }
4059
4060         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4061
4062         /* walk over all public addresses until we find a match */
4063         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4064                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4065                         if (vnn->pnn == ctdb->pnn) {
4066                                 struct delete_ip_callback_state *state;
4067                                 struct ctdb_public_ip *ip;
4068                                 TDB_DATA data;
4069                                 int ret;
4070
4071                                 vnn->delete_pending = true;
4072
4073                                 state = talloc(ctdb,
4074                                                struct delete_ip_callback_state);
4075                                 CTDB_NO_MEMORY(ctdb, state);
4076                                 state->c = c;
4077
4078                                 ip = talloc(state, struct ctdb_public_ip);
4079                                 if (ip == NULL) {
4080                                         DEBUG(DEBUG_ERR,
4081                                               (__location__ " Out of memory\n"));
4082                                         talloc_free(state);
4083                                         return -1;
4084                                 }
4085                                 ip->pnn = -1;
4086                                 ip->addr = pub->addr;
4087
4088                                 data.dsize = sizeof(struct ctdb_public_ip);
4089                                 data.dptr = (unsigned char *)ip;
4090
4091                                 ret = ctdb_daemon_send_control(ctdb,
4092                                                                ctdb_get_pnn(ctdb),
4093                                                                0,
4094                                                                CTDB_CONTROL_RELEASE_IP,
4095                                                                0, 0,
4096                                                                data,
4097                                                                delete_ip_callback,
4098                                                                state);
4099                                 if (ret == -1) {
4100                                         DEBUG(DEBUG_ERR,
4101                                               (__location__ "Unable to send "
4102                                                "CTDB_CONTROL_RELEASE_IP\n"));
4103                                         talloc_free(state);
4104                                         return -1;
4105                                 }
4106
4107                                 state->c = talloc_steal(state, c);
4108                                 *async_reply = true;
4109                         } else {
4110                                 /* This IP is not hosted on the
4111                                  * current node so just delete it
4112                                  * now. */
4113                                 do_delete_ip(ctdb, vnn);
4114                         }
4115
4116                         return 0;
4117                 }
4118         }
4119
4120         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4121                          ctdb_addr_to_str(&pub->addr)));
4122         return -1;
4123 }
4124
4125
/* Private data passed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* the control request that is answered from the callback */
	struct ctdb_req_control_old *c;
};
4129
4130 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4131                                         int status, void *p)
4132 {
4133         struct ipreallocated_callback_state *state =
4134                 talloc_get_type(p, struct ipreallocated_callback_state);
4135
4136         if (status != 0) {
4137                 DEBUG(DEBUG_ERR,
4138                       (" \"ipreallocated\" event script failed (status %d)\n",
4139                        status));
4140                 if (status == -ETIME) {
4141                         ctdb_ban_self(ctdb);
4142                 }
4143         }
4144
4145         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4146         talloc_free(state);
4147 }
4148
4149 /* A control to run the ipreallocated event */
4150 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4151                                    struct ctdb_req_control_old *c,
4152                                    bool *async_reply)
4153 {
4154         int ret;
4155         struct ipreallocated_callback_state *state;
4156
4157         state = talloc(ctdb, struct ipreallocated_callback_state);
4158         CTDB_NO_MEMORY(ctdb, state);
4159
4160         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4161
4162         ret = ctdb_event_script_callback(ctdb, state,
4163                                          ctdb_ipreallocated_callback, state,
4164                                          CTDB_EVENT_IPREALLOCATED,
4165                                          "%s", "");
4166
4167         if (ret != 0) {
4168                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4169                 talloc_free(state);
4170                 return -1;
4171         }
4172
4173         /* tell the control that we will be reply asynchronously */
4174         state->c    = talloc_steal(state, c);
4175         *async_reply = true;
4176
4177         return 0;
4178 }
4179
4180
4181 /* This function is called from the recovery daemon to verify that a remote
4182    node has the expected ip allocation.
4183    This is verified against ctdb->ip_tree
4184 */
4185 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4186                                        struct ctdb_public_ip_list_old *ips,
4187                                        uint32_t pnn)
4188 {
4189         struct public_ip_list *tmp_ip;
4190         int i;
4191
4192         if (ctdb->ip_tree == NULL) {
4193                 /* don't know the expected allocation yet, assume remote node
4194                    is correct. */
4195                 return 0;
4196         }
4197
4198         if (ips == NULL) {
4199                 return 0;
4200         }
4201
4202         for (i=0; i<ips->num; i++) {
4203                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4204                 if (tmp_ip == NULL) {
4205                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4206                         return -1;
4207                 }
4208
4209                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4210                         continue;
4211                 }
4212
4213                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4214                         DEBUG(DEBUG_ERR,
4215                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4216                                pnn,
4217                                ctdb_addr_to_str(&ips->ips[i].addr),
4218                                ips->ips[i].pnn, tmp_ip->pnn));
4219                         return -1;
4220                 }
4221         }
4222
4223         return 0;
4224 }
4225
4226 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4227 {
4228         struct public_ip_list *tmp_ip;
4229
4230         /* IP tree is never built if DisableIPFailover is set */
4231         if (ctdb->tunable.disable_ip_failover != 0) {
4232                 return 0;
4233         }
4234
4235         if (ctdb->ip_tree == NULL) {
4236                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4237                 return -1;
4238         }
4239
4240         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4241         if (tmp_ip == NULL) {
4242                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4243                 return -1;
4244         }
4245
4246         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4247         tmp_ip->pnn = ip->pnn;
4248
4249         return 0;
4250 }
4251
4252 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4253 {
4254         TALLOC_FREE(ctdb->ip_tree);
4255 }
4256
/* State of one reload-public-IPs operation and its child process */
struct ctdb_reloadips_handle {
	/* daemon context the operation belongs to */
	struct ctdb_context *ctdb;
	/* control request to answer when the handle is destroyed */
	struct ctdb_req_control_old *c;
	/* status sent in that reply */
	int status;
	/* pipe: the child writes its result to fd[1], the parent reads fd[0] */
	int fd[2];
	/* pid of the forked child */
	pid_t child;
	/* read event on fd[0] */
	struct tevent_fd *fde;
};
4265
4266 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4267 {
4268         if (h == h->ctdb->reload_ips) {
4269                 h->ctdb->reload_ips = NULL;
4270         }
4271         if (h->c != NULL) {
4272                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4273                 h->c = NULL;
4274         }
4275         ctdb_kill(h->ctdb, h->child, SIGKILL);
4276         return 0;
4277 }
4278
4279 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4280                                          struct tevent_timer *te,
4281                                          struct timeval t, void *private_data)
4282 {
4283         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4284
4285         talloc_free(h);
4286 }
4287
4288 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4289                                          struct tevent_fd *fde,
4290                                          uint16_t flags, void *private_data)
4291 {
4292         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4293
4294         char res;
4295         int ret;
4296
4297         ret = sys_read(h->fd[0], &res, 1);
4298         if (ret < 1 || res != 0) {
4299                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4300                 res = 1;
4301         }
4302         h->status = res;
4303
4304         talloc_free(h);
4305 }
4306
4307 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4308 {
4309         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4310         struct ctdb_public_ip_list_old *ips;
4311         struct ctdb_vnn *vnn;
4312         struct client_async_data *async_data;
4313         struct timeval timeout;
4314         TDB_DATA data;
4315         struct ctdb_client_control_state *state;
4316         bool first_add;
4317         int i, ret;
4318
4319         CTDB_NO_MEMORY(ctdb, mem_ctx);
4320
4321         /* Read IPs from local node */
4322         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4323                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4324         if (ret != 0) {
4325                 DEBUG(DEBUG_ERR,
4326                       ("Unable to fetch public IPs from local node\n"));
4327                 talloc_free(mem_ctx);
4328                 return -1;
4329         }
4330
4331         /* Read IPs file - this is safe since this is a child process */
4332         ctdb->vnn = NULL;
4333         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4334                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4335                 talloc_free(mem_ctx);
4336                 return -1;
4337         }
4338
4339         async_data = talloc_zero(mem_ctx, struct client_async_data);
4340         CTDB_NO_MEMORY(ctdb, async_data);
4341
4342         /* Compare IPs between node and file for IPs to be deleted */
4343         for (i = 0; i < ips->num; i++) {
4344                 /* */
4345                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4346                         if (ctdb_same_ip(&vnn->public_address,
4347                                          &ips->ips[i].addr)) {
4348                                 /* IP is still in file */
4349                                 break;
4350                         }
4351                 }
4352
4353                 if (vnn == NULL) {
4354                         /* Delete IP ips->ips[i] */
4355                         struct ctdb_addr_info_old *pub;
4356
4357                         DEBUG(DEBUG_NOTICE,
4358                               ("IP %s no longer configured, deleting it\n",
4359                                ctdb_addr_to_str(&ips->ips[i].addr)));
4360
4361                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4362                         CTDB_NO_MEMORY(ctdb, pub);
4363
4364                         pub->addr  = ips->ips[i].addr;
4365                         pub->mask  = 0;
4366                         pub->len   = 0;
4367
4368                         timeout = TAKEOVER_TIMEOUT();
4369
4370                         data.dsize = offsetof(struct ctdb_addr_info_old,
4371                                               iface) + pub->len;
4372                         data.dptr = (uint8_t *)pub;
4373
4374                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4375                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4376                                                   0, data, async_data,
4377                                                   &timeout, NULL);
4378                         if (state == NULL) {
4379                                 DEBUG(DEBUG_ERR,
4380                                       (__location__
4381                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4382                                 goto failed;
4383                         }
4384
4385                         ctdb_client_async_add(async_data, state);
4386                 }
4387         }
4388
4389         /* Compare IPs between node and file for IPs to be added */
4390         first_add = true;
4391         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4392                 for (i = 0; i < ips->num; i++) {
4393                         if (ctdb_same_ip(&vnn->public_address,
4394                                          &ips->ips[i].addr)) {
4395                                 /* IP already on node */
4396                                 break;
4397                         }
4398                 }
4399                 if (i == ips->num) {
4400                         /* Add IP ips->ips[i] */
4401                         struct ctdb_addr_info_old *pub;
4402                         const char *ifaces = NULL;
4403                         uint32_t len;
4404                         int iface = 0;
4405
4406                         DEBUG(DEBUG_NOTICE,
4407                               ("New IP %s configured, adding it\n",
4408                                ctdb_addr_to_str(&vnn->public_address)));
4409                         if (first_add) {
4410                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4411
4412                                 data.dsize = sizeof(pnn);
4413                                 data.dptr  = (uint8_t *)&pnn;
4414
4415                                 ret = ctdb_client_send_message(
4416                                         ctdb,
4417                                         CTDB_BROADCAST_CONNECTED,
4418                                         CTDB_SRVID_REBALANCE_NODE,
4419                                         data);
4420                                 if (ret != 0) {
4421                                         DEBUG(DEBUG_WARNING,
4422                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4423                                 }
4424
4425                                 first_add = false;
4426                         }
4427
4428                         ifaces = vnn->ifaces[0];
4429                         iface = 1;
4430                         while (vnn->ifaces[iface] != NULL) {
4431                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4432                                                          vnn->ifaces[iface]);
4433                                 iface++;
4434                         }
4435
4436                         len   = strlen(ifaces) + 1;
4437                         pub = talloc_zero_size(mem_ctx,
4438                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4439                         CTDB_NO_MEMORY(ctdb, pub);
4440
4441                         pub->addr  = vnn->public_address;
4442                         pub->mask  = vnn->public_netmask_bits;
4443                         pub->len   = len;
4444                         memcpy(&pub->iface[0], ifaces, pub->len);
4445
4446                         timeout = TAKEOVER_TIMEOUT();
4447
4448                         data.dsize = offsetof(struct ctdb_addr_info_old,
4449                                               iface) + pub->len;
4450                         data.dptr = (uint8_t *)pub;
4451
4452                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4453                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4454                                                   0, data, async_data,
4455                                                   &timeout, NULL);
4456                         if (state == NULL) {
4457                                 DEBUG(DEBUG_ERR,
4458                                       (__location__
4459                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4460                                 goto failed;
4461                         }
4462
4463                         ctdb_client_async_add(async_data, state);
4464                 }
4465         }
4466
4467         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4468                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4469                 goto failed;
4470         }
4471
4472         talloc_free(mem_ctx);
4473         return 0;
4474
4475 failed:
4476         talloc_free(mem_ctx);
4477         return -1;
4478 }
4479
4480 /* This control is sent to force the node to re-read the public addresses file
4481    and drop any addresses we should nnot longer host, and add new addresses
4482    that we are now able to host
4483 */
4484 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4485 {
4486         struct ctdb_reloadips_handle *h;
4487         pid_t parent = getpid();
4488
4489         if (ctdb->reload_ips != NULL) {
4490                 talloc_free(ctdb->reload_ips);
4491                 ctdb->reload_ips = NULL;
4492         }
4493
4494         h = talloc(ctdb, struct ctdb_reloadips_handle);
4495         CTDB_NO_MEMORY(ctdb, h);
4496         h->ctdb     = ctdb;
4497         h->c        = NULL;
4498         h->status   = -1;
4499         
4500         if (pipe(h->fd) == -1) {
4501                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4502                 talloc_free(h);
4503                 return -1;
4504         }
4505
4506         h->child = ctdb_fork(ctdb);
4507         if (h->child == (pid_t)-1) {
4508                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4509                 close(h->fd[0]);
4510                 close(h->fd[1]);
4511                 talloc_free(h);
4512                 return -1;
4513         }
4514
4515         /* child process */
4516         if (h->child == 0) {
4517                 signed char res = 0;
4518
4519                 close(h->fd[0]);
4520                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4521
4522                 prctl_set_comment("ctdb_reloadips");
4523                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4524                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4525                         res = -1;
4526                 } else {
4527                         res = ctdb_reloadips_child(ctdb);
4528                         if (res != 0) {
4529                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4530                         }
4531                 }
4532
4533                 sys_write(h->fd[1], &res, 1);
4534                 /* make sure we die when our parent dies */
4535                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4536                         sleep(5);
4537                 }
4538                 _exit(0);
4539         }
4540
4541         h->c             = talloc_steal(h, c);
4542
4543         close(h->fd[1]);
4544         set_close_on_exec(h->fd[0]);
4545
4546         talloc_set_destructor(h, ctdb_reloadips_destructor);
4547
4548
4549         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4550                                ctdb_reloadips_child_handler, (void *)h);
4551         tevent_fd_set_auto_close(h->fde);
4552
4553         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4554                          ctdb_reloadips_timeout_event, h);
4555
4556         /* we reply later */
4557         *async_reply = true;
4558         return 0;
4559 }