ctdb-daemon: Do not support connection tracking if there are no public IPs
[gd/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value derived from the takeover_timeout tunable via
 * timeval_current_ofs().  The expansion refers to a variable named
 * "ctdb", which must be in scope wherever the macro is used.
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/*
 * Gratuitous ARP schedule used by ctdb_control_send_arp():
 * CTDB_ARP_INTERVAL is the seconds argument of the delay between two
 * rounds, CTDB_ARP_REPEAT is the round count at which the sender frees
 * its state and stops rescheduling itself.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
 * Flags used in the IP allocation algorithms.
 *
 * NOTE(review): neither field is referenced in this part of the file;
 * the names suggest "node must not take over addresses" and "node must
 * not host addresses" respectively - confirm against the users.
 */
struct ctdb_ipflags {
	bool noiptakeover;
	bool noiphost;
};
41
/*
 * One local network interface that public addresses may be placed on.
 * Entries live on the doubly linked list headed by ctdb->ifaces.
 */
struct ctdb_iface {
	/* list linkage (see DLIST_ADD/DLIST_REMOVE users) */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, allocated as a talloc child of this entry */
	const char *name;
	/* interfaces with link_up == false are skipped when choosing
	 * an interface for an address */
	bool link_up;
	/* number of VNNs whose ->iface currently points at this entry */
	uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;
285         uint32_t count;
286         ctdb_sock_addr addr;
287         struct ctdb_tcp_array *tcparray;
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;
307         uint32_t client_id;
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
400 struct takeover_callback_state {
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/*
 * State carried from ctdb_do_takeip() to ctdb_do_takeip_callback().
 * Its talloc destructor clears vnn->update_in_flight.
 */
struct ctdb_do_takeip_state {
	/* control request to answer once the event script has run */
	struct ctdb_req_control *c;
	/* public address being taken over */
	struct ctdb_vnn *vnn;
};
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          CTDB_EVENT_TAKE_IP,
514                                          "%s %s %u",
515                                          ctdb_vnn_iface_string(vnn),
516                                          ctdb_addr_to_str(&vnn->public_address),
517                                          vnn->public_netmask_bits);
518
519         if (ret != 0) {
520                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
521                         ctdb_addr_to_str(&vnn->public_address),
522                         ctdb_vnn_iface_string(vnn)));
523                 talloc_free(state);
524                 return -1;
525         }
526
527         return 0;
528 }
529
/*
 * State carried from ctdb_do_updateip() to ctdb_do_updateip_callback().
 * Its talloc destructor clears vnn->update_in_flight.
 */
struct ctdb_do_updateip_state {
	/* control request to answer once the event script has run */
	struct ctdb_req_control *c;
	/* interface the address was on before the move; restored by the
	 * callback if the move fails */
	struct ctdb_iface *old;
	/* public address being moved */
	struct ctdb_vnn *vnn;
};
535
536 /*
537   called when updateip event finishes
538  */
539 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
540                                       void *private_data)
541 {
542         struct ctdb_do_updateip_state *state =
543                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
544         int32_t ret;
545
546         if (status != 0) {
547                 if (status == -ETIME) {
548                         ctdb_ban_self(ctdb);
549                 }
550                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
551                         ctdb_addr_to_str(&state->vnn->public_address),
552                         state->old->name,
553                         ctdb_vnn_iface_string(state->vnn)));
554
555                 /*
556                  * All we can do is reset the old interface
557                  * and let the next run fix it
558                  */
559                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
560                 state->vnn->iface = state->old;
561                 state->vnn->iface->references++;
562
563                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
564                 talloc_free(state);
565                 return;
566         }
567
568         if (ctdb->do_checkpublicip) {
569
570         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
571         if (ret != 0) {
572                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
573                 talloc_free(state);
574                 return;
575         }
576
577         }
578
579         /* the control succeeded */
580         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
581         talloc_free(state);
582         return;
583 }
584
585 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
586 {
587         state->vnn->update_in_flight = false;
588         return 0;
589 }
590
591 /*
592   update (move) an ip address
593  */
594 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
595                                 struct ctdb_req_control *c,
596                                 struct ctdb_vnn *vnn)
597 {
598         int ret;
599         struct ctdb_do_updateip_state *state;
600         struct ctdb_iface *old = vnn->iface;
601         const char *new_name;
602
603         if (vnn->update_in_flight) {
604                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
605                                     "update for this IP already in flight\n",
606                                     ctdb_addr_to_str(&vnn->public_address),
607                                     vnn->public_netmask_bits));
608                 return -1;
609         }
610
611         ctdb_vnn_unassign_iface(ctdb, vnn);
612         ret = ctdb_vnn_assign_iface(ctdb, vnn);
613         if (ret != 0) {
614                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
615                                  "assin a usable interface (old iface '%s')\n",
616                                  ctdb_addr_to_str(&vnn->public_address),
617                                  vnn->public_netmask_bits,
618                                  old->name));
619                 return -1;
620         }
621
622         new_name = ctdb_vnn_iface_string(vnn);
623         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
624                 /* A benign update from one interface onto itself.
625                  * no need to run the eventscripts in this case, just return
626                  * success.
627                  */
628                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
629                 return 0;
630         }
631
632         state = talloc(vnn, struct ctdb_do_updateip_state);
633         CTDB_NO_MEMORY(ctdb, state);
634
635         state->c = talloc_steal(ctdb, c);
636         state->old = old;
637         state->vnn = vnn;
638
639         vnn->update_in_flight = true;
640         talloc_set_destructor(state, ctdb_updateip_destructor);
641
642         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
643                             "interface %s to %s\n",
644                             ctdb_addr_to_str(&vnn->public_address),
645                             vnn->public_netmask_bits,
646                             old->name,
647                             new_name));
648
649         ret = ctdb_event_script_callback(ctdb,
650                                          state,
651                                          ctdb_do_updateip_callback,
652                                          state,
653                                          CTDB_EVENT_UPDATE_IP,
654                                          "%s %s %s %u",
655                                          state->old->name,
656                                          new_name,
657                                          ctdb_addr_to_str(&vnn->public_address),
658                                          vnn->public_netmask_bits);
659         if (ret != 0) {
660                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
661                                  ctdb_addr_to_str(&vnn->public_address),
662                                  old->name, new_name));
663                 talloc_free(state);
664                 return -1;
665         }
666
667         return 0;
668 }
669
670 /*
671   Find the vnn of the node that has a public ip address
672   returns -1 if the address is not known as a public address
673  */
674 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
675 {
676         struct ctdb_vnn *vnn;
677
678         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
679                 if (ctdb_same_ip(&vnn->public_address, addr)) {
680                         return vnn;
681                 }
682         }
683
684         return NULL;
685 }
686
687 /*
688   take over an ip address
689  */
690 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
691                                  struct ctdb_req_control *c,
692                                  TDB_DATA indata,
693                                  bool *async_reply)
694 {
695         int ret;
696         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
697         struct ctdb_vnn *vnn;
698         bool have_ip = false;
699         bool do_updateip = false;
700         bool do_takeip = false;
701         struct ctdb_iface *best_iface = NULL;
702
703         if (pip->pnn != ctdb->pnn) {
704                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
705                                  "with pnn %d, but we're node %d\n",
706                                  ctdb_addr_to_str(&pip->addr),
707                                  pip->pnn, ctdb->pnn));
708                 return -1;
709         }
710
711         /* update out vnn list */
712         vnn = find_public_ip_vnn(ctdb, &pip->addr);
713         if (vnn == NULL) {
714                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
715                         ctdb_addr_to_str(&pip->addr)));
716                 return 0;
717         }
718
719         if (ctdb->do_checkpublicip) {
720                 have_ip = ctdb_sys_have_ip(&pip->addr);
721         }
722         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
723         if (best_iface == NULL) {
724                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
725                                  "a usable interface (old %s, have_ip %d)\n",
726                                  ctdb_addr_to_str(&vnn->public_address),
727                                  vnn->public_netmask_bits,
728                                  ctdb_vnn_iface_string(vnn),
729                                  have_ip));
730                 return -1;
731         }
732
733         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
734                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
735                 have_ip = false;
736         }
737
738
739         if (vnn->iface == NULL && have_ip) {
740                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
741                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
742                                  ctdb_addr_to_str(&vnn->public_address)));
743                 return 0;
744         }
745
746         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
747                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
748                                   "and we have it on iface[%s], but it was assigned to node %d"
749                                   "and we are node %d, banning ourself\n",
750                                  ctdb_addr_to_str(&vnn->public_address),
751                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
752                 ctdb_ban_self(ctdb);
753                 return -1;
754         }
755
756         if (vnn->pnn == -1 && have_ip) {
757                 vnn->pnn = ctdb->pnn;
758                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
759                                   "and we already have it on iface[%s], update local daemon\n",
760                                  ctdb_addr_to_str(&vnn->public_address),
761                                   ctdb_vnn_iface_string(vnn)));
762                 return 0;
763         }
764
765         if (vnn->iface) {
766                 if (vnn->iface != best_iface) {
767                         if (!vnn->iface->link_up) {
768                                 do_updateip = true;
769                         } else if (vnn->iface->references > (best_iface->references + 1)) {
770                                 /* only move when the rebalance gains something */
771                                         do_updateip = true;
772                         }
773                 }
774         }
775
776         if (!have_ip) {
777                 if (do_updateip) {
778                         ctdb_vnn_unassign_iface(ctdb, vnn);
779                         do_updateip = false;
780                 }
781                 do_takeip = true;
782         }
783
784         if (do_takeip) {
785                 ret = ctdb_do_takeip(ctdb, c, vnn);
786                 if (ret != 0) {
787                         return -1;
788                 }
789         } else if (do_updateip) {
790                 ret = ctdb_do_updateip(ctdb, c, vnn);
791                 if (ret != 0) {
792                         return -1;
793                 }
794         } else {
795                 /*
796                  * The interface is up and the kernel known the ip
797                  * => do nothing
798                  */
799                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
800                         ctdb_addr_to_str(&pip->addr),
801                         vnn->public_netmask_bits,
802                         ctdb_vnn_iface_string(vnn)));
803                 return 0;
804         }
805
806         /* tell ctdb_control.c that we will be replying asynchronously */
807         *async_reply = true;
808
809         return 0;
810 }
811
812 /*
813   takeover an ip address old v4 style
814  */
815 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
816                                 struct ctdb_req_control *c,
817                                 TDB_DATA indata, 
818                                 bool *async_reply)
819 {
820         TDB_DATA data;
821         
822         data.dsize = sizeof(struct ctdb_public_ip);
823         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
824         CTDB_NO_MEMORY(ctdb, data.dptr);
825         
826         memcpy(data.dptr, indata.dptr, indata.dsize);
827         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
828 }
829
830 /*
831   kill any clients that are registered with a IP that is being released
832  */
833 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
834 {
835         struct ctdb_client_ip *ip;
836
837         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
838                 ctdb_addr_to_str(addr)));
839
840         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
841                 ctdb_sock_addr tmp_addr;
842
843                 tmp_addr = ip->addr;
844                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
845                         ip->client_id,
846                         ctdb_addr_to_str(&ip->addr)));
847
848                 if (ctdb_same_ip(&tmp_addr, addr)) {
849                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
850                                                                      ip->client_id, 
851                                                                      struct ctdb_client);
852                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
853                                 ip->client_id,
854                                 ctdb_addr_to_str(&ip->addr),
855                                 client->pid));
856
857                         if (client->pid != 0) {
858                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
859                                         (unsigned)client->pid,
860                                         ctdb_addr_to_str(addr),
861                                         ip->client_id));
862                                 kill(client->pid, SIGKILL);
863                         }
864                 }
865         }
866 }
867
868 /*
869   called when releaseip event finishes
870  */
871 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
872                                 void *private_data)
873 {
874         struct takeover_callback_state *state = 
875                 talloc_get_type(private_data, struct takeover_callback_state);
876         TDB_DATA data;
877
878         if (status == -ETIME) {
879                 ctdb_ban_self(ctdb);
880         }
881
882         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
883                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
884                                   ctdb_addr_to_str(state->addr)));
885                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
886                 talloc_free(state);
887                 return;
888         }
889
890         /* send a message to all clients of this node telling them
891            that the cluster has been reconfigured and they should
892            release any sockets on this IP */
893         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
894         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
895         data.dsize = strlen((char *)data.dptr)+1;
896
897         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
898
899         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
900
901         /* kill clients that have registered with this IP */
902         release_kill_clients(ctdb, state->addr);
903
904         ctdb_vnn_unassign_iface(ctdb, state->vnn);
905
906         /* the control succeeded */
907         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
908         talloc_free(state);
909 }
910
911 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
912 {
913         state->vnn->update_in_flight = false;
914         return 0;
915 }
916
917 /*
918   release an ip address
919  */
920 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
921                                 struct ctdb_req_control *c,
922                                 TDB_DATA indata, 
923                                 bool *async_reply)
924 {
925         int ret;
926         struct takeover_callback_state *state;
927         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
928         struct ctdb_vnn *vnn;
929         char *iface;
930
931         /* update our vnn list */
932         vnn = find_public_ip_vnn(ctdb, &pip->addr);
933         if (vnn == NULL) {
934                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
935                         ctdb_addr_to_str(&pip->addr)));
936                 return 0;
937         }
938         vnn->pnn = pip->pnn;
939
940         /* stop any previous arps */
941         talloc_free(vnn->takeover_ctx);
942         vnn->takeover_ctx = NULL;
943
944         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
945          * lazy multicast to drop an IP from any node that isn't the
946          * intended new node.  The following causes makes ctdbd ignore
947          * a release for any address it doesn't host.
948          */
949         if (ctdb->do_checkpublicip) {
950                 if (!ctdb_sys_have_ip(&pip->addr)) {
951                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
952                                 ctdb_addr_to_str(&pip->addr),
953                                 vnn->public_netmask_bits,
954                                 ctdb_vnn_iface_string(vnn)));
955                         ctdb_vnn_unassign_iface(ctdb, vnn);
956                         return 0;
957                 }
958         } else {
959                 if (vnn->iface == NULL) {
960                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
961                                            ctdb_addr_to_str(&pip->addr),
962                                            vnn->public_netmask_bits));
963                         return 0;
964                 }
965         }
966
967         /* There is a potential race between take_ip and us because we
968          * update the VNN via a callback that run when the
969          * eventscripts have been run.  Avoid the race by allowing one
970          * update to be in flight at a time.
971          */
972         if (vnn->update_in_flight) {
973                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
974                                     "update for this IP already in flight\n",
975                                     ctdb_addr_to_str(&vnn->public_address),
976                                     vnn->public_netmask_bits));
977                 return -1;
978         }
979
980         if (ctdb->do_checkpublicip) {
981                 iface = ctdb_sys_find_ifname(&pip->addr);
982                 if (iface == NULL) {
983                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
984                         return 0;
985                 }
986                 if (vnn->iface == NULL) {
987                         DEBUG(DEBUG_WARNING,
988                               ("Public IP %s is hosted on interface %s but we have no VNN\n",
989                                ctdb_addr_to_str(&pip->addr),
990                                iface));
991                 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
992                         DEBUG(DEBUG_WARNING,
993                               ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
994                                ctdb_addr_to_str(&pip->addr),
995                                iface,
996                                ctdb_vnn_iface_string(vnn)));
997                         /* Should we fix vnn->iface?  If we do, what
998                          * happens to reference counts?
999                          */
1000                 }
1001         } else {
1002                 iface = strdup(ctdb_vnn_iface_string(vnn));
1003         }
1004
1005         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1006                 ctdb_addr_to_str(&pip->addr),
1007                 vnn->public_netmask_bits,
1008                 iface,
1009                 pip->pnn));
1010
1011         state = talloc(ctdb, struct takeover_callback_state);
1012         CTDB_NO_MEMORY(ctdb, state);
1013
1014         state->c = talloc_steal(state, c);
1015         state->addr = talloc(state, ctdb_sock_addr);       
1016         CTDB_NO_MEMORY(ctdb, state->addr);
1017         *state->addr = pip->addr;
1018         state->vnn   = vnn;
1019
1020         vnn->update_in_flight = true;
1021         talloc_set_destructor(state, ctdb_releaseip_destructor);
1022
1023         ret = ctdb_event_script_callback(ctdb, 
1024                                          state, release_ip_callback, state,
1025                                          CTDB_EVENT_RELEASE_IP,
1026                                          "%s %s %u",
1027                                          iface,
1028                                          ctdb_addr_to_str(&pip->addr),
1029                                          vnn->public_netmask_bits);
1030         free(iface);
1031         if (ret != 0) {
1032                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1033                         ctdb_addr_to_str(&pip->addr),
1034                         ctdb_vnn_iface_string(vnn)));
1035                 talloc_free(state);
1036                 return -1;
1037         }
1038
1039         /* tell the control that we will be reply asynchronously */
1040         *async_reply = true;
1041         return 0;
1042 }
1043
1044 /*
1045   release an ip address old v4 style
1046  */
1047 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1048                                 struct ctdb_req_control *c,
1049                                 TDB_DATA indata, 
1050                                 bool *async_reply)
1051 {
1052         TDB_DATA data;
1053         
1054         data.dsize = sizeof(struct ctdb_public_ip);
1055         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1056         CTDB_NO_MEMORY(ctdb, data.dptr);
1057         
1058         memcpy(data.dptr, indata.dptr, indata.dsize);
1059         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1060 }
1061
1062
1063 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1064                                    ctdb_sock_addr *addr,
1065                                    unsigned mask, const char *ifaces,
1066                                    bool check_address)
1067 {
1068         struct ctdb_vnn      *vnn;
1069         uint32_t num = 0;
1070         char *tmp;
1071         const char *iface;
1072         int i;
1073         int ret;
1074
1075         tmp = strdup(ifaces);
1076         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1077                 if (!ctdb_sys_check_iface_exists(iface)) {
1078                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1079                         free(tmp);
1080                         return -1;
1081                 }
1082         }
1083         free(tmp);
1084
1085         /* Verify that we dont have an entry for this ip yet */
1086         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1087                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1088                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1089                                 ctdb_addr_to_str(addr)));
1090                         return -1;
1091                 }               
1092         }
1093
1094         /* create a new vnn structure for this ip address */
1095         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1096         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1097         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1098         tmp = talloc_strdup(vnn, ifaces);
1099         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1100         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1101                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1102                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1103                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1104                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1105                 num++;
1106         }
1107         talloc_free(tmp);
1108         vnn->ifaces[num] = NULL;
1109         vnn->public_address      = *addr;
1110         vnn->public_netmask_bits = mask;
1111         vnn->pnn                 = -1;
1112         if (check_address) {
1113                 if (ctdb_sys_have_ip(addr)) {
1114                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1115                         vnn->pnn = ctdb->pnn;
1116                 }
1117         }
1118
1119         for (i=0; vnn->ifaces[i]; i++) {
1120                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121                 if (ret != 0) {
1122                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1123                                            "for public_address[%s]\n",
1124                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1125                         talloc_free(vnn);
1126                         return -1;
1127                 }
1128         }
1129
1130         DLIST_ADD(ctdb->vnn, vnn);
1131
1132         return 0;
1133 }
1134
1135 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1136                                   struct timeval t, void *private_data)
1137 {
1138         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1139                                                         struct ctdb_context);
1140         struct ctdb_vnn *vnn;
1141
1142         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1143                 int i;
1144
1145                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1146                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1147                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1148                                         vnn->ifaces[i],
1149                                         ctdb_addr_to_str(&vnn->public_address)));
1150                         }
1151                 }
1152         }
1153
1154         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1155                 timeval_current_ofs(30, 0), 
1156                 ctdb_check_interfaces_event, ctdb);
1157 }
1158
1159
1160 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1161 {
1162         if (ctdb->check_public_ifaces_ctx != NULL) {
1163                 talloc_free(ctdb->check_public_ifaces_ctx);
1164                 ctdb->check_public_ifaces_ctx = NULL;
1165         }
1166
1167         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1168         if (ctdb->check_public_ifaces_ctx == NULL) {
1169                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1170         }
1171
1172         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1173                 timeval_current_ofs(30, 0), 
1174                 ctdb_check_interfaces_event, ctdb);
1175
1176         return 0;
1177 }
1178
1179
1180 /*
1181   setup the public address lists from a file
1182 */
1183 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1184 {
1185         char **lines;
1186         int nlines;
1187         int i;
1188
1189         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1190         if (lines == NULL) {
1191                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1192                 return -1;
1193         }
1194         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1195                 nlines--;
1196         }
1197
1198         for (i=0;i<nlines;i++) {
1199                 unsigned mask;
1200                 ctdb_sock_addr addr;
1201                 const char *addrstr;
1202                 const char *ifaces;
1203                 char *tok, *line;
1204
1205                 line = lines[i];
1206                 while ((*line == ' ') || (*line == '\t')) {
1207                         line++;
1208                 }
1209                 if (*line == '#') {
1210                         continue;
1211                 }
1212                 if (strcmp(line, "") == 0) {
1213                         continue;
1214                 }
1215                 tok = strtok(line, " \t");
1216                 addrstr = tok;
1217                 tok = strtok(NULL, " \t");
1218                 if (tok == NULL) {
1219                         if (NULL == ctdb->default_public_interface) {
1220                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1221                                          i+1));
1222                                 talloc_free(lines);
1223                                 return -1;
1224                         }
1225                         ifaces = ctdb->default_public_interface;
1226                 } else {
1227                         ifaces = tok;
1228                 }
1229
1230                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1231                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1232                         talloc_free(lines);
1233                         return -1;
1234                 }
1235                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1236                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1237                         talloc_free(lines);
1238                         return -1;
1239                 }
1240         }
1241
1242
1243         talloc_free(lines);
1244         return 0;
1245 }
1246
1247 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1248                               const char *iface,
1249                               const char *ip)
1250 {
1251         struct ctdb_vnn *svnn;
1252         struct ctdb_iface *cur = NULL;
1253         bool ok;
1254         int ret;
1255
1256         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1257         CTDB_NO_MEMORY(ctdb, svnn);
1258
1259         svnn->ifaces = talloc_array(svnn, const char *, 2);
1260         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1261         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1262         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1263         svnn->ifaces[1] = NULL;
1264
1265         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1266         if (!ok) {
1267                 talloc_free(svnn);
1268                 return -1;
1269         }
1270
1271         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1272         if (ret != 0) {
1273                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1274                                    "for single_ip[%s]\n",
1275                                    svnn->ifaces[0],
1276                                    ctdb_addr_to_str(&svnn->public_address)));
1277                 talloc_free(svnn);
1278                 return -1;
1279         }
1280
1281         /* assume the single public ip interface is initially "good" */
1282         cur = ctdb_find_iface(ctdb, iface);
1283         if (cur == NULL) {
1284                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1285                 return -1;
1286         }
1287         cur->link_up = true;
1288
1289         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1290         if (ret != 0) {
1291                 talloc_free(svnn);
1292                 return -1;
1293         }
1294
1295         ctdb->single_ip_vnn = svnn;
1296         return 0;
1297 }
1298
1299 struct ctdb_public_ip_list {
1300         struct ctdb_public_ip_list *next;
1301         uint32_t pnn;
1302         ctdb_sock_addr addr;
1303 };
1304
1305 /* Given a physical node, return the number of
1306    public addresses that is currently assigned to this node.
1307 */
1308 static int node_ip_coverage(struct ctdb_context *ctdb, 
1309         int32_t pnn,
1310         struct ctdb_public_ip_list *ips)
1311 {
1312         int num=0;
1313
1314         for (;ips;ips=ips->next) {
1315                 if (ips->pnn == pnn) {
1316                         num++;
1317                 }
1318         }
1319         return num;
1320 }
1321
1322
1323 /* Can the given node host the given IP: is the public IP known to the
1324  * node and is NOIPHOST unset?
1325 */
1326 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1327                              struct ctdb_ipflags ipflags,
1328                              struct ctdb_public_ip_list *ip)
1329 {
1330         struct ctdb_all_public_ips *public_ips;
1331         int i;
1332
1333         if (ipflags.noiphost) {
1334                 return false;
1335         }
1336
1337         public_ips = ctdb->nodes[pnn]->available_public_ips;
1338
1339         if (public_ips == NULL) {
1340                 return false;
1341         }
1342
1343         for (i=0; i<public_ips->num; i++) {
1344                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1345                         /* yes, this node can serve this public ip */
1346                         return true;
1347                 }
1348         }
1349
1350         return false;
1351 }
1352
1353 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1354                                  struct ctdb_ipflags ipflags,
1355                                  struct ctdb_public_ip_list *ip)
1356 {
1357         if (ipflags.noiptakeover) {
1358                 return false;
1359         }
1360
1361         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1362 }
1363
1364 /* search the node lists list for a node to takeover this ip.
1365    pick the node that currently are serving the least number of ips
1366    so that the ips get spread out evenly.
1367 */
1368 static int find_takeover_node(struct ctdb_context *ctdb, 
1369                 struct ctdb_ipflags *ipflags,
1370                 struct ctdb_public_ip_list *ip,
1371                 struct ctdb_public_ip_list *all_ips)
1372 {
1373         int pnn, min=0, num;
1374         int i, numnodes;
1375
1376         numnodes = talloc_array_length(ipflags);
1377         pnn    = -1;
1378         for (i=0; i<numnodes; i++) {
1379                 /* verify that this node can serve this ip */
1380                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1381                         /* no it couldnt   so skip to the next node */
1382                         continue;
1383                 }
1384
1385                 num = node_ip_coverage(ctdb, i, all_ips);
1386                 /* was this the first node we checked ? */
1387                 if (pnn == -1) {
1388                         pnn = i;
1389                         min  = num;
1390                 } else {
1391                         if (num < min) {
1392                                 pnn = i;
1393                                 min  = num;
1394                         }
1395                 }
1396         }       
1397         if (pnn == -1) {
1398                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1399                         ctdb_addr_to_str(&ip->addr)));
1400
1401                 return -1;
1402         }
1403
1404         ip->pnn = pnn;
1405         return 0;
1406 }
1407
1408 #define IP_KEYLEN       4
1409 static uint32_t *ip_key(ctdb_sock_addr *ip)
1410 {
1411         static uint32_t key[IP_KEYLEN];
1412
1413         bzero(key, sizeof(key));
1414
1415         switch (ip->sa.sa_family) {
1416         case AF_INET:
1417                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1418                 break;
1419         case AF_INET6: {
1420                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1421                 key[0]  = htonl(s6_a32[0]);
1422                 key[1]  = htonl(s6_a32[1]);
1423                 key[2]  = htonl(s6_a32[2]);
1424                 key[3]  = htonl(s6_a32[3]);
1425                 break;
1426         }
1427         default:
1428                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1429                 return key;
1430         }
1431
1432         return key;
1433 }
1434
1435 static void *add_ip_callback(void *parm, void *data)
1436 {
1437         struct ctdb_public_ip_list *this_ip = parm; 
1438         struct ctdb_public_ip_list *prev_ip = data; 
1439
1440         if (prev_ip == NULL) {
1441                 return parm;
1442         }
1443         if (this_ip->pnn == -1) {
1444                 this_ip->pnn = prev_ip->pnn;
1445         }
1446
1447         return parm;
1448 }
1449
1450 static int getips_count_callback(void *param, void *data)
1451 {
1452         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1453         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1454
1455         new_ip->next = *ip_list;
1456         *ip_list     = new_ip;
1457         return 0;
1458 }
1459
1460 static struct ctdb_public_ip_list *
1461 create_merged_ip_list(struct ctdb_context *ctdb)
1462 {
1463         int i, j;
1464         struct ctdb_public_ip_list *ip_list;
1465         struct ctdb_all_public_ips *public_ips;
1466
1467         if (ctdb->ip_tree != NULL) {
1468                 talloc_free(ctdb->ip_tree);
1469                 ctdb->ip_tree = NULL;
1470         }
1471         ctdb->ip_tree = trbt_create(ctdb, 0);
1472
1473         for (i=0;i<ctdb->num_nodes;i++) {
1474                 public_ips = ctdb->nodes[i]->known_public_ips;
1475
1476                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1477                         continue;
1478                 }
1479
1480                 /* there were no public ips for this node */
1481                 if (public_ips == NULL) {
1482                         continue;
1483                 }               
1484
1485                 for (j=0;j<public_ips->num;j++) {
1486                         struct ctdb_public_ip_list *tmp_ip; 
1487
1488                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1489                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1490                         /* Do not use information about IP addresses hosted
1491                          * on other nodes, it may not be accurate */
1492                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1493                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1494                         } else {
1495                                 tmp_ip->pnn = -1;
1496                         }
1497                         tmp_ip->addr = public_ips->ips[j].addr;
1498                         tmp_ip->next = NULL;
1499
1500                         trbt_insertarray32_callback(ctdb->ip_tree,
1501                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1502                                 add_ip_callback,
1503                                 tmp_ip);
1504                 }
1505         }
1506
1507         ip_list = NULL;
1508         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1509
1510         return ip_list;
1511 }
1512
1513 /* 
1514  * This is the length of the longtest common prefix between the IPs.
1515  * It is calculated by XOR-ing the 2 IPs together and counting the
1516  * number of leading zeroes.  The implementation means that all
1517  * addresses end up being 128 bits long.
1518  *
1519  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1520  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1521  * lots of nodes and IP addresses?
1522  */
1523 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1524 {
1525         uint32_t ip1_k[IP_KEYLEN];
1526         uint32_t *t;
1527         int i;
1528         uint32_t x;
1529
1530         uint32_t distance = 0;
1531
1532         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1533         t = ip_key(ip2);
1534         for (i=0; i<IP_KEYLEN; i++) {
1535                 x = ip1_k[i] ^ t[i];
1536                 if (x == 0) {
1537                         distance += 32;
1538                 } else {
1539                         /* Count number of leading zeroes. 
1540                          * FIXME? This could be optimised...
1541                          */
1542                         while ((x & (1 << 31)) == 0) {
1543                                 x <<= 1;
1544                                 distance += 1;
1545                         }
1546                 }
1547         }
1548
1549         return distance;
1550 }
1551
1552 /* Calculate the IP distance for the given IP relative to IPs on the
1553    given node.  The ips argument is generally the all_ips variable
1554    used in the main part of the algorithm.
1555  */
1556 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1557                                   struct ctdb_public_ip_list *ips,
1558                                   int pnn)
1559 {
1560         struct ctdb_public_ip_list *t;
1561         uint32_t d;
1562
1563         uint32_t sum = 0;
1564
1565         for (t=ips; t != NULL; t=t->next) {
1566                 if (t->pnn != pnn) {
1567                         continue;
1568                 }
1569
1570                 /* Optimisation: We never calculate the distance
1571                  * between an address and itself.  This allows us to
1572                  * calculate the effect of removing an address from a
1573                  * node by simply calculating the distance between
1574                  * that address and all of the exitsing addresses.
1575                  * Moreover, we assume that we're only ever dealing
1576                  * with addresses from all_ips so we can identify an
1577                  * address via a pointer rather than doing a more
1578                  * expensive address comparison. */
1579                 if (&(t->addr) == ip) {
1580                         continue;
1581                 }
1582
1583                 d = ip_distance(ip, &(t->addr));
1584                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1585         }
1586
1587         return sum;
1588 }
1589
1590 /* Return the LCP2 imbalance metric for addresses currently assigned
1591    to the given node.
1592  */
1593 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1594 {
1595         struct ctdb_public_ip_list *t;
1596
1597         uint32_t imbalance = 0;
1598
1599         for (t=all_ips; t!=NULL; t=t->next) {
1600                 if (t->pnn != pnn) {
1601                         continue;
1602                 }
1603                 /* Pass the rest of the IPs rather than the whole
1604                    all_ips input list.
1605                 */
1606                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1607         }
1608
1609         return imbalance;
1610 }
1611
1612 /* Allocate any unassigned IPs just by looping through the IPs and
1613  * finding the best node for each.
1614  */
1615 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1616                                       struct ctdb_ipflags *ipflags,
1617                                       struct ctdb_public_ip_list *all_ips)
1618 {
1619         struct ctdb_public_ip_list *tmp_ip;
1620
1621         /* loop over all ip's and find a physical node to cover for 
1622            each unassigned ip.
1623         */
1624         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1625                 if (tmp_ip->pnn == -1) {
1626                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1627                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1628                                         ctdb_addr_to_str(&tmp_ip->addr)));
1629                         }
1630                 }
1631         }
1632 }
1633
1634 /* Basic non-deterministic rebalancing algorithm.
1635  */
1636 static void basic_failback(struct ctdb_context *ctdb,
1637                            struct ctdb_ipflags *ipflags,
1638                            struct ctdb_public_ip_list *all_ips,
1639                            int num_ips)
1640 {
1641         int i, numnodes;
1642         int maxnode, maxnum, minnode, minnum, num, retries;
1643         struct ctdb_public_ip_list *tmp_ip;
1644
1645         numnodes = talloc_array_length(ipflags);
1646         retries = 0;
1647
1648 try_again:
1649         maxnum=0;
1650         minnum=0;
1651
1652         /* for each ip address, loop over all nodes that can serve
1653            this ip and make sure that the difference between the node
1654            serving the most and the node serving the least ip's are
1655            not greater than 1.
1656         */
1657         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1658                 if (tmp_ip->pnn == -1) {
1659                         continue;
1660                 }
1661
1662                 /* Get the highest and lowest number of ips's served by any 
1663                    valid node which can serve this ip.
1664                 */
1665                 maxnode = -1;
1666                 minnode = -1;
1667                 for (i=0; i<numnodes; i++) {
1668                         /* only check nodes that can actually serve this ip */
1669                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1670                                 /* no it couldnt   so skip to the next node */
1671                                 continue;
1672                         }
1673
1674                         num = node_ip_coverage(ctdb, i, all_ips);
1675                         if (maxnode == -1) {
1676                                 maxnode = i;
1677                                 maxnum  = num;
1678                         } else {
1679                                 if (num > maxnum) {
1680                                         maxnode = i;
1681                                         maxnum  = num;
1682                                 }
1683                         }
1684                         if (minnode == -1) {
1685                                 minnode = i;
1686                                 minnum  = num;
1687                         } else {
1688                                 if (num < minnum) {
1689                                         minnode = i;
1690                                         minnum  = num;
1691                                 }
1692                         }
1693                 }
1694                 if (maxnode == -1) {
1695                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1696                                 ctdb_addr_to_str(&tmp_ip->addr)));
1697
1698                         continue;
1699                 }
1700
1701                 /* if the spread between the smallest and largest coverage by
1702                    a node is >=2 we steal one of the ips from the node with
1703                    most coverage to even things out a bit.
1704                    try to do this a limited number of times since we dont
1705                    want to spend too much time balancing the ip coverage.
1706                 */
1707                 if ( (maxnum > minnum+1)
1708                      && (retries < (num_ips + 5)) ){
1709                         struct ctdb_public_ip_list *tmp;
1710
1711                         /* Reassign one of maxnode's VNNs */
1712                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1713                                 if (tmp->pnn == maxnode) {
1714                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1715                                         retries++;
1716                                         goto try_again;;
1717                                 }
1718                         }
1719                 }
1720         }
1721 }
1722
1723 static void lcp2_init(struct ctdb_context *tmp_ctx,
1724                       struct ctdb_ipflags *ipflags,
1725                       struct ctdb_public_ip_list *all_ips,
1726                       uint32_t *force_rebalance_nodes,
1727                       uint32_t **lcp2_imbalances,
1728                       bool **rebalance_candidates)
1729 {
1730         int i, numnodes;
1731         struct ctdb_public_ip_list *tmp_ip;
1732
1733         numnodes = talloc_array_length(ipflags);
1734
1735         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1736         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1737         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1738         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1739
1740         for (i=0; i<numnodes; i++) {
1741                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1742                 /* First step: assume all nodes are candidates */
1743                 (*rebalance_candidates)[i] = true;
1744         }
1745
1746         /* 2nd step: if a node has IPs assigned then it must have been
1747          * healthy before, so we remove it from consideration.  This
1748          * is overkill but is all we have because we don't maintain
1749          * state between takeover runs.  An alternative would be to
1750          * keep state and invalidate it every time the recovery master
1751          * changes.
1752          */
1753         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1754                 if (tmp_ip->pnn != -1) {
1755                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1756                 }
1757         }
1758
1759         /* 3rd step: if a node is forced to re-balance then
1760            we allow failback onto the node */
1761         if (force_rebalance_nodes == NULL) {
1762                 return;
1763         }
1764         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1765                 uint32_t pnn = force_rebalance_nodes[i];
1766                 if (pnn >= numnodes) {
1767                         DEBUG(DEBUG_ERR,
1768                               (__location__ "unknown node %u\n", pnn));
1769                         continue;
1770                 }
1771
1772                 DEBUG(DEBUG_NOTICE,
1773                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1774                 (*rebalance_candidates)[pnn] = true;
1775         }
1776 }
1777
1778 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1779  * the IP/node combination that will cost the least.
1780  */
1781 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1782                                      struct ctdb_ipflags *ipflags,
1783                                      struct ctdb_public_ip_list *all_ips,
1784                                      uint32_t *lcp2_imbalances)
1785 {
1786         struct ctdb_public_ip_list *tmp_ip;
1787         int dstnode, numnodes;
1788
1789         int minnode;
1790         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1791         struct ctdb_public_ip_list *minip;
1792
1793         bool should_loop = true;
1794         bool have_unassigned = true;
1795
1796         numnodes = talloc_array_length(ipflags);
1797
1798         while (have_unassigned && should_loop) {
1799                 should_loop = false;
1800
1801                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1802                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1803
1804                 minnode = -1;
1805                 mindsum = 0;
1806                 minip = NULL;
1807
1808                 /* loop over each unassigned ip. */
1809                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1810                         if (tmp_ip->pnn != -1) {
1811                                 continue;
1812                         }
1813
1814                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1815                                 /* only check nodes that can actually takeover this ip */
1816                                 if (!can_node_takeover_ip(ctdb, dstnode,
1817                                                           ipflags[dstnode],
1818                                                           tmp_ip)) {
1819                                         /* no it couldnt   so skip to the next node */
1820                                         continue;
1821                                 }
1822
1823                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1824                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1825                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1826                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1827                                                    dstnode,
1828                                                    dstimbl - lcp2_imbalances[dstnode]));
1829
1830
1831                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1832                                         minnode = dstnode;
1833                                         minimbl = dstimbl;
1834                                         mindsum = dstdsum;
1835                                         minip = tmp_ip;
1836                                         should_loop = true;
1837                                 }
1838                         }
1839                 }
1840
1841                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1842
1843                 /* If we found one then assign it to the given node. */
1844                 if (minnode != -1) {
1845                         minip->pnn = minnode;
1846                         lcp2_imbalances[minnode] = minimbl;
1847                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1848                                           ctdb_addr_to_str(&(minip->addr)),
1849                                           minnode,
1850                                           mindsum));
1851                 }
1852
1853                 /* There might be a better way but at least this is clear. */
1854                 have_unassigned = false;
1855                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1856                         if (tmp_ip->pnn == -1) {
1857                                 have_unassigned = true;
1858                         }
1859                 }
1860         }
1861
1862         /* We know if we have an unassigned addresses so we might as
1863          * well optimise.
1864          */
1865         if (have_unassigned) {
1866                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1867                         if (tmp_ip->pnn == -1) {
1868                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1869                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1870                         }
1871                 }
1872         }
1873 }
1874
1875 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1876  * to move IPs from, determines the best IP/destination node
1877  * combination to move from the source node.
1878  */
1879 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1880                                     struct ctdb_ipflags *ipflags,
1881                                     struct ctdb_public_ip_list *all_ips,
1882                                     int srcnode,
1883                                     uint32_t *lcp2_imbalances,
1884                                     bool *rebalance_candidates)
1885 {
1886         int dstnode, mindstnode, numnodes;
1887         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1888         uint32_t minsrcimbl, mindstimbl;
1889         struct ctdb_public_ip_list *minip;
1890         struct ctdb_public_ip_list *tmp_ip;
1891
1892         /* Find an IP and destination node that best reduces imbalance. */
1893         srcimbl = 0;
1894         minip = NULL;
1895         minsrcimbl = 0;
1896         mindstnode = -1;
1897         mindstimbl = 0;
1898
1899         numnodes = talloc_array_length(ipflags);
1900
1901         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1902         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1903                            srcnode, lcp2_imbalances[srcnode]));
1904
1905         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1906                 /* Only consider addresses on srcnode. */
1907                 if (tmp_ip->pnn != srcnode) {
1908                         continue;
1909                 }
1910
1911                 /* What is this IP address costing the source node? */
1912                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1913                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1914
1915                 /* Consider this IP address would cost each potential
1916                  * destination node.  Destination nodes are limited to
1917                  * those that are newly healthy, since we don't want
1918                  * to do gratuitous failover of IPs just to make minor
1919                  * balance improvements.
1920                  */
1921                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1922                         if (!rebalance_candidates[dstnode]) {
1923                                 continue;
1924                         }
1925
1926                         /* only check nodes that can actually takeover this ip */
1927                         if (!can_node_takeover_ip(ctdb, dstnode,
1928                                                   ipflags[dstnode], tmp_ip)) {
1929                                 /* no it couldnt   so skip to the next node */
1930                                 continue;
1931                         }
1932
1933                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1934                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1935                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1936                                            srcnode, -srcdsum,
1937                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1938                                            dstnode, dstdsum));
1939
1940                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1941                             (dstdsum < srcdsum) &&                      \
1942                             ((mindstnode == -1) ||                              \
1943                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1944
1945                                 minip = tmp_ip;
1946                                 minsrcimbl = srcimbl;
1947                                 mindstnode = dstnode;
1948                                 mindstimbl = dstimbl;
1949                         }
1950                 }
1951         }
1952         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1953
1954         if (mindstnode != -1) {
1955                 /* We found a move that makes things better... */
1956                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1957                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1958                                   ctdb_addr_to_str(&(minip->addr)),
1959                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1960
1961
1962                 lcp2_imbalances[srcnode] = minsrcimbl;
1963                 lcp2_imbalances[mindstnode] = mindstimbl;
1964                 minip->pnn = mindstnode;
1965
1966                 return true;
1967         }
1968
1969         return false;
1970         
1971 }
1972
/* Pairs a node number with its LCP2 imbalance so the two can be
 * sorted together.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;     /* LCP2 imbalance of the node */
        int pnn;                /* node number the imbalance belongs to */
};

/* qsort() comparator for struct lcp2_imbalance_pnn.  Orders entries
 * by descending imbalance: returns -1 if a has the larger imbalance,
 * 1 if b does and 0 if they are equal.  pnn is not compared.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1991
1992 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1993  * node with the highest LCP2 imbalance, and then determines the best
1994  * IP/destination node combination to move from the source node.
1995  */
1996 static void lcp2_failback(struct ctdb_context *ctdb,
1997                           struct ctdb_ipflags *ipflags,
1998                           struct ctdb_public_ip_list *all_ips,
1999                           uint32_t *lcp2_imbalances,
2000                           bool *rebalance_candidates)
2001 {
2002         int i, numnodes;
2003         struct lcp2_imbalance_pnn * lips;
2004         bool again;
2005
2006         numnodes = talloc_array_length(ipflags);
2007
2008 try_again:
2009         /* Put the imbalances and nodes into an array, sort them and
2010          * iterate through candidates.  Usually the 1st one will be
2011          * used, so this doesn't cost much...
2012          */
2013         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2014         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2015         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2016         for (i=0; i<numnodes; i++) {
2017                 lips[i].imbalance = lcp2_imbalances[i];
2018                 lips[i].pnn = i;
2019                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2020         }
2021         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2022               lcp2_cmp_imbalance_pnn);
2023
2024         again = false;
2025         for (i=0; i<numnodes; i++) {
2026                 /* This means that all nodes had 0 or 1 addresses, so
2027                  * can't be imbalanced.
2028                  */
2029                 if (lips[i].imbalance == 0) {
2030                         break;
2031                 }
2032
2033                 if (lcp2_failback_candidate(ctdb,
2034                                             ipflags,
2035                                             all_ips,
2036                                             lips[i].pnn,
2037                                             lcp2_imbalances,
2038                                             rebalance_candidates)) {
2039                         again = true;
2040                         break;
2041                 }
2042         }
2043
2044         talloc_free(lips);
2045         if (again) {
2046                 goto try_again;
2047         }
2048 }
2049
2050 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2051                                     struct ctdb_ipflags *ipflags,
2052                                     struct ctdb_public_ip_list *all_ips)
2053 {
2054         struct ctdb_public_ip_list *tmp_ip;
2055
2056         /* verify that the assigned nodes can serve that public ip
2057            and set it to -1 if not
2058         */
2059         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2060                 if (tmp_ip->pnn == -1) {
2061                         continue;
2062                 }
2063                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2064                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2065                         /* this node can not serve this ip. */
2066                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2067                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2068                                            tmp_ip->pnn));
2069                         tmp_ip->pnn = -1;
2070                 }
2071         }
2072 }
2073
2074 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2075                                        struct ctdb_ipflags *ipflags,
2076                                        struct ctdb_public_ip_list *all_ips)
2077 {
2078         struct ctdb_public_ip_list *tmp_ip;
2079         int i, numnodes;
2080
2081         numnodes = talloc_array_length(ipflags);
2082
2083         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2084        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2085         *  always be allocated the same way for a specific set of
2086         *  available/unavailable nodes.
2087         */
2088
2089         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2090                 tmp_ip->pnn = i % numnodes;
2091         }
2092
2093         /* IP failback doesn't make sense with deterministic
2094          * IPs, since the modulo step above implicitly fails
2095          * back IPs to their "home" node.
2096          */
2097         if (1 == ctdb->tunable.no_ip_failback) {
2098                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2099         }
2100
2101         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2102
2103         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2104
2105         /* No failback here! */
2106 }
2107
2108 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2109                                           struct ctdb_ipflags *ipflags,
2110                                           struct ctdb_public_ip_list *all_ips)
2111 {
2112         /* This should be pushed down into basic_failback. */
2113         struct ctdb_public_ip_list *tmp_ip;
2114         int num_ips = 0;
2115         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2116                 num_ips++;
2117         }
2118
2119         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2120
2121         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2122
2123         /* If we don't want IPs to fail back then don't rebalance IPs. */
2124         if (1 == ctdb->tunable.no_ip_failback) {
2125                 return;
2126         }
2127
2128         /* Now, try to make sure the ip adresses are evenly distributed
2129            across the nodes.
2130         */
2131         basic_failback(ctdb, ipflags, all_ips, num_ips);
2132 }
2133
2134 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2135                           struct ctdb_ipflags *ipflags,
2136                           struct ctdb_public_ip_list *all_ips,
2137                           uint32_t *force_rebalance_nodes)
2138 {
2139         uint32_t *lcp2_imbalances;
2140         bool *rebalance_candidates;
2141         int numnodes, num_rebalance_candidates, i;
2142
2143         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2144
2145         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2146
2147         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2148                   &lcp2_imbalances, &rebalance_candidates);
2149
2150         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2151
2152         /* If we don't want IPs to fail back then don't rebalance IPs. */
2153         if (1 == ctdb->tunable.no_ip_failback) {
2154                 goto finished;
2155         }
2156
2157         /* It is only worth continuing if we have suitable target
2158          * nodes to transfer IPs to.  This check is much cheaper than
2159          * continuing on...
2160          */
2161         numnodes = talloc_array_length(ipflags);
2162         num_rebalance_candidates = 0;
2163         for (i=0; i<numnodes; i++) {
2164                 if (rebalance_candidates[i]) {
2165                         num_rebalance_candidates++;
2166                 }
2167         }
2168         if (num_rebalance_candidates == 0) {
2169                 goto finished;
2170         }
2171
2172         /* Now, try to make sure the ip adresses are evenly distributed
2173            across the nodes.
2174         */
2175         lcp2_failback(ctdb, ipflags, all_ips,
2176                       lcp2_imbalances, rebalance_candidates);
2177
2178 finished:
2179         talloc_free(tmp_ctx);
2180 }
2181
2182 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2183 {
2184         int i, num_healthy;
2185
2186         /* Count how many completely healthy nodes we have */
2187         num_healthy = 0;
2188         for (i=0;i<nodemap->num;i++) {
2189                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2190                         num_healthy++;
2191                 }
2192         }
2193
2194         return num_healthy == 0;
2195 }
2196
2197 /* The calculation part of the IP allocation algorithm. */
2198 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2199                                    struct ctdb_ipflags *ipflags,
2200                                    struct ctdb_public_ip_list **all_ips_p,
2201                                    uint32_t *force_rebalance_nodes)
2202 {
2203         /* since nodes only know about those public addresses that
2204            can be served by that particular node, no single node has
2205            a full list of all public addresses that exist in the cluster.
2206            Walk over all node structures and create a merged list of
2207            all public addresses that exist in the cluster.
2208
2209            keep the tree of ips around as ctdb->ip_tree
2210         */
2211         *all_ips_p = create_merged_ip_list(ctdb);
2212
2213         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2214                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2215         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2216                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2217         } else {
2218                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2219         }
2220
2221         /* at this point ->pnn is the node which will own each IP
2222            or -1 if there is no node that can cover this ip
2223         */
2224
2225         return;
2226 }
2227
/* State shared between get_tunable_from_nodes() and the callbacks it
 * registers for the GET_TUNABLE control.
 */
struct get_tunable_callback_data {
        const char *tunable;    /* tunable name, used in log messages */
        uint32_t *out;          /* talloc array of values, indexed by pnn */
        bool fatal;             /* set by a callback on an unrecoverable error */
};
2233
2234 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2235                                  int32_t res, TDB_DATA outdata,
2236                                  void *callback)
2237 {
2238         struct get_tunable_callback_data *cd =
2239                 (struct get_tunable_callback_data *)callback;
2240         int size;
2241
2242         if (res != 0) {
2243                 /* Already handled in fail callback */
2244                 return;
2245         }
2246
2247         if (outdata.dsize != sizeof(uint32_t)) {
2248                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2249                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2250                                  (int)outdata.dsize));
2251                 cd->fatal = true;
2252                 return;
2253         }
2254
2255         size = talloc_array_length(cd->out);
2256         if (pnn >= size) {
2257                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2258                                  cd->tunable, pnn, size));
2259                 return;
2260         }
2261
2262                 
2263         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2264 }
2265
2266 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2267                                        int32_t res, TDB_DATA outdata,
2268                                        void *callback)
2269 {
2270         struct get_tunable_callback_data *cd =
2271                 (struct get_tunable_callback_data *)callback;
2272
2273         switch (res) {
2274         case -ETIME:
2275                 DEBUG(DEBUG_ERR,
2276                       ("Timed out getting tunable \"%s\" from node %d\n",
2277                        cd->tunable, pnn));
2278                 cd->fatal = true;
2279                 break;
2280         case -EINVAL:
2281         case -1:
2282                 DEBUG(DEBUG_WARNING,
2283                       ("Tunable \"%s\" not implemented on node %d\n",
2284                        cd->tunable, pnn));
2285                 break;
2286         default:
2287                 DEBUG(DEBUG_ERR,
2288                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2289                        cd->tunable, pnn));
2290                 cd->fatal = true;
2291         }
2292 }
2293
2294 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2295                                         TALLOC_CTX *tmp_ctx,
2296                                         struct ctdb_node_map *nodemap,
2297                                         const char *tunable,
2298                                         uint32_t default_value)
2299 {
2300         TDB_DATA data;
2301         struct ctdb_control_get_tunable *t;
2302         uint32_t *nodes;
2303         uint32_t *tvals;
2304         struct get_tunable_callback_data callback_data;
2305         int i;
2306
2307         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2308         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2309         for (i=0; i<nodemap->num; i++) {
2310                 tvals[i] = default_value;
2311         }
2312                 
2313         callback_data.out = tvals;
2314         callback_data.tunable = tunable;
2315         callback_data.fatal = false;
2316
2317         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2318         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2319         t = (struct ctdb_control_get_tunable *)data.dptr;
2320         t->length = strlen(tunable)+1;
2321         memcpy(t->name, tunable, t->length);
2322         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2323         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2324                                       nodes, 0, TAKEOVER_TIMEOUT(),
2325                                       false, data,
2326                                       get_tunable_callback,
2327                                       get_tunable_fail_callback,
2328                                       &callback_data) != 0) {
2329                 if (callback_data.fatal) {
2330                         talloc_free(tvals);
2331                         tvals = NULL;
2332                 }
2333         }
2334         talloc_free(nodes);
2335         talloc_free(data.dptr);
2336
2337         return tvals;
2338 }
2339
/* State handed to the runstate callbacks through their opaque
 * callback argument.
 */
struct get_runstate_callback_data {
        enum ctdb_runstate *out;        /* talloc array of runstates, indexed by pnn */
        bool fatal;                     /* set by a callback on an unrecoverable error */
};
2344
2345 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2346                                   int32_t res, TDB_DATA outdata,
2347                                   void *callback_data)
2348 {
2349         struct get_runstate_callback_data *cd =
2350                 (struct get_runstate_callback_data *)callback_data;
2351         int size;
2352
2353         if (res != 0) {
2354                 /* Already handled in fail callback */
2355                 return;
2356         }
2357
2358         if (outdata.dsize != sizeof(uint32_t)) {
2359                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2360                                  pnn, (int)sizeof(uint32_t),
2361                                  (int)outdata.dsize));
2362                 cd->fatal = true;
2363                 return;
2364         }
2365
2366         size = talloc_array_length(cd->out);
2367         if (pnn >= size) {
2368                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2369                                  pnn, size));
2370                 return;
2371         }
2372
2373         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2374 }
2375
2376 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2377                                        int32_t res, TDB_DATA outdata,
2378                                        void *callback)
2379 {
2380         struct get_runstate_callback_data *cd =
2381                 (struct get_runstate_callback_data *)callback;
2382
2383         switch (res) {
2384         case -ETIME:
2385                 DEBUG(DEBUG_ERR,
2386                       ("Timed out getting runstate from node %d\n", pnn));
2387                 cd->fatal = true;
2388                 break;
2389         default:
2390                 DEBUG(DEBUG_WARNING,
2391                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2392                        pnn));
2393         }
2394 }
2395
2396 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2397                                                     TALLOC_CTX *tmp_ctx,
2398                                                     struct ctdb_node_map *nodemap,
2399                                                     enum ctdb_runstate default_value)
2400 {
2401         uint32_t *nodes;
2402         enum ctdb_runstate *rs;
2403         struct get_runstate_callback_data callback_data;
2404         int i;
2405
2406         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2407         CTDB_NO_MEMORY_NULL(ctdb, rs);
2408         for (i=0; i<nodemap->num; i++) {
2409                 rs[i] = default_value;
2410         }
2411
2412         callback_data.out = rs;
2413         callback_data.fatal = false;
2414
2415         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2416         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2417                                       nodes, 0, TAKEOVER_TIMEOUT(),
2418                                       true, tdb_null,
2419                                       get_runstate_callback,
2420                                       get_runstate_fail_callback,
2421                                       &callback_data) != 0) {
2422                 if (callback_data.fatal) {
2423                         free(rs);
2424                         rs = NULL;
2425                 }
2426         }
2427         talloc_free(nodes);
2428
2429         return rs;
2430 }
2431
2432 /* Set internal flags for IP allocation:
2433  *   Clear ip flags
2434  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2435  *   Set NOIPHOST ip flag for each INACTIVE node
2436  *   if all nodes are disabled:
2437  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2438  *   else
2439  *     Set NOIPHOST ip flags for disabled nodes
2440  */
2441 static struct ctdb_ipflags *
2442 set_ipflags_internal(struct ctdb_context *ctdb,
2443                      TALLOC_CTX *tmp_ctx,
2444                      struct ctdb_node_map *nodemap,
2445                      uint32_t *tval_noiptakeover,
2446                      uint32_t *tval_noiphostonalldisabled,
2447                      enum ctdb_runstate *runstate)
2448 {
2449         int i;
2450         struct ctdb_ipflags *ipflags;
2451
2452         /* Clear IP flags - implicit due to talloc_zero */
2453         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2454         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2455
2456         for (i=0;i<nodemap->num;i++) {
2457                 /* Can not take IPs on node with NoIPTakeover set */
2458                 if (tval_noiptakeover[i] != 0) {
2459                         ipflags[i].noiptakeover = true;
2460                 }
2461
2462                 /* Can not host IPs on node not in RUNNING state */
2463                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2464                         ipflags[i].noiphost = true;
2465                         continue;
2466                 }
2467                 /* Can not host IPs on INACTIVE node */
2468                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2469                         ipflags[i].noiphost = true;
2470                 }
2471         }
2472
2473         if (all_nodes_are_disabled(nodemap)) {
2474                 /* If all nodes are disabled, can not host IPs on node
2475                  * with NoIPHostOnAllDisabled set
2476                  */
2477                 for (i=0;i<nodemap->num;i++) {
2478                         if (tval_noiphostonalldisabled[i] != 0) {
2479                                 ipflags[i].noiphost = true;
2480                         }
2481                 }
2482         } else {
2483                 /* If some nodes are not disabled, then can not host
2484                  * IPs on DISABLED node
2485                  */
2486                 for (i=0;i<nodemap->num;i++) {
2487                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2488                                 ipflags[i].noiphost = true;
2489                         }
2490                 }
2491         }
2492
2493         return ipflags;
2494 }
2495
2496 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2497                                         TALLOC_CTX *tmp_ctx,
2498                                         struct ctdb_node_map *nodemap)
2499 {
2500         uint32_t *tval_noiptakeover;
2501         uint32_t *tval_noiphostonalldisabled;
2502         struct ctdb_ipflags *ipflags;
2503         enum ctdb_runstate *runstate;
2504
2505
2506         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2507                                                    "NoIPTakeover", 0);
2508         if (tval_noiptakeover == NULL) {
2509                 return NULL;
2510         }
2511
2512         tval_noiphostonalldisabled =
2513                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2514                                        "NoIPHostOnAllDisabled", 0);
2515         if (tval_noiphostonalldisabled == NULL) {
2516                 /* Caller frees tmp_ctx */
2517                 return NULL;
2518         }
2519
2520         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2521          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2522          * reasonable behaviour on a mixed cluster during upgrade.
2523          */
2524         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2525                                            CTDB_RUNSTATE_RUNNING);
2526         if (runstate == NULL) {
2527                 /* Caller frees tmp_ctx */
2528                 return NULL;
2529         }
2530
2531         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2532                                        tval_noiptakeover,
2533                                        tval_noiphostonalldisabled,
2534                                        runstate);
2535
2536         talloc_free(tval_noiptakeover);
2537         talloc_free(tval_noiphostonalldisabled);
2538         talloc_free(runstate);
2539
2540         return ipflags;
2541 }
2542
2543 struct iprealloc_callback_data {
2544         bool *retry_nodes;
2545         int retry_count;
2546         client_async_callback fail_callback;
2547         void *fail_callback_data;
2548         struct ctdb_node_map *nodemap;
2549 };
2550
2551 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2552                                         int32_t res, TDB_DATA outdata,
2553                                         void *callback)
2554 {
2555         int numnodes;
2556         struct iprealloc_callback_data *cd =
2557                 (struct iprealloc_callback_data *)callback;
2558
2559         numnodes = talloc_array_length(cd->retry_nodes);
2560         if (pnn > numnodes) {
2561                 DEBUG(DEBUG_ERR,
2562                       ("ipreallocated failure from node %d, "
2563                        "but only %d nodes in nodemap\n",
2564                        pnn, numnodes));
2565                 return;
2566         }
2567
2568         /* Can't run the "ipreallocated" event on a INACTIVE node */
2569         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2570                 DEBUG(DEBUG_WARNING,
2571                       ("ipreallocated failed on inactive node %d, ignoring\n",
2572                        pnn));
2573                 return;
2574         }
2575
2576         switch (res) {
2577         case -ETIME:
2578                 /* If the control timed out then that's a real error,
2579                  * so call the real fail callback
2580                  */
2581                 if (cd->fail_callback) {
2582                         cd->fail_callback(ctdb, pnn, res, outdata,
2583                                           cd->fail_callback_data);
2584                 } else {
2585                         DEBUG(DEBUG_WARNING,
2586                               ("iprealloc timed out but no callback registered\n"));
2587                 }
2588                 break;
2589         default:
2590                 /* If not a timeout then either the ipreallocated
2591                  * eventscript (or some setup) failed.  This might
2592                  * have failed because the IPREALLOCATED control isn't
2593                  * implemented - right now there is no way of knowing
2594                  * because the error codes are all folded down to -1.
2595                  * Consider retrying using EVENTSCRIPT control...
2596                  */
2597                 DEBUG(DEBUG_WARNING,
2598                       ("ipreallocated failure from node %d, flagging retry\n",
2599                        pnn));
2600                 cd->retry_nodes[pnn] = true;
2601                 cd->retry_count++;
2602         }
2603 }
2604
2605 struct takeover_callback_data {
2606         bool *node_failed;
2607         client_async_callback fail_callback;
2608         void *fail_callback_data;
2609         struct ctdb_node_map *nodemap;
2610 };
2611
2612 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2613                                        uint32_t node_pnn, int32_t res,
2614                                        TDB_DATA outdata, void *callback_data)
2615 {
2616         struct takeover_callback_data *cd =
2617                 talloc_get_type_abort(callback_data,
2618                                       struct takeover_callback_data);
2619         int i;
2620
2621         for (i = 0; i < cd->nodemap->num; i++) {
2622                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2623                         break;
2624                 }
2625         }
2626
2627         if (i == cd->nodemap->num) {
2628                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2629                 return;
2630         }
2631
2632         if (!cd->node_failed[i]) {
2633                 cd->node_failed[i] = true;
2634                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2635                                   cd->fail_callback_data);
2636         }
2637 }
2638
2639 /*
2640   make any IP alias changes for public addresses that are necessary 
2641  */
2642 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2643                       uint32_t *force_rebalance_nodes,
2644                       client_async_callback fail_callback, void *callback_data)
2645 {
2646         int i, j, ret;
2647         struct ctdb_public_ip ip;
2648         struct ctdb_public_ipv4 ipv4;
2649         uint32_t *nodes;
2650         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2651         TDB_DATA data;
2652         struct timeval timeout;
2653         struct client_async_data *async_data;
2654         struct ctdb_client_control_state *state;
2655         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2656         struct ctdb_ipflags *ipflags;
2657         struct takeover_callback_data *takeover_data;
2658         struct iprealloc_callback_data iprealloc_data;
2659         bool *retry_data;
2660
2661         /*
2662          * ip failover is completely disabled, just send out the 
2663          * ipreallocated event.
2664          */
2665         if (ctdb->tunable.disable_ip_failover != 0) {
2666                 goto ipreallocated;
2667         }
2668
2669         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2670         if (ipflags == NULL) {
2671                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2672                 talloc_free(tmp_ctx);
2673                 return -1;
2674         }
2675
2676         ZERO_STRUCT(ip);
2677
2678         /* Do the IP reassignment calculations */
2679         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2680
2681         /* Now tell all nodes to release any public IPs should not
2682          * host.  This will be a NOOP on nodes that don't currently
2683          * hold the given IP.
2684          */
2685         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2686         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2687
2688         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2689                                                        bool, nodemap->num);
2690         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2691         takeover_data->fail_callback = fail_callback;
2692         takeover_data->fail_callback_data = callback_data;
2693         takeover_data->nodemap = nodemap;
2694
2695         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2696         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2697
2698         async_data->fail_callback = takeover_run_fail_callback;
2699         async_data->callback_data = takeover_data;
2700
2701         for (i=0;i<nodemap->num;i++) {
2702                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2703                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2704                         continue;
2705                 }
2706
2707                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2708                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2709                                 /* This node should be serving this
2710                                    vnn so dont tell it to release the ip
2711                                 */
2712                                 continue;
2713                         }
2714                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2715                                 ipv4.pnn = tmp_ip->pnn;
2716                                 ipv4.sin = tmp_ip->addr.ip;
2717
2718                                 timeout = TAKEOVER_TIMEOUT();
2719                                 data.dsize = sizeof(ipv4);
2720                                 data.dptr  = (uint8_t *)&ipv4;
2721                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2722                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2723                                                 data, async_data,
2724                                                 &timeout, NULL);
2725                         } else {
2726                                 ip.pnn  = tmp_ip->pnn;
2727                                 ip.addr = tmp_ip->addr;
2728
2729                                 timeout = TAKEOVER_TIMEOUT();
2730                                 data.dsize = sizeof(ip);
2731                                 data.dptr  = (uint8_t *)&ip;
2732                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2733                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2734                                                 data, async_data,
2735                                                 &timeout, NULL);
2736                         }
2737
2738                         if (state == NULL) {
2739                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2740                                 talloc_free(tmp_ctx);
2741                                 return -1;
2742                         }
2743                 
2744                         ctdb_client_async_add(async_data, state);
2745                 }
2746         }
2747         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2748                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2749                 talloc_free(tmp_ctx);
2750                 return -1;
2751         }
2752         talloc_free(async_data);
2753
2754
2755         /* tell all nodes to get their own IPs */
2756         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2757         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2758
2759         async_data->fail_callback = fail_callback;
2760         async_data->callback_data = callback_data;
2761
2762         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2763                 if (tmp_ip->pnn == -1) {
2764                         /* this IP won't be taken over */
2765                         continue;
2766                 }
2767
2768                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2769                         ipv4.pnn = tmp_ip->pnn;
2770                         ipv4.sin = tmp_ip->addr.ip;
2771
2772                         timeout = TAKEOVER_TIMEOUT();
2773                         data.dsize = sizeof(ipv4);
2774                         data.dptr  = (uint8_t *)&ipv4;
2775                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2776                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2777                                         data, async_data,
2778                                         &timeout, NULL);
2779                 } else {
2780                         ip.pnn  = tmp_ip->pnn;
2781                         ip.addr = tmp_ip->addr;
2782
2783                         timeout = TAKEOVER_TIMEOUT();
2784                         data.dsize = sizeof(ip);
2785                         data.dptr  = (uint8_t *)&ip;
2786                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2787                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2788                                         data, async_data,
2789                                         &timeout, NULL);
2790                 }
2791                 if (state == NULL) {
2792                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2793                         talloc_free(tmp_ctx);
2794                         return -1;
2795                 }
2796                 
2797                 ctdb_client_async_add(async_data, state);
2798         }
2799         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2800                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2801                 talloc_free(tmp_ctx);
2802                 return -1;
2803         }
2804
2805 ipreallocated:
2806         /* 
2807          * Tell all nodes to run eventscripts to process the
2808          * "ipreallocated" event.  This can do a lot of things,
2809          * including restarting services to reconfigure them if public
2810          * IPs have moved.  Once upon a time this event only used to
2811          * update natwg.
2812          */
2813         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2814         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2815         iprealloc_data.retry_nodes = retry_data;
2816         iprealloc_data.retry_count = 0;
2817         iprealloc_data.fail_callback = fail_callback;
2818         iprealloc_data.fail_callback_data = callback_data;
2819         iprealloc_data.nodemap = nodemap;
2820
2821         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2822         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2823                                         nodes, 0, TAKEOVER_TIMEOUT(),
2824                                         false, tdb_null,
2825                                         NULL, iprealloc_fail_callback,
2826                                         &iprealloc_data);
2827         if (ret != 0) {
2828                 /* If the control failed then we should retry to any
2829                  * nodes flagged by iprealloc_fail_callback using the
2830                  * EVENTSCRIPT control.  This is a best-effort at
2831                  * backward compatiblity when running a mixed cluster
2832                  * where some nodes have not yet been upgraded to
2833                  * support the IPREALLOCATED control.
2834                  */
2835                 DEBUG(DEBUG_WARNING,
2836                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2837
2838                 nodes = talloc_array(tmp_ctx, uint32_t,
2839                                      iprealloc_data.retry_count);
2840                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2841
2842                 j = 0;
2843                 for (i=0; i<nodemap->num; i++) {
2844                         if (iprealloc_data.retry_nodes[i]) {
2845                                 nodes[j] = i;
2846                                 j++;
2847                         }
2848                 }
2849
2850                 data.dptr  = discard_const("ipreallocated");
2851                 data.dsize = strlen((char *)data.dptr) + 1; 
2852                 ret = ctdb_client_async_control(ctdb,
2853                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2854                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2855                                                 false, data,
2856                                                 NULL, fail_callback,
2857                                                 callback_data);
2858                 if (ret != 0) {
2859                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2860                 }
2861         }
2862
2863         talloc_free(tmp_ctx);
2864         return ret;
2865 }
2866
2867
2868 /*
2869   destroy a ctdb_client_ip structure
2870  */
2871 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2872 {
2873         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2874                 ctdb_addr_to_str(&ip->addr),
2875                 ntohs(ip->addr.ip.sin_port),
2876                 ip->client_id));
2877
2878         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2879         return 0;
2880 }
2881
2882 /*
2883   called by a client to inform us of a TCP connection that it is managing
2884   that should tickled with an ACK when IP takeover is done
2885   we handle both the old ipv4 style of packets as well as the new ipv4/6
2886   pdus.
2887  */
2888 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2889                                 TDB_DATA indata)
2890 {
2891         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2892         struct ctdb_control_tcp *old_addr = NULL;
2893         struct ctdb_control_tcp_addr new_addr;
2894         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2895         struct ctdb_tcp_list *tcp;
2896         struct ctdb_tcp_connection t;
2897         int ret;
2898         TDB_DATA data;
2899         struct ctdb_client_ip *ip;
2900         struct ctdb_vnn *vnn;
2901         ctdb_sock_addr addr;
2902
2903         /* If we don't have public IPs, tickles are useless */
2904         if (ctdb->vnn == NULL) {
2905                 return 0;
2906         }
2907
2908         switch (indata.dsize) {
2909         case sizeof(struct ctdb_control_tcp):
2910                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2911                 ZERO_STRUCT(new_addr);
2912                 tcp_sock = &new_addr;
2913                 tcp_sock->src.ip  = old_addr->src;
2914                 tcp_sock->dest.ip = old_addr->dest;
2915                 break;
2916         case sizeof(struct ctdb_control_tcp_addr):
2917                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2918                 break;
2919         default:
2920                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2921                                  "to ctdb_control_tcp_client. size was %d but "
2922                                  "only allowed sizes are %lu and %lu\n",
2923                                  (int)indata.dsize,
2924                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2925                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2926                 return -1;
2927         }
2928
2929         addr = tcp_sock->src;
2930         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2931         addr = tcp_sock->dest;
2932         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2933
2934         ZERO_STRUCT(addr);
2935         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2936         vnn = find_public_ip_vnn(ctdb, &addr);
2937         if (vnn == NULL) {
2938                 switch (addr.sa.sa_family) {
2939                 case AF_INET:
2940                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2941                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2942                                         ctdb_addr_to_str(&addr)));
2943                         }
2944                         break;
2945                 case AF_INET6:
2946                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2947                                 ctdb_addr_to_str(&addr)));
2948                         break;
2949                 default:
2950                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2951                 }
2952
2953                 return 0;
2954         }
2955
2956         if (vnn->pnn != ctdb->pnn) {
2957                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2958                         ctdb_addr_to_str(&addr),
2959                         client_id, client->pid));
2960                 /* failing this call will tell smbd to die */
2961                 return -1;
2962         }
2963
2964         ip = talloc(client, struct ctdb_client_ip);
2965         CTDB_NO_MEMORY(ctdb, ip);
2966
2967         ip->ctdb      = ctdb;
2968         ip->addr      = addr;
2969         ip->client_id = client_id;
2970         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2971         DLIST_ADD(ctdb->client_ip_list, ip);
2972
2973         tcp = talloc(client, struct ctdb_tcp_list);
2974         CTDB_NO_MEMORY(ctdb, tcp);
2975
2976         tcp->connection.src_addr = tcp_sock->src;
2977         tcp->connection.dst_addr = tcp_sock->dest;
2978
2979         DLIST_ADD(client->tcp_list, tcp);
2980
2981         t.src_addr = tcp_sock->src;
2982         t.dst_addr = tcp_sock->dest;
2983
2984         data.dptr = (uint8_t *)&t;
2985         data.dsize = sizeof(t);
2986
2987         switch (addr.sa.sa_family) {
2988         case AF_INET:
2989                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2990                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2991                         ctdb_addr_to_str(&tcp_sock->src),
2992                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2993                 break;
2994         case AF_INET6:
2995                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2996                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2997                         ctdb_addr_to_str(&tcp_sock->src),
2998                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2999                 break;
3000         default:
3001                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3002         }
3003
3004
3005         /* tell all nodes about this tcp connection */
3006         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3007                                        CTDB_CONTROL_TCP_ADD,
3008                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3009         if (ret != 0) {
3010                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3011                 return -1;
3012         }
3013
3014         return 0;
3015 }
3016
3017 /*
3018   find a tcp address on a list
3019  */
3020 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3021                                            struct ctdb_tcp_connection *tcp)
3022 {
3023         int i;
3024
3025         if (array == NULL) {
3026                 return NULL;
3027         }
3028
3029         for (i=0;i<array->num;i++) {
3030                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3031                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3032                         return &array->connections[i];
3033                 }
3034         }
3035         return NULL;
3036 }
3037
3038
3039
3040 /*
3041   called by a daemon to inform us of a TCP connection that one of its
3042   clients managing that should tickled with an ACK when IP takeover is
3043   done
3044  */
3045 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3046 {
3047         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3048         struct ctdb_tcp_array *tcparray;
3049         struct ctdb_tcp_connection tcp;
3050         struct ctdb_vnn *vnn;
3051
3052         /* If we don't have public IPs, tickles are useless */
3053         if (ctdb->vnn == NULL) {
3054                 return 0;
3055         }
3056
3057         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3058         if (vnn == NULL) {
3059                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3060                         ctdb_addr_to_str(&p->dst_addr)));
3061
3062                 return -1;
3063         }
3064
3065
3066         tcparray = vnn->tcp_array;
3067
3068         /* If this is the first tickle */
3069         if (tcparray == NULL) {
3070                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3071                 CTDB_NO_MEMORY(ctdb, tcparray);
3072                 vnn->tcp_array = tcparray;
3073
3074                 tcparray->num = 0;
3075                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3076                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3077
3078                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3079                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3080                 tcparray->num++;
3081
3082                 if (tcp_update_needed) {
3083                         vnn->tcp_update_needed = true;
3084                 }
3085                 return 0;
3086         }
3087
3088
3089         /* Do we already have this tickle ?*/
3090         tcp.src_addr = p->src_addr;
3091         tcp.dst_addr = p->dst_addr;
3092         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3093                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3094                         ctdb_addr_to_str(&tcp.dst_addr),
3095                         ntohs(tcp.dst_addr.ip.sin_port),
3096                         vnn->pnn));
3097                 return 0;
3098         }
3099
3100         /* A new tickle, we must add it to the array */
3101         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3102                                         struct ctdb_tcp_connection,
3103                                         tcparray->num+1);
3104         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3105
3106         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3107         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3108         tcparray->num++;
3109
3110         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3111                 ctdb_addr_to_str(&tcp.dst_addr),
3112                 ntohs(tcp.dst_addr.ip.sin_port),
3113                 vnn->pnn));
3114
3115         if (tcp_update_needed) {
3116                 vnn->tcp_update_needed = true;
3117         }
3118
3119         return 0;
3120 }
3121
3122
3123 /*
3124   called by a daemon to inform us of a TCP connection that one of its
3125   clients managing that should tickled with an ACK when IP takeover is
3126   done
3127  */
3128 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3129 {
3130         struct ctdb_tcp_connection *tcpp;
3131         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3132
3133         if (vnn == NULL) {
3134                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3135                         ctdb_addr_to_str(&conn->dst_addr)));
3136                 return;
3137         }
3138
3139         /* if the array is empty we cant remove it
3140            and we dont need to do anything
3141          */
3142         if (vnn->tcp_array == NULL) {
3143                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3144                         ctdb_addr_to_str(&conn->dst_addr),
3145                         ntohs(conn->dst_addr.ip.sin_port)));
3146                 return;
3147         }
3148
3149
3150         /* See if we know this connection
3151            if we dont know this connection  then we dont need to do anything
3152          */
3153         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3154         if (tcpp == NULL) {
3155                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3156                         ctdb_addr_to_str(&conn->dst_addr),
3157                         ntohs(conn->dst_addr.ip.sin_port)));
3158                 return;
3159         }
3160
3161
3162         /* We need to remove this entry from the array.
3163            Instead of allocating a new array and copying data to it
3164            we cheat and just copy the last entry in the existing array
3165            to the entry that is to be removed and just shring the 
3166            ->num field
3167          */
3168         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3169         vnn->tcp_array->num--;
3170
3171         /* If we deleted the last entry we also need to remove the entire array
3172          */
3173         if (vnn->tcp_array->num == 0) {
3174                 talloc_free(vnn->tcp_array);
3175                 vnn->tcp_array = NULL;
3176         }               
3177
3178         vnn->tcp_update_needed = true;
3179
3180         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3181                 ctdb_addr_to_str(&conn->src_addr),
3182                 ntohs(conn->src_addr.ip.sin_port)));
3183 }
3184
3185
3186 /*
3187   called by a daemon to inform us of a TCP connection that one of its
3188   clients used are no longer needed in the tickle database
3189  */
3190 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3191 {
3192         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3193
3194         /* If we don't have public IPs, tickles are useless */
3195         if (ctdb->vnn == NULL) {
3196                 return 0;
3197         }
3198
3199         ctdb_remove_tcp_connection(ctdb, conn);
3200
3201         return 0;
3202 }
3203
3204
3205 /*
3206   Called when another daemon starts - caises all tickles for all
3207   public addresses we are serving to be sent to the new node on the
3208   next check.  This actually causes the next scheduled call to
3209   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3210   doesn't require careful error handling.
3211  */
3212 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3213 {
3214         struct ctdb_vnn *vnn;
3215
3216         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3217                 vnn->tcp_update_needed = true;
3218         }
3219
3220         return 0;
3221 }
3222
3223
3224 /*
3225   called when a client structure goes away - hook to remove
3226   elements from the tcp_list in all daemons
3227  */
3228 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3229 {
3230         while (client->tcp_list) {
3231                 struct ctdb_tcp_list *tcp = client->tcp_list;
3232                 DLIST_REMOVE(client->tcp_list, tcp);
3233                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3234         }
3235 }
3236
3237
3238 /*
3239   release all IPs on shutdown
3240  */
3241 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3242 {
3243         struct ctdb_vnn *vnn;
3244         int count = 0;
3245
3246         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3247                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3248                         ctdb_vnn_unassign_iface(ctdb, vnn);
3249                         continue;
3250                 }
3251                 if (!vnn->iface) {
3252                         continue;
3253                 }
3254
3255                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3256                                     ctdb_addr_to_str(&vnn->public_address),
3257                                     vnn->public_netmask_bits,
3258                                     ctdb_vnn_iface_string(vnn)));
3259
3260                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3261                                   ctdb_vnn_iface_string(vnn),
3262                                   ctdb_addr_to_str(&vnn->public_address),
3263                                   vnn->public_netmask_bits);
3264                 release_kill_clients(ctdb, &vnn->public_address);
3265                 ctdb_vnn_unassign_iface(ctdb, vnn);
3266                 count++;
3267         }
3268
3269         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3270 }
3271
3272
3273 /*
3274   get list of public IPs
3275  */
3276 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3277                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3278 {
3279         int i, num, len;
3280         struct ctdb_all_public_ips *ips;
3281         struct ctdb_vnn *vnn;
3282         bool only_available = false;
3283
3284         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3285                 only_available = true;
3286         }
3287
3288         /* count how many public ip structures we have */
3289         num = 0;
3290         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3291                 num++;
3292         }
3293
3294         len = offsetof(struct ctdb_all_public_ips, ips) + 
3295                 num*sizeof(struct ctdb_public_ip);
3296         ips = talloc_zero_size(outdata, len);
3297         CTDB_NO_MEMORY(ctdb, ips);
3298
3299         i = 0;
3300         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3301                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3302                         continue;
3303                 }
3304                 ips->ips[i].pnn  = vnn->pnn;
3305                 ips->ips[i].addr = vnn->public_address;
3306                 i++;
3307         }
3308         ips->num = i;
3309         len = offsetof(struct ctdb_all_public_ips, ips) +
3310                 i*sizeof(struct ctdb_public_ip);
3311
3312         outdata->dsize = len;
3313         outdata->dptr  = (uint8_t *)ips;
3314
3315         return 0;
3316 }
3317
3318
3319 /*
3320   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3321  */
3322 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3323                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3324 {
3325         int i, num, len;
3326         struct ctdb_all_public_ipsv4 *ips;
3327         struct ctdb_vnn *vnn;
3328
3329         /* count how many public ip structures we have */
3330         num = 0;
3331         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3332                 if (vnn->public_address.sa.sa_family != AF_INET) {
3333                         continue;
3334                 }
3335                 num++;
3336         }
3337
3338         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3339                 num*sizeof(struct ctdb_public_ipv4);
3340         ips = talloc_zero_size(outdata, len);
3341         CTDB_NO_MEMORY(ctdb, ips);
3342
3343         outdata->dsize = len;
3344         outdata->dptr  = (uint8_t *)ips;
3345
3346         ips->num = num;
3347         i = 0;
3348         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3349                 if (vnn->public_address.sa.sa_family != AF_INET) {
3350                         continue;
3351                 }
3352                 ips->ips[i].pnn = vnn->pnn;
3353                 ips->ips[i].sin = vnn->public_address.ip;
3354                 i++;
3355         }
3356
3357         return 0;
3358 }
3359
3360 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3361                                         struct ctdb_req_control *c,
3362                                         TDB_DATA indata,
3363                                         TDB_DATA *outdata)
3364 {
3365         int i, num, len;
3366         ctdb_sock_addr *addr;
3367         struct ctdb_control_public_ip_info *info;
3368         struct ctdb_vnn *vnn;
3369
3370         addr = (ctdb_sock_addr *)indata.dptr;
3371
3372         vnn = find_public_ip_vnn(ctdb, addr);
3373         if (vnn == NULL) {
3374                 /* if it is not a public ip   it could be our 'single ip' */
3375                 if (ctdb->single_ip_vnn) {
3376                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3377                                 vnn = ctdb->single_ip_vnn;
3378                         }
3379                 }
3380         }
3381         if (vnn == NULL) {
3382                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3383                                  "'%s'not a public address\n",
3384                                  ctdb_addr_to_str(addr)));
3385                 return -1;
3386         }
3387
3388         /* count how many public ip structures we have */
3389         num = 0;
3390         for (;vnn->ifaces[num];) {
3391                 num++;
3392         }
3393
3394         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3395                 num*sizeof(struct ctdb_control_iface_info);
3396         info = talloc_zero_size(outdata, len);
3397         CTDB_NO_MEMORY(ctdb, info);
3398
3399         info->ip.addr = vnn->public_address;
3400         info->ip.pnn = vnn->pnn;
3401         info->active_idx = 0xFFFFFFFF;
3402
3403         for (i=0; vnn->ifaces[i]; i++) {
3404                 struct ctdb_iface *cur;
3405
3406                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3407                 if (cur == NULL) {
3408                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3409                                            vnn->ifaces[i]));
3410                         return -1;
3411                 }
3412                 if (vnn->iface == cur) {
3413                         info->active_idx = i;
3414                 }
3415                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3416                 info->ifaces[i].link_state = cur->link_up;
3417                 info->ifaces[i].references = cur->references;
3418         }
3419         info->num = i;
3420         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3421                 i*sizeof(struct ctdb_control_iface_info);
3422
3423         outdata->dsize = len;
3424         outdata->dptr  = (uint8_t *)info;
3425
3426         return 0;
3427 }
3428
3429 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3430                                 struct ctdb_req_control *c,
3431                                 TDB_DATA *outdata)
3432 {
3433         int i, num, len;
3434         struct ctdb_control_get_ifaces *ifaces;
3435         struct ctdb_iface *cur;
3436
3437         /* count how many public ip structures we have */
3438         num = 0;
3439         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3440                 num++;
3441         }
3442
3443         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3444                 num*sizeof(struct ctdb_control_iface_info);
3445         ifaces = talloc_zero_size(outdata, len);
3446         CTDB_NO_MEMORY(ctdb, ifaces);
3447
3448         i = 0;
3449         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3450                 strcpy(ifaces->ifaces[i].name, cur->name);
3451                 ifaces->ifaces[i].link_state = cur->link_up;
3452                 ifaces->ifaces[i].references = cur->references;
3453                 i++;
3454         }
3455         ifaces->num = i;
3456         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3457                 i*sizeof(struct ctdb_control_iface_info);
3458
3459         outdata->dsize = len;
3460         outdata->dptr  = (uint8_t *)ifaces;
3461
3462         return 0;
3463 }
3464
3465 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3466                                     struct ctdb_req_control *c,
3467                                     TDB_DATA indata)
3468 {
3469         struct ctdb_control_iface_info *info;
3470         struct ctdb_iface *iface;
3471         bool link_up = false;
3472
3473         info = (struct ctdb_control_iface_info *)indata.dptr;
3474
3475         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3476                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3477                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3478                                   len, len, info->name));
3479                 return -1;
3480         }
3481
3482         switch (info->link_state) {
3483         case 0:
3484                 link_up = false;
3485                 break;
3486         case 1:
3487                 link_up = true;
3488                 break;
3489         default:
3490                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3491                                   (unsigned int)info->link_state));
3492                 return -1;
3493         }
3494
3495         if (info->references != 0) {
3496                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3497                                   (unsigned int)info->references));
3498                 return -1;
3499         }
3500
3501         iface = ctdb_find_iface(ctdb, info->name);
3502         if (iface == NULL) {
3503                 return -1;
3504         }
3505
3506         if (link_up == iface->link_up) {
3507                 return 0;
3508         }
3509
3510         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3511               ("iface[%s] has changed it's link status %s => %s\n",
3512                iface->name,
3513                iface->link_up?"up":"down",
3514                link_up?"up":"down"));
3515
3516         iface->link_up = link_up;
3517         return 0;
3518 }
3519
3520
3521 /* 
3522    structure containing the listening socket and the list of tcp connections
3523    that the ctdb daemon is to kill
3524 */
3525 struct ctdb_kill_tcp {
3526         struct ctdb_vnn *vnn;
3527         struct ctdb_context *ctdb;
3528         int capture_fd;
3529         struct fd_event *fde;
3530         trbt_tree_t *connections;
3531         void *private_data;
3532 };
3533
3534 /*
3535   a tcp connection that is to be killed
3536  */
3537 struct ctdb_killtcp_con {
3538         ctdb_sock_addr src_addr;
3539         ctdb_sock_addr dst_addr;
3540         int count;
3541         struct ctdb_kill_tcp *killtcp;
3542 };
3543
3544 /* this function is used to create a key to represent this socketpair
3545    in the killtcp tree.
3546    this key is used to insert and lookup matching socketpairs that are
3547    to be tickled and RST
3548 */
3549 #define KILLTCP_KEYLEN  10
3550 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3551 {
3552         static uint32_t key[KILLTCP_KEYLEN];
3553
3554         bzero(key, sizeof(key));
3555
3556         if (src->sa.sa_family != dst->sa.sa_family) {
3557                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3558                 return key;
3559         }
3560         
3561         switch (src->sa.sa_family) {
3562         case AF_INET:
3563                 key[0]  = dst->ip.sin_addr.s_addr;
3564                 key[1]  = src->ip.sin_addr.s_addr;
3565                 key[2]  = dst->ip.sin_port;
3566                 key[3]  = src->ip.sin_port;
3567                 break;
3568         case AF_INET6: {
3569                 uint32_t *dst6_addr32 =
3570                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3571                 uint32_t *src6_addr32 =
3572                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3573                 key[0]  = dst6_addr32[3];
3574                 key[1]  = src6_addr32[3];
3575                 key[2]  = dst6_addr32[2];
3576                 key[3]  = src6_addr32[2];
3577                 key[4]  = dst6_addr32[1];
3578                 key[5]  = src6_addr32[1];
3579                 key[6]  = dst6_addr32[0];
3580                 key[7]  = src6_addr32[0];
3581                 key[8]  = dst->ip6.sin6_port;
3582                 key[9]  = src->ip6.sin6_port;
3583                 break;
3584         }
3585         default:
3586                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3587                 return key;
3588         }
3589
3590         return key;
3591 }
3592
3593 /*
3594   called when we get a read event on the raw socket
3595  */
3596 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3597                                 uint16_t flags, void *private_data)
3598 {
3599         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3600         struct ctdb_killtcp_con *con;
3601         ctdb_sock_addr src, dst;
3602         uint32_t ack_seq, seq;
3603
3604         if (!(flags & EVENT_FD_READ)) {
3605                 return;
3606         }
3607
3608         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3609                                 killtcp->private_data,
3610                                 &src, &dst,
3611                                 &ack_seq, &seq) != 0) {
3612                 /* probably a non-tcp ACK packet */
3613                 return;
3614         }
3615
3616         /* check if we have this guy in our list of connections
3617            to kill
3618         */
3619         con = trbt_lookuparray32(killtcp->connections, 
3620                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3621         if (con == NULL) {
3622                 /* no this was some other packet we can just ignore */
3623                 return;
3624         }
3625
3626         /* This one has been tickled !
3627            now reset him and remove him from the list.
3628          */
3629         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3630                 ntohs(con->dst_addr.ip.sin_port),
3631                 ctdb_addr_to_str(&con->src_addr),
3632                 ntohs(con->src_addr.ip.sin_port)));
3633
3634         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3635         talloc_free(con);
3636 }
3637
3638
3639 /* when traversing the list of all tcp connections to send tickle acks to
3640    (so that we can capture the ack coming back and kill the connection
3641     by a RST)
3642    this callback is called for each connection we are currently trying to kill
3643 */
3644 static int tickle_connection_traverse(void *param, void *data)
3645 {
3646         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3647
3648         /* have tried too many times, just give up */
3649         if (con->count >= 5) {
3650                 /* can't delete in traverse: reparent to delete_cons */
3651                 talloc_steal(param, con);
3652                 return 0;
3653         }
3654
3655         /* othervise, try tickling it again */
3656         con->count++;
3657         ctdb_sys_send_tcp(
3658                 (ctdb_sock_addr *)&con->dst_addr,
3659                 (ctdb_sock_addr *)&con->src_addr,
3660                 0, 0, 0);
3661         return 0;
3662 }
3663
3664
3665 /* 
3666    called every second until all sentenced connections have been reset
3667  */
3668 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3669                                               struct timeval t, void *private_data)
3670 {
3671         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3672         void *delete_cons = talloc_new(NULL);
3673
3674         /* loop over all connections sending tickle ACKs */
3675         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3676
3677         /* now we've finished traverse, it's safe to do deletion. */
3678         talloc_free(delete_cons);
3679
3680         /* If there are no more connections to kill we can remove the
3681            entire killtcp structure
3682          */
3683         if ( (killtcp->connections == NULL) || 
3684              (killtcp->connections->root == NULL) ) {
3685                 talloc_free(killtcp);
3686                 return;
3687         }
3688
3689         /* try tickling them again in a seconds time
3690          */
3691         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3692                         ctdb_tickle_sentenced_connections, killtcp);
3693 }
3694
3695 /*
3696   destroy the killtcp structure
3697  */
3698 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3699 {
3700         struct ctdb_vnn *tmpvnn;
3701
3702         /* verify that this vnn is still active */
3703         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3704                 if (tmpvnn == killtcp->vnn) {
3705                         break;
3706                 }
3707         }
3708
3709         if (tmpvnn == NULL) {
3710                 return 0;
3711         }
3712
3713         if (killtcp->vnn->killtcp != killtcp) {
3714                 return 0;
3715         }
3716
3717         killtcp->vnn->killtcp = NULL;
3718
3719         return 0;
3720 }
3721
3722
/* insert callback for the killtcp tree: nothing fancy, the new
   connection structure (parm) unconditionally replaces whatever was
   stored before (data).

   the old one is deliberately not freed here; per the original note it
   is talloc_stolen by the same node in the tree and goes away when the
   new data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;     /* any previous entry is intentionally ignored */

        return replacement;
}
3734
3735 /*
3736   add a tcp socket to the list of connections we want to RST
3737  */
3738 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3739                                        ctdb_sock_addr *s,
3740                                        ctdb_sock_addr *d)
3741 {
3742         ctdb_sock_addr src, dst;
3743         struct ctdb_kill_tcp *killtcp;
3744         struct ctdb_killtcp_con *con;
3745         struct ctdb_vnn *vnn;
3746
3747         ctdb_canonicalize_ip(s, &src);
3748         ctdb_canonicalize_ip(d, &dst);
3749
3750         vnn = find_public_ip_vnn(ctdb, &dst);
3751         if (vnn == NULL) {
3752                 vnn = find_public_ip_vnn(ctdb, &src);
3753         }
3754         if (vnn == NULL) {
3755                 /* if it is not a public ip   it could be our 'single ip' */
3756                 if (ctdb->single_ip_vnn) {
3757                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3758                                 vnn = ctdb->single_ip_vnn;
3759                         }
3760                 }
3761         }
3762         if (vnn == NULL) {
3763                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3764                 return -1;
3765         }
3766
3767         killtcp = vnn->killtcp;
3768         
3769         /* If this is the first connection to kill we must allocate
3770            a new structure
3771          */
3772         if (killtcp == NULL) {
3773                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3774                 CTDB_NO_MEMORY(ctdb, killtcp);
3775
3776                 killtcp->vnn         = vnn;
3777                 killtcp->ctdb        = ctdb;
3778                 killtcp->capture_fd  = -1;
3779                 killtcp->connections = trbt_create(killtcp, 0);
3780
3781                 vnn->killtcp         = killtcp;
3782                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3783         }
3784
3785
3786
3787         /* create a structure that describes this connection we want to
3788            RST and store it in killtcp->connections
3789         */
3790         con = talloc(killtcp, struct ctdb_killtcp_con);
3791         CTDB_NO_MEMORY(ctdb, con);
3792         con->src_addr = src;
3793         con->dst_addr = dst;
3794         con->count    = 0;
3795         con->killtcp  = killtcp;
3796
3797
3798         trbt_insertarray32_callback(killtcp->connections,
3799                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3800                         add_killtcp_callback, con);
3801
3802         /* 
3803            If we dont have a socket to listen on yet we must create it
3804          */
3805         if (killtcp->capture_fd == -1) {
3806                 const char *iface = ctdb_vnn_iface_string(vnn);
3807                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3808                 if (killtcp->capture_fd == -1) {
3809                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3810                                           "socket on iface '%s' for killtcp (%s)\n",
3811                                           iface, strerror(errno)));
3812                         goto failed;
3813                 }
3814         }
3815
3816
3817         if (killtcp->fde == NULL) {
3818                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3819                                             EVENT_FD_READ,
3820                                             capture_tcp_handler, killtcp);
3821                 tevent_fd_set_auto_close(killtcp->fde);
3822
3823                 /* We also need to set up some events to tickle all these connections
3824                    until they are all reset
3825                 */
3826                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3827                                 ctdb_tickle_sentenced_connections, killtcp);
3828         }
3829
3830         /* tickle him once now */
3831         ctdb_sys_send_tcp(
3832                 &con->dst_addr,
3833                 &con->src_addr,
3834                 0, 0, 0);
3835
3836         return 0;
3837
3838 failed:
3839         talloc_free(vnn->killtcp);
3840         vnn->killtcp = NULL;
3841         return -1;
3842 }
3843
3844 /*
3845   kill a TCP connection.
3846  */
3847 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3848 {
3849         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3850
3851         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3852 }
3853
3854 /*
3855   called by a daemon to inform us of the entire list of TCP tickles for
3856   a particular public address.
3857   this control should only be sent by the node that is currently serving
3858   that public address.
3859  */
3860 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3861 {
3862         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3863         struct ctdb_tcp_array *tcparray;
3864         struct ctdb_vnn *vnn;
3865
3866         /* We must at least have tickles.num or else we cant verify the size
3867            of the received data blob
3868          */
3869         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3870                                         tickles.connections)) {
3871                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3872                 return -1;
3873         }
3874
3875         /* verify that the size of data matches what we expect */
3876         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3877                                 tickles.connections)
3878                          + sizeof(struct ctdb_tcp_connection)
3879                                  * list->tickles.num) {
3880                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3881                 return -1;
3882         }
3883
3884         vnn = find_public_ip_vnn(ctdb, &list->addr);
3885         if (vnn == NULL) {
3886                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3887                         ctdb_addr_to_str(&list->addr)));
3888
3889                 return 1;
3890         }
3891
3892         /* remove any old ticklelist we might have */
3893         talloc_free(vnn->tcp_array);
3894         vnn->tcp_array = NULL;
3895
3896         tcparray = talloc(vnn, struct ctdb_tcp_array);
3897         CTDB_NO_MEMORY(ctdb, tcparray);
3898
3899         tcparray->num = list->tickles.num;
3900
3901         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3902         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3903
3904         memcpy(tcparray->connections, &list->tickles.connections[0],
3905                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3906
3907         /* We now have a new fresh tickle list array for this vnn */
3908         vnn->tcp_array = tcparray;
3909
3910         return 0;
3911 }
3912
3913 /*
3914   called to return the full list of tickles for the puclic address associated 
3915   with the provided vnn
3916  */
3917 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3918 {
3919         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3920         struct ctdb_control_tcp_tickle_list *list;
3921         struct ctdb_tcp_array *tcparray;
3922         int num;
3923         struct ctdb_vnn *vnn;
3924
3925         vnn = find_public_ip_vnn(ctdb, addr);
3926         if (vnn == NULL) {
3927                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3928                         ctdb_addr_to_str(addr)));
3929
3930                 return 1;
3931         }
3932
3933         tcparray = vnn->tcp_array;
3934         if (tcparray) {
3935                 num = tcparray->num;
3936         } else {
3937                 num = 0;
3938         }
3939
3940         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3941                                 tickles.connections)
3942                         + sizeof(struct ctdb_tcp_connection) * num;
3943
3944         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3945         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3946         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3947
3948         list->addr = *addr;
3949         list->tickles.num = num;
3950         if (num) {
3951                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3952                         sizeof(struct ctdb_tcp_connection) * num);
3953         }
3954
3955         return 0;
3956 }
3957
3958
3959 /*
3960   set the list of all tcp tickles for a public address
3961  */
3962 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3963                               struct timeval timeout, uint32_t destnode, 
3964                               ctdb_sock_addr *addr,
3965                               struct ctdb_tcp_array *tcparray)
3966 {
3967         int ret, num;
3968         TDB_DATA data;
3969         struct ctdb_control_tcp_tickle_list *list;
3970
3971         if (tcparray) {
3972                 num = tcparray->num;
3973         } else {
3974                 num = 0;
3975         }
3976
3977         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3978                                 tickles.connections) +
3979                         sizeof(struct ctdb_tcp_connection) * num;
3980         data.dptr = talloc_size(ctdb, data.dsize);
3981         CTDB_NO_MEMORY(ctdb, data.dptr);
3982
3983         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3984         list->addr = *addr;
3985         list->tickles.num = num;
3986         if (tcparray) {
3987                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3988         }
3989
3990         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3991                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3992                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3993         if (ret != 0) {
3994                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3995                 return -1;
3996         }
3997
3998         talloc_free(data.dptr);
3999
4000         return ret;
4001 }
4002
4003
4004 /*
4005   perform tickle updates if required
4006  */
4007 static void ctdb_update_tcp_tickles(struct event_context *ev, 
4008                                 struct timed_event *te, 
4009                                 struct timeval t, void *private_data)
4010 {
4011         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4012         int ret;
4013         struct ctdb_vnn *vnn;
4014
4015         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4016                 /* we only send out updates for public addresses that 
4017                    we have taken over
4018                  */
4019                 if (ctdb->pnn != vnn->pnn) {
4020                         continue;
4021                 }
4022                 /* We only send out the updates if we need to */
4023                 if (!vnn->tcp_update_needed) {
4024                         continue;
4025                 }
4026                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
4027                                 TAKEOVER_TIMEOUT(),
4028                                 CTDB_BROADCAST_CONNECTED,
4029                                 &vnn->public_address,
4030                                 vnn->tcp_array);
4031                 if (ret != 0) {
4032                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4033                                 ctdb_addr_to_str(&vnn->public_address)));
4034                 } else {
4035                         vnn->tcp_update_needed = false;
4036                 }
4037         }
4038
4039         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4040                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4041                              ctdb_update_tcp_tickles, ctdb);
4042 }               
4043         
4044
4045 /*
4046   start periodic update of tcp tickles
4047  */
4048 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4049 {
4050         ctdb->tickle_update_context = talloc_new(ctdb);
4051
4052         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4053                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4054                              ctdb_update_tcp_tickles, ctdb);
4055 }
4056
4057
4058
4059
4060 struct control_gratious_arp {
4061         struct ctdb_context *ctdb;
4062         ctdb_sock_addr addr;
4063         const char *iface;
4064         int count;
4065 };
4066
4067 /*
4068   send a control_gratuitous arp
4069  */
4070 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4071                                   struct timeval t, void *private_data)
4072 {
4073         int ret;
4074         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4075                                                         struct control_gratious_arp);
4076
4077         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4078         if (ret != 0) {
4079                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4080                                  arp->iface, strerror(errno)));
4081         }
4082
4083
4084         arp->count++;
4085         if (arp->count == CTDB_ARP_REPEAT) {
4086                 talloc_free(arp);
4087                 return;
4088         }
4089
4090         event_add_timed(arp->ctdb->ev, arp, 
4091                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4092                         send_gratious_arp, arp);
4093 }
4094
4095
4096 /*
4097   send a gratious arp 
4098  */
4099 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4100 {
4101         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4102         struct control_gratious_arp *arp;
4103
4104         /* verify the size of indata */
4105         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4106                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4107                                  (unsigned)indata.dsize, 
4108                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4109                 return -1;
4110         }
4111         if (indata.dsize != 
4112                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4113                 + gratious_arp->len ) ){
4114
4115                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4116                         "but should be %u bytes\n", 
4117                          (unsigned)indata.dsize, 
4118                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4119                 return -1;
4120         }
4121
4122
4123         arp = talloc(ctdb, struct control_gratious_arp);
4124         CTDB_NO_MEMORY(ctdb, arp);
4125
4126         arp->ctdb  = ctdb;
4127         arp->addr   = gratious_arp->addr;
4128         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4129         CTDB_NO_MEMORY(ctdb, arp->iface);
4130         arp->count = 0;
4131         
4132         event_add_timed(arp->ctdb->ev, arp, 
4133                         timeval_zero(), send_gratious_arp, arp);
4134
4135         return 0;
4136 }
4137
4138 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4139 {
4140         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4141         int ret;
4142
4143         /* verify the size of indata */
4144         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4145                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4146                 return -1;
4147         }
4148         if (indata.dsize != 
4149                 ( offsetof(struct ctdb_control_ip_iface, iface)
4150                 + pub->len ) ){
4151
4152                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4153                         "but should be %u bytes\n", 
4154                          (unsigned)indata.dsize, 
4155                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4156                 return -1;
4157         }
4158
4159         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4160
4161         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4162
4163         if (ret != 0) {
4164                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4165                 return -1;
4166         }
4167
4168         return 0;
4169 }
4170
4171 /*
4172   called when releaseip event finishes for del_public_address
4173  */
4174 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
4175                                 void *private_data)
4176 {
4177         talloc_free(private_data);
4178 }
4179
4180 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4181 {
4182         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4183         struct ctdb_vnn *vnn;
4184         int ret;
4185
4186         /* verify the size of indata */
4187         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4188                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4189                 return -1;
4190         }
4191         if (indata.dsize != 
4192                 ( offsetof(struct ctdb_control_ip_iface, iface)
4193                 + pub->len ) ){
4194
4195                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4196                         "but should be %u bytes\n", 
4197                          (unsigned)indata.dsize, 
4198                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4199                 return -1;
4200         }
4201
4202         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4203
4204         /* walk over all public addresses until we find a match */
4205         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4206                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4207                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4208
4209                         DLIST_REMOVE(ctdb->vnn, vnn);
4210                         talloc_steal(mem_ctx, vnn);
4211                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
4212                         if (vnn->pnn != ctdb->pnn) {
4213                                 if (vnn->iface != NULL) {
4214                                         ctdb_vnn_unassign_iface(ctdb, vnn);
4215                                 }
4216                                 talloc_free(mem_ctx);
4217                                 return 0;
4218                         }
4219                         vnn->pnn = -1;
4220
4221                         ret = ctdb_event_script_callback(ctdb, 
4222                                          mem_ctx, delete_ip_callback, mem_ctx,
4223                                          CTDB_EVENT_RELEASE_IP,
4224                                          "%s %s %u",
4225                                          ctdb_vnn_iface_string(vnn),
4226                                          ctdb_addr_to_str(&vnn->public_address),
4227                                          vnn->public_netmask_bits);
4228                         if (vnn->iface != NULL) {
4229                                 ctdb_vnn_unassign_iface(ctdb, vnn);
4230                         }
4231                         if (ret != 0) {
4232                                 return -1;
4233                         }
4234                         return 0;
4235                 }
4236         }
4237
4238         return -1;
4239 }
4240
4241
/* State handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* the control request that is answered once the event has run */
	struct ctdb_req_control *c;
};
4245
4246 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4247                                         int status, void *p)
4248 {
4249         struct ipreallocated_callback_state *state =
4250                 talloc_get_type(p, struct ipreallocated_callback_state);
4251
4252         if (status != 0) {
4253                 DEBUG(DEBUG_ERR,
4254                       (" \"ipreallocated\" event script failed (status %d)\n",
4255                        status));
4256                 if (status == -ETIME) {
4257                         ctdb_ban_self(ctdb);
4258                 }
4259         }
4260
4261         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4262         talloc_free(state);
4263 }
4264
4265 /* A control to run the ipreallocated event */
4266 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4267                                    struct ctdb_req_control *c,
4268                                    bool *async_reply)
4269 {
4270         int ret;
4271         struct ipreallocated_callback_state *state;
4272
4273         state = talloc(ctdb, struct ipreallocated_callback_state);
4274         CTDB_NO_MEMORY(ctdb, state);
4275
4276         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4277
4278         ret = ctdb_event_script_callback(ctdb, state,
4279                                          ctdb_ipreallocated_callback, state,
4280                                          CTDB_EVENT_IPREALLOCATED,
4281                                          "%s", "");
4282
4283         if (ret != 0) {
4284                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4285                 talloc_free(state);
4286                 return -1;
4287         }
4288
4289         /* tell the control that we will be reply asynchronously */
4290         state->c    = talloc_steal(state, c);
4291         *async_reply = true;
4292
4293         return 0;
4294 }
4295
4296
4297 /* This function is called from the recovery daemon to verify that a remote
4298    node has the expected ip allocation.
4299    This is verified against ctdb->ip_tree
4300 */
4301 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4302                                 struct ctdb_all_public_ips *ips,
4303                                 uint32_t pnn)
4304 {
4305         struct ctdb_public_ip_list *tmp_ip; 
4306         int i;
4307
4308         if (ctdb->ip_tree == NULL) {
4309                 /* dont know the expected allocation yet, assume remote node
4310                    is correct. */
4311                 return 0;
4312         }
4313
4314         if (ips == NULL) {
4315                 return 0;
4316         }
4317
4318         for (i=0; i<ips->num; i++) {
4319                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4320                 if (tmp_ip == NULL) {
4321                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4322                         return -1;
4323                 }
4324
4325                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4326                         continue;
4327                 }
4328
4329                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4330                         DEBUG(DEBUG_ERR,
4331                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4332                                pnn,
4333                                ctdb_addr_to_str(&ips->ips[i].addr),
4334                                ips->ips[i].pnn, tmp_ip->pnn));
4335                         return -1;
4336                 }
4337         }
4338
4339         return 0;
4340 }
4341
4342 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4343 {
4344         struct ctdb_public_ip_list *tmp_ip; 
4345
4346         if (ctdb->ip_tree == NULL) {
4347                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4348                 return -1;
4349         }
4350
4351         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4352         if (tmp_ip == NULL) {
4353                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4354                 return -1;
4355         }
4356
4357         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4358         tmp_ip->pnn = ip->pnn;
4359
4360         return 0;
4361 }
4362
4363
/* Book-keeping for one running "reload public ips" child process */
struct ctdb_reloadips_handle {
	/* daemon context that owns this handle */
	struct ctdb_context *ctdb;
	/* control request answered when the handle is destroyed */
	struct ctdb_req_control *c;
	/* status sent in that reply */
	int status;
	/* pipe between child and parent; the parent reads fd[0] */
	int fd[2];
	/* pid of the child process */
	pid_t child;
	/* read event on fd[0] */
	struct fd_event *fde;
};
4372
4373 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4374 {
4375         if (h == h->ctdb->reload_ips) {
4376                 h->ctdb->reload_ips = NULL;
4377         }
4378         if (h->c != NULL) {
4379                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4380                 h->c = NULL;
4381         }
4382         ctdb_kill(h->ctdb, h->child, SIGKILL);
4383         return 0;
4384 }
4385
4386 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4387                                 struct timed_event *te,
4388                                 struct timeval t, void *private_data)
4389 {
4390         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4391
4392         talloc_free(h);
4393 }       
4394
4395 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4396                              uint16_t flags, void *private_data)
4397 {
4398         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4399
4400         char res;
4401         int ret;
4402
4403         ret = read(h->fd[0], &res, 1);
4404         if (ret < 1 || res != 0) {
4405                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4406                 res = 1;
4407         }
4408         h->status = res;
4409
4410         talloc_free(h);
4411 }
4412
4413 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4414 {
4415         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4416         struct ctdb_all_public_ips *ips;
4417         struct ctdb_vnn *vnn;
4418         struct client_async_data *async_data;
4419         struct timeval timeout;
4420         TDB_DATA data;
4421         struct ctdb_client_control_state *state;
4422         bool first_add;
4423         int i, ret;
4424
4425         CTDB_NO_MEMORY(ctdb, mem_ctx);
4426
4427         /* Read IPs from local node */
4428         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4429                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4430         if (ret != 0) {
4431                 DEBUG(DEBUG_ERR,
4432                       ("Unable to fetch public IPs from local node\n"));
4433                 talloc_free(mem_ctx);
4434                 return -1;
4435         }
4436
4437         /* Read IPs file - this is safe since this is a child process */
4438         ctdb->vnn = NULL;
4439         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4440                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4441                 talloc_free(mem_ctx);
4442                 return -1;
4443         }
4444
4445         async_data = talloc_zero(mem_ctx, struct client_async_data);
4446         CTDB_NO_MEMORY(ctdb, async_data);
4447
4448         /* Compare IPs between node and file for IPs to be deleted */
4449         for (i = 0; i < ips->num; i++) {
4450                 /* */
4451                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4452                         if (ctdb_same_ip(&vnn->public_address,
4453                                          &ips->ips[i].addr)) {
4454                                 /* IP is still in file */
4455                                 break;
4456                         }
4457                 }
4458
4459                 if (vnn == NULL) {
4460                         /* Delete IP ips->ips[i] */
4461                         struct ctdb_control_ip_iface *pub;
4462
4463                         DEBUG(DEBUG_NOTICE,
4464                               ("IP %s no longer configured, deleting it\n",
4465                                ctdb_addr_to_str(&ips->ips[i].addr)));
4466
4467                         pub = talloc_zero(mem_ctx,
4468                                           struct ctdb_control_ip_iface);
4469                         CTDB_NO_MEMORY(ctdb, pub);
4470
4471                         pub->addr  = ips->ips[i].addr;
4472                         pub->mask  = 0;
4473                         pub->len   = 0;
4474
4475                         timeout = TAKEOVER_TIMEOUT();
4476
4477                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4478                                               iface) + pub->len;
4479                         data.dptr = (uint8_t *)pub;
4480
4481                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4482                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4483                                                   0, data, async_data,
4484                                                   &timeout, NULL);
4485                         if (state == NULL) {
4486                                 DEBUG(DEBUG_ERR,
4487                                       (__location__
4488                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4489                                 goto failed;
4490                         }
4491
4492                         ctdb_client_async_add(async_data, state);
4493                 }
4494         }
4495
4496         /* Compare IPs between node and file for IPs to be added */
4497         first_add = true;
4498         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4499                 for (i = 0; i < ips->num; i++) {
4500                         if (ctdb_same_ip(&vnn->public_address,
4501                                          &ips->ips[i].addr)) {
4502                                 /* IP already on node */
4503                                 break;
4504                         }
4505                 }
4506                 if (i == ips->num) {
4507                         /* Add IP ips->ips[i] */
4508                         struct ctdb_control_ip_iface *pub;
4509                         const char *ifaces = NULL;
4510                         uint32_t len;
4511                         int iface = 0;
4512
4513                         DEBUG(DEBUG_NOTICE,
4514                               ("New IP %s configured, adding it\n",
4515                                ctdb_addr_to_str(&vnn->public_address)));
4516                         if (first_add) {
4517                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4518
4519                                 data.dsize = sizeof(pnn);
4520                                 data.dptr  = (uint8_t *)&pnn;
4521
4522                                 ret = ctdb_client_send_message(
4523                                         ctdb,
4524                                         CTDB_BROADCAST_CONNECTED,
4525                                         CTDB_SRVID_REBALANCE_NODE,
4526                                         data);
4527                                 if (ret != 0) {
4528                                         DEBUG(DEBUG_WARNING,
4529                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4530                                 }
4531
4532                                 first_add = false;
4533                         }
4534
4535                         ifaces = vnn->ifaces[0];
4536                         iface = 1;
4537                         while (vnn->ifaces[iface] != NULL) {
4538                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4539                                                          vnn->ifaces[iface]);
4540                                 iface++;
4541                         }
4542
4543                         len   = strlen(ifaces) + 1;
4544                         pub = talloc_zero_size(mem_ctx,
4545                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4546                         CTDB_NO_MEMORY(ctdb, pub);
4547
4548                         pub->addr  = vnn->public_address;
4549                         pub->mask  = vnn->public_netmask_bits;
4550                         pub->len   = len;
4551                         memcpy(&pub->iface[0], ifaces, pub->len);
4552
4553                         timeout = TAKEOVER_TIMEOUT();
4554
4555                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4556                                               iface) + pub->len;
4557                         data.dptr = (uint8_t *)pub;
4558
4559                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4560                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4561                                                   0, data, async_data,
4562                                                   &timeout, NULL);
4563                         if (state == NULL) {
4564                                 DEBUG(DEBUG_ERR,
4565                                       (__location__
4566                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4567                                 goto failed;
4568                         }
4569
4570                         ctdb_client_async_add(async_data, state);
4571                 }
4572         }
4573
4574         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4575                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4576                 goto failed;
4577         }
4578
4579         talloc_free(mem_ctx);
4580         return 0;
4581
4582 failed:
4583         talloc_free(mem_ctx);
4584         return -1;
4585 }
4586
4587 /* This control is sent to force the node to re-read the public addresses file
4588    and drop any addresses we should nnot longer host, and add new addresses
4589    that we are now able to host
4590 */
4591 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4592 {
4593         struct ctdb_reloadips_handle *h;
4594         pid_t parent = getpid();
4595
4596         if (ctdb->reload_ips != NULL) {
4597                 talloc_free(ctdb->reload_ips);
4598                 ctdb->reload_ips = NULL;
4599         }
4600
4601         h = talloc(ctdb, struct ctdb_reloadips_handle);
4602         CTDB_NO_MEMORY(ctdb, h);
4603         h->ctdb     = ctdb;
4604         h->c        = NULL;
4605         h->status   = -1;
4606         
4607         if (pipe(h->fd) == -1) {
4608                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4609                 talloc_free(h);
4610                 return -1;
4611         }
4612
4613         h->child = ctdb_fork(ctdb);
4614         if (h->child == (pid_t)-1) {
4615                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4616                 close(h->fd[0]);
4617                 close(h->fd[1]);
4618                 talloc_free(h);
4619                 return -1;
4620         }
4621
4622         /* child process */
4623         if (h->child == 0) {
4624                 signed char res = 0;
4625
4626                 close(h->fd[0]);
4627                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4628
4629                 ctdb_set_process_name("ctdb_reloadips");
4630                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4631                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4632                         res = -1;
4633                 } else {
4634                         res = ctdb_reloadips_child(ctdb);
4635                         if (res != 0) {
4636                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4637                         }
4638                 }
4639
4640                 write(h->fd[1], &res, 1);
4641                 /* make sure we die when our parent dies */
4642                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4643                         sleep(5);
4644                 }
4645                 _exit(0);
4646         }
4647
4648         h->c             = talloc_steal(h, c);
4649
4650         close(h->fd[1]);
4651         set_close_on_exec(h->fd[0]);
4652
4653         talloc_set_destructor(h, ctdb_reloadips_destructor);
4654
4655
4656         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4657                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4658                         (void *)h);
4659         tevent_fd_set_auto_close(h->fde);
4660
4661         event_add_timed(ctdb->ev, h,
4662                         timeval_current_ofs(120, 0),
4663                         ctdb_reloadips_timeout_event, h);
4664
4665         /* we reply later */
4666         *async_reply = true;
4667         return 0;
4668 }