recoverd: Handle errors carefully when fetching tunables
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout value built from the takeover_timeout tunable.  Expands to a
 * call that needs a variable named "ctdb" in scope at the point of use. */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* Gratuitous ARP scheduling: CTDB_ARP_REPEAT rounds are sent per takeover,
 * and CTDB_ARP_INTERVAL is the seconds argument of the delay between them. */
#define CTDB_ARP_REPEAT   3
#define CTDB_ARP_INTERVAL 1
35
/*
 * Flags used in IP allocation algorithms.
 *
 * NOTE(review): the consumers of these flags are not in this part of the
 * file; presumably one instance describes one node - confirm.
 */
struct ctdb_ipflags {
        bool noiptakeover;      /* presumably: node must not take over IPs */
        bool noiphost;          /* presumably: node must not host IPs */
};
41
/*
 * A local network interface known to the daemon.  Entries live on the
 * doubly-linked list headed by ctdb->ifaces.
 */
struct ctdb_iface {
        struct ctdb_iface *prev;        /* list linkage */
        struct ctdb_iface *next;
        const char *name;               /* interface name, matched with strcmp() */
        bool link_up;                   /* interfaces with link down are not chosen for IPs */
        uint32_t references;            /* number of VNNs currently assigned here */
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;
285         uint32_t count;
286         ctdb_sock_addr addr;
287         struct ctdb_tcp_array *tcparray;
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;
307         uint32_t client_id;
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
400 struct takeover_callback_state {
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/*
  State for an in-flight takeip event; passed to
  ctdb_do_takeip_callback() when the event script finishes.
 */
struct ctdb_do_takeip_state {
        struct ctdb_req_control *c;     /* control to reply to */
        struct ctdb_vnn *vnn;           /* VNN being taken over */
};
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          false,
514                                          CTDB_EVENT_TAKE_IP,
515                                          "%s %s %u",
516                                          ctdb_vnn_iface_string(vnn),
517                                          ctdb_addr_to_str(&vnn->public_address),
518                                          vnn->public_netmask_bits);
519
520         if (ret != 0) {
521                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
522                         ctdb_addr_to_str(&vnn->public_address),
523                         ctdb_vnn_iface_string(vnn)));
524                 talloc_free(state);
525                 return -1;
526         }
527
528         return 0;
529 }
530
/*
  State for an in-flight updateip event; passed to
  ctdb_do_updateip_callback() when the event script finishes.
 */
struct ctdb_do_updateip_state {
        struct ctdb_req_control *c;     /* control to reply to */
        struct ctdb_iface *old;         /* interface the address is moving away from */
        struct ctdb_vnn *vnn;           /* VNN being moved */
};
536
537 /*
538   called when updateip event finishes
539  */
540 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541                                       void *private_data)
542 {
543         struct ctdb_do_updateip_state *state =
544                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545         int32_t ret;
546
547         if (status != 0) {
548                 if (status == -ETIME) {
549                         ctdb_ban_self(ctdb);
550                 }
551                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552                         ctdb_addr_to_str(&state->vnn->public_address),
553                         state->old->name,
554                         ctdb_vnn_iface_string(state->vnn)));
555
556                 /*
557                  * All we can do is reset the old interface
558                  * and let the next run fix it
559                  */
560                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561                 state->vnn->iface = state->old;
562                 state->vnn->iface->references++;
563
564                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565                 talloc_free(state);
566                 return;
567         }
568
569         if (ctdb->do_checkpublicip) {
570
571         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572         if (ret != 0) {
573                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574                 talloc_free(state);
575                 return;
576         }
577
578         }
579
580         /* the control succeeded */
581         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582         talloc_free(state);
583         return;
584 }
585
586 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
587 {
588         state->vnn->update_in_flight = false;
589         return 0;
590 }
591
592 /*
593   update (move) an ip address
594  */
595 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596                                 struct ctdb_req_control *c,
597                                 struct ctdb_vnn *vnn)
598 {
599         int ret;
600         struct ctdb_do_updateip_state *state;
601         struct ctdb_iface *old = vnn->iface;
602         const char *new_name;
603
604         if (vnn->update_in_flight) {
605                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606                                     "update for this IP already in flight\n",
607                                     ctdb_addr_to_str(&vnn->public_address),
608                                     vnn->public_netmask_bits));
609                 return -1;
610         }
611
612         ctdb_vnn_unassign_iface(ctdb, vnn);
613         ret = ctdb_vnn_assign_iface(ctdb, vnn);
614         if (ret != 0) {
615                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616                                  "assin a usable interface (old iface '%s')\n",
617                                  ctdb_addr_to_str(&vnn->public_address),
618                                  vnn->public_netmask_bits,
619                                  old->name));
620                 return -1;
621         }
622
623         new_name = ctdb_vnn_iface_string(vnn);
624         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625                 /* A benign update from one interface onto itself.
626                  * no need to run the eventscripts in this case, just return
627                  * success.
628                  */
629                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630                 return 0;
631         }
632
633         state = talloc(vnn, struct ctdb_do_updateip_state);
634         CTDB_NO_MEMORY(ctdb, state);
635
636         state->c = talloc_steal(ctdb, c);
637         state->old = old;
638         state->vnn = vnn;
639
640         vnn->update_in_flight = true;
641         talloc_set_destructor(state, ctdb_updateip_destructor);
642
643         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644                             "interface %s to %s\n",
645                             ctdb_addr_to_str(&vnn->public_address),
646                             vnn->public_netmask_bits,
647                             old->name,
648                             new_name));
649
650         ret = ctdb_event_script_callback(ctdb,
651                                          state,
652                                          ctdb_do_updateip_callback,
653                                          state,
654                                          false,
655                                          CTDB_EVENT_UPDATE_IP,
656                                          "%s %s %s %u",
657                                          state->old->name,
658                                          new_name,
659                                          ctdb_addr_to_str(&vnn->public_address),
660                                          vnn->public_netmask_bits);
661         if (ret != 0) {
662                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
663                                  ctdb_addr_to_str(&vnn->public_address),
664                                  old->name, new_name));
665                 talloc_free(state);
666                 return -1;
667         }
668
669         return 0;
670 }
671
672 /*
673   Find the vnn of the node that has a public ip address
674   returns -1 if the address is not known as a public address
675  */
676 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
677 {
678         struct ctdb_vnn *vnn;
679
680         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681                 if (ctdb_same_ip(&vnn->public_address, addr)) {
682                         return vnn;
683                 }
684         }
685
686         return NULL;
687 }
688
689 /*
690   take over an ip address
691  */
692 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693                                  struct ctdb_req_control *c,
694                                  TDB_DATA indata,
695                                  bool *async_reply)
696 {
697         int ret;
698         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699         struct ctdb_vnn *vnn;
700         bool have_ip = false;
701         bool do_updateip = false;
702         bool do_takeip = false;
703         struct ctdb_iface *best_iface = NULL;
704
705         if (pip->pnn != ctdb->pnn) {
706                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707                                  "with pnn %d, but we're node %d\n",
708                                  ctdb_addr_to_str(&pip->addr),
709                                  pip->pnn, ctdb->pnn));
710                 return -1;
711         }
712
713         /* update out vnn list */
714         vnn = find_public_ip_vnn(ctdb, &pip->addr);
715         if (vnn == NULL) {
716                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717                         ctdb_addr_to_str(&pip->addr)));
718                 return 0;
719         }
720
721         if (ctdb->do_checkpublicip) {
722                 have_ip = ctdb_sys_have_ip(&pip->addr);
723         }
724         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725         if (best_iface == NULL) {
726                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727                                  "a usable interface (old %s, have_ip %d)\n",
728                                  ctdb_addr_to_str(&vnn->public_address),
729                                  vnn->public_netmask_bits,
730                                  ctdb_vnn_iface_string(vnn),
731                                  have_ip));
732                 return -1;
733         }
734
735         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737                 have_ip = false;
738         }
739
740
741         if (vnn->iface == NULL && have_ip) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744                                  ctdb_addr_to_str(&vnn->public_address)));
745                 return 0;
746         }
747
748         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750                                   "and we have it on iface[%s], but it was assigned to node %d"
751                                   "and we are node %d, banning ourself\n",
752                                  ctdb_addr_to_str(&vnn->public_address),
753                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754                 ctdb_ban_self(ctdb);
755                 return -1;
756         }
757
758         if (vnn->pnn == -1 && have_ip) {
759                 vnn->pnn = ctdb->pnn;
760                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761                                   "and we already have it on iface[%s], update local daemon\n",
762                                  ctdb_addr_to_str(&vnn->public_address),
763                                   ctdb_vnn_iface_string(vnn)));
764                 return 0;
765         }
766
767         if (vnn->iface) {
768                 if (vnn->iface != best_iface) {
769                         if (!vnn->iface->link_up) {
770                                 do_updateip = true;
771                         } else if (vnn->iface->references > (best_iface->references + 1)) {
772                                 /* only move when the rebalance gains something */
773                                         do_updateip = true;
774                         }
775                 }
776         }
777
778         if (!have_ip) {
779                 if (do_updateip) {
780                         ctdb_vnn_unassign_iface(ctdb, vnn);
781                         do_updateip = false;
782                 }
783                 do_takeip = true;
784         }
785
786         if (do_takeip) {
787                 ret = ctdb_do_takeip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else if (do_updateip) {
792                 ret = ctdb_do_updateip(ctdb, c, vnn);
793                 if (ret != 0) {
794                         return -1;
795                 }
796         } else {
797                 /*
798                  * The interface is up and the kernel known the ip
799                  * => do nothing
800                  */
801                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802                         ctdb_addr_to_str(&pip->addr),
803                         vnn->public_netmask_bits,
804                         ctdb_vnn_iface_string(vnn)));
805                 return 0;
806         }
807
808         /* tell ctdb_control.c that we will be replying asynchronously */
809         *async_reply = true;
810
811         return 0;
812 }
813
814 /*
815   takeover an ip address old v4 style
816  */
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
818                                 struct ctdb_req_control *c,
819                                 TDB_DATA indata, 
820                                 bool *async_reply)
821 {
822         TDB_DATA data;
823         
824         data.dsize = sizeof(struct ctdb_public_ip);
825         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826         CTDB_NO_MEMORY(ctdb, data.dptr);
827         
828         memcpy(data.dptr, indata.dptr, indata.dsize);
829         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
830 }
831
832 /*
833   kill any clients that are registered with a IP that is being released
834  */
835 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
836 {
837         struct ctdb_client_ip *ip;
838
839         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
840                 ctdb_addr_to_str(addr)));
841
842         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
843                 ctdb_sock_addr tmp_addr;
844
845                 tmp_addr = ip->addr;
846                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
847                         ip->client_id,
848                         ctdb_addr_to_str(&ip->addr)));
849
850                 if (ctdb_same_ip(&tmp_addr, addr)) {
851                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
852                                                                      ip->client_id, 
853                                                                      struct ctdb_client);
854                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
855                                 ip->client_id,
856                                 ctdb_addr_to_str(&ip->addr),
857                                 client->pid));
858
859                         if (client->pid != 0) {
860                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
861                                         (unsigned)client->pid,
862                                         ctdb_addr_to_str(addr),
863                                         ip->client_id));
864                                 ctdb_kill(ctdb, client->pid, SIGKILL);
865                         }
866                 }
867         }
868 }
869
870 /*
871   called when releaseip event finishes
872  */
873 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
874                                 void *private_data)
875 {
876         struct takeover_callback_state *state = 
877                 talloc_get_type(private_data, struct takeover_callback_state);
878         TDB_DATA data;
879
880         if (status == -ETIME) {
881                 ctdb_ban_self(ctdb);
882         }
883
884         /* send a message to all clients of this node telling them
885            that the cluster has been reconfigured and they should
886            release any sockets on this IP */
887         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
888         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
889         data.dsize = strlen((char *)data.dptr)+1;
890
891         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
892
893         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
894
895         /* kill clients that have registered with this IP */
896         release_kill_clients(ctdb, state->addr);
897
898         ctdb_vnn_unassign_iface(ctdb, state->vnn);
899
900         /* the control succeeded */
901         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
902         talloc_free(state);
903 }
904
905 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
906 {
907         state->vnn->update_in_flight = false;
908         return 0;
909 }
910
911 /*
912   release an ip address
913  */
914 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
915                                 struct ctdb_req_control *c,
916                                 TDB_DATA indata, 
917                                 bool *async_reply)
918 {
919         int ret;
920         struct takeover_callback_state *state;
921         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
922         struct ctdb_vnn *vnn;
923         char *iface;
924
925         /* update our vnn list */
926         vnn = find_public_ip_vnn(ctdb, &pip->addr);
927         if (vnn == NULL) {
928                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
929                         ctdb_addr_to_str(&pip->addr)));
930                 return 0;
931         }
932         vnn->pnn = pip->pnn;
933
934         /* stop any previous arps */
935         talloc_free(vnn->takeover_ctx);
936         vnn->takeover_ctx = NULL;
937
938         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
939          * lazy multicast to drop an IP from any node that isn't the
940          * intended new node.  The following causes makes ctdbd ignore
941          * a release for any address it doesn't host.
942          */
943         if (ctdb->do_checkpublicip) {
944                 if (!ctdb_sys_have_ip(&pip->addr)) {
945                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
946                                 ctdb_addr_to_str(&pip->addr),
947                                 vnn->public_netmask_bits,
948                                 ctdb_vnn_iface_string(vnn)));
949                         ctdb_vnn_unassign_iface(ctdb, vnn);
950                         return 0;
951                 }
952         } else {
953                 if (vnn->iface == NULL) {
954                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
955                                            ctdb_addr_to_str(&pip->addr),
956                                            vnn->public_netmask_bits));
957                         return 0;
958                 }
959         }
960
961         /* There is a potential race between take_ip and us because we
962          * update the VNN via a callback that run when the
963          * eventscripts have been run.  Avoid the race by allowing one
964          * update to be in flight at a time.
965          */
966         if (vnn->update_in_flight) {
967                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
968                                     "update for this IP already in flight\n",
969                                     ctdb_addr_to_str(&vnn->public_address),
970                                     vnn->public_netmask_bits));
971                 return -1;
972         }
973
974         if (ctdb->do_checkpublicip) {
975                 iface = ctdb_sys_find_ifname(&pip->addr);
976                 if (iface == NULL) {
977                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
978                         return 0;
979                 }
980         } else {
981                 iface = strdup(ctdb_vnn_iface_string(vnn));
982         }
983
984         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
985                 ctdb_addr_to_str(&pip->addr),
986                 vnn->public_netmask_bits,
987                 iface,
988                 pip->pnn));
989
990         state = talloc(ctdb, struct takeover_callback_state);
991         CTDB_NO_MEMORY(ctdb, state);
992
993         state->c = talloc_steal(state, c);
994         state->addr = talloc(state, ctdb_sock_addr);       
995         CTDB_NO_MEMORY(ctdb, state->addr);
996         *state->addr = pip->addr;
997         state->vnn   = vnn;
998
999         vnn->update_in_flight = true;
1000         talloc_set_destructor(state, ctdb_releaseip_destructor);
1001
1002         ret = ctdb_event_script_callback(ctdb, 
1003                                          state, release_ip_callback, state,
1004                                          false,
1005                                          CTDB_EVENT_RELEASE_IP,
1006                                          "%s %s %u",
1007                                          iface,
1008                                          ctdb_addr_to_str(&pip->addr),
1009                                          vnn->public_netmask_bits);
1010         free(iface);
1011         if (ret != 0) {
1012                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1013                         ctdb_addr_to_str(&pip->addr),
1014                         ctdb_vnn_iface_string(vnn)));
1015                 talloc_free(state);
1016                 return -1;
1017         }
1018
1019         /* tell the control that we will be reply asynchronously */
1020         *async_reply = true;
1021         return 0;
1022 }
1023
1024 /*
1025   release an ip address old v4 style
1026  */
1027 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1028                                 struct ctdb_req_control *c,
1029                                 TDB_DATA indata, 
1030                                 bool *async_reply)
1031 {
1032         TDB_DATA data;
1033         
1034         data.dsize = sizeof(struct ctdb_public_ip);
1035         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1036         CTDB_NO_MEMORY(ctdb, data.dptr);
1037         
1038         memcpy(data.dptr, indata.dptr, indata.dsize);
1039         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1040 }
1041
1042
1043 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1044                                    ctdb_sock_addr *addr,
1045                                    unsigned mask, const char *ifaces,
1046                                    bool check_address)
1047 {
1048         struct ctdb_vnn      *vnn;
1049         uint32_t num = 0;
1050         char *tmp;
1051         const char *iface;
1052         int i;
1053         int ret;
1054
1055         tmp = strdup(ifaces);
1056         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1057                 if (!ctdb_sys_check_iface_exists(iface)) {
1058                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1059                         free(tmp);
1060                         return -1;
1061                 }
1062         }
1063         free(tmp);
1064
1065         /* Verify that we dont have an entry for this ip yet */
1066         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1067                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1068                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1069                                 ctdb_addr_to_str(addr)));
1070                         return -1;
1071                 }               
1072         }
1073
1074         /* create a new vnn structure for this ip address */
1075         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1076         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1077         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1078         tmp = talloc_strdup(vnn, ifaces);
1079         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1080         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1081                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1082                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1083                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1084                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1085                 num++;
1086         }
1087         talloc_free(tmp);
1088         vnn->ifaces[num] = NULL;
1089         vnn->public_address      = *addr;
1090         vnn->public_netmask_bits = mask;
1091         vnn->pnn                 = -1;
1092         if (check_address) {
1093                 if (ctdb_sys_have_ip(addr)) {
1094                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1095                         vnn->pnn = ctdb->pnn;
1096                 }
1097         }
1098
1099         for (i=0; vnn->ifaces[i]; i++) {
1100                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1101                 if (ret != 0) {
1102                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1103                                            "for public_address[%s]\n",
1104                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1105                         talloc_free(vnn);
1106                         return -1;
1107                 }
1108         }
1109
1110         DLIST_ADD(ctdb->vnn, vnn);
1111
1112         return 0;
1113 }
1114
1115 /*
1116   setup the event script directory
1117 */
1118 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1119 {
1120         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1121         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1122         return 0;
1123 }
1124
1125 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1126                                   struct timeval t, void *private_data)
1127 {
1128         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1129                                                         struct ctdb_context);
1130         struct ctdb_vnn *vnn;
1131
1132         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1133                 int i;
1134
1135                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1136                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1137                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1138                                         vnn->ifaces[i],
1139                                         ctdb_addr_to_str(&vnn->public_address)));
1140                         }
1141                 }
1142         }
1143
1144         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1145                 timeval_current_ofs(30, 0), 
1146                 ctdb_check_interfaces_event, ctdb);
1147 }
1148
1149
1150 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1151 {
1152         if (ctdb->check_public_ifaces_ctx != NULL) {
1153                 talloc_free(ctdb->check_public_ifaces_ctx);
1154                 ctdb->check_public_ifaces_ctx = NULL;
1155         }
1156
1157         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1158         if (ctdb->check_public_ifaces_ctx == NULL) {
1159                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1160         }
1161
1162         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1163                 timeval_current_ofs(30, 0), 
1164                 ctdb_check_interfaces_event, ctdb);
1165
1166         return 0;
1167 }
1168
1169
1170 /*
1171   setup the public address lists from a file
1172 */
1173 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1174 {
1175         char **lines;
1176         int nlines;
1177         int i;
1178
1179         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1180         if (lines == NULL) {
1181                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1182                 return -1;
1183         }
1184         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1185                 nlines--;
1186         }
1187
1188         for (i=0;i<nlines;i++) {
1189                 unsigned mask;
1190                 ctdb_sock_addr addr;
1191                 const char *addrstr;
1192                 const char *ifaces;
1193                 char *tok, *line;
1194
1195                 line = lines[i];
1196                 while ((*line == ' ') || (*line == '\t')) {
1197                         line++;
1198                 }
1199                 if (*line == '#') {
1200                         continue;
1201                 }
1202                 if (strcmp(line, "") == 0) {
1203                         continue;
1204                 }
1205                 tok = strtok(line, " \t");
1206                 addrstr = tok;
1207                 tok = strtok(NULL, " \t");
1208                 if (tok == NULL) {
1209                         if (NULL == ctdb->default_public_interface) {
1210                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1211                                          i+1));
1212                                 talloc_free(lines);
1213                                 return -1;
1214                         }
1215                         ifaces = ctdb->default_public_interface;
1216                 } else {
1217                         ifaces = tok;
1218                 }
1219
1220                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1221                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1226                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1227                         talloc_free(lines);
1228                         return -1;
1229                 }
1230         }
1231
1232
1233         talloc_free(lines);
1234         return 0;
1235 }
1236
1237 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1238                               const char *iface,
1239                               const char *ip)
1240 {
1241         struct ctdb_vnn *svnn;
1242         struct ctdb_iface *cur = NULL;
1243         bool ok;
1244         int ret;
1245
1246         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1247         CTDB_NO_MEMORY(ctdb, svnn);
1248
1249         svnn->ifaces = talloc_array(svnn, const char *, 2);
1250         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1251         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1252         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1253         svnn->ifaces[1] = NULL;
1254
1255         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1256         if (!ok) {
1257                 talloc_free(svnn);
1258                 return -1;
1259         }
1260
1261         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1262         if (ret != 0) {
1263                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1264                                    "for single_ip[%s]\n",
1265                                    svnn->ifaces[0],
1266                                    ctdb_addr_to_str(&svnn->public_address)));
1267                 talloc_free(svnn);
1268                 return -1;
1269         }
1270
1271         /* assume the single public ip interface is initially "good" */
1272         cur = ctdb_find_iface(ctdb, iface);
1273         if (cur == NULL) {
1274                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1275                 return -1;
1276         }
1277         cur->link_up = true;
1278
1279         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1280         if (ret != 0) {
1281                 talloc_free(svnn);
1282                 return -1;
1283         }
1284
1285         ctdb->single_ip_vnn = svnn;
1286         return 0;
1287 }
1288
1289 /* Given a physical node, return the number of
1290    public addresses that is currently assigned to this node.
1291 */
1292 static int node_ip_coverage(struct ctdb_context *ctdb, 
1293         int32_t pnn,
1294         struct ctdb_public_ip_list *ips)
1295 {
1296         int num=0;
1297
1298         for (;ips;ips=ips->next) {
1299                 if (ips->pnn == pnn) {
1300                         num++;
1301                 }
1302         }
1303         return num;
1304 }
1305
1306
1307 /* Can the given node host the given IP: is the public IP known to the
1308  * node and is NOIPHOST unset?
1309 */
1310 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1311                              struct ctdb_ipflags ipflags,
1312                              struct ctdb_public_ip_list *ip)
1313 {
1314         struct ctdb_all_public_ips *public_ips;
1315         int i;
1316
1317         if (ipflags.noiphost) {
1318                 return false;
1319         }
1320
1321         public_ips = ctdb->nodes[pnn]->available_public_ips;
1322
1323         if (public_ips == NULL) {
1324                 return false;
1325         }
1326
1327         for (i=0; i<public_ips->num; i++) {
1328                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1329                         /* yes, this node can serve this public ip */
1330                         return true;
1331                 }
1332         }
1333
1334         return false;
1335 }
1336
1337 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1338                                  struct ctdb_ipflags ipflags,
1339                                  struct ctdb_public_ip_list *ip)
1340 {
1341         if (ipflags.noiptakeover) {
1342                 return false;
1343         }
1344
1345         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1346 }
1347
1348 /* search the node lists list for a node to takeover this ip.
1349    pick the node that currently are serving the least number of ips
1350    so that the ips get spread out evenly.
1351 */
1352 static int find_takeover_node(struct ctdb_context *ctdb, 
1353                 struct ctdb_ipflags *ipflags,
1354                 struct ctdb_public_ip_list *ip,
1355                 struct ctdb_public_ip_list *all_ips)
1356 {
1357         int pnn, min=0, num;
1358         int i, numnodes;
1359
1360         numnodes = talloc_array_length(ipflags);
1361         pnn    = -1;
1362         for (i=0; i<numnodes; i++) {
1363                 /* verify that this node can serve this ip */
1364                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1365                         /* no it couldnt   so skip to the next node */
1366                         continue;
1367                 }
1368
1369                 num = node_ip_coverage(ctdb, i, all_ips);
1370                 /* was this the first node we checked ? */
1371                 if (pnn == -1) {
1372                         pnn = i;
1373                         min  = num;
1374                 } else {
1375                         if (num < min) {
1376                                 pnn = i;
1377                                 min  = num;
1378                         }
1379                 }
1380         }       
1381         if (pnn == -1) {
1382                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1383                         ctdb_addr_to_str(&ip->addr)));
1384
1385                 return -1;
1386         }
1387
1388         ip->pnn = pnn;
1389         return 0;
1390 }
1391
1392 #define IP_KEYLEN       4
1393 static uint32_t *ip_key(ctdb_sock_addr *ip)
1394 {
1395         static uint32_t key[IP_KEYLEN];
1396
1397         bzero(key, sizeof(key));
1398
1399         switch (ip->sa.sa_family) {
1400         case AF_INET:
1401                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1402                 break;
1403         case AF_INET6: {
1404                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1405                 key[0]  = htonl(s6_a32[0]);
1406                 key[1]  = htonl(s6_a32[1]);
1407                 key[2]  = htonl(s6_a32[2]);
1408                 key[3]  = htonl(s6_a32[3]);
1409                 break;
1410         }
1411         default:
1412                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1413                 return key;
1414         }
1415
1416         return key;
1417 }
1418
1419 static void *add_ip_callback(void *parm, void *data)
1420 {
1421         struct ctdb_public_ip_list *this_ip = parm; 
1422         struct ctdb_public_ip_list *prev_ip = data; 
1423
1424         if (prev_ip == NULL) {
1425                 return parm;
1426         }
1427         if (this_ip->pnn == -1) {
1428                 this_ip->pnn = prev_ip->pnn;
1429         }
1430
1431         return parm;
1432 }
1433
1434 static int getips_count_callback(void *param, void *data)
1435 {
1436         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1437         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1438
1439         new_ip->next = *ip_list;
1440         *ip_list     = new_ip;
1441         return 0;
1442 }
1443
1444 static struct ctdb_public_ip_list *
1445 create_merged_ip_list(struct ctdb_context *ctdb)
1446 {
1447         int i, j;
1448         struct ctdb_public_ip_list *ip_list;
1449         struct ctdb_all_public_ips *public_ips;
1450
1451         if (ctdb->ip_tree != NULL) {
1452                 talloc_free(ctdb->ip_tree);
1453                 ctdb->ip_tree = NULL;
1454         }
1455         ctdb->ip_tree = trbt_create(ctdb, 0);
1456
1457         for (i=0;i<ctdb->num_nodes;i++) {
1458                 public_ips = ctdb->nodes[i]->known_public_ips;
1459
1460                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1461                         continue;
1462                 }
1463
1464                 /* there were no public ips for this node */
1465                 if (public_ips == NULL) {
1466                         continue;
1467                 }               
1468
1469                 for (j=0;j<public_ips->num;j++) {
1470                         struct ctdb_public_ip_list *tmp_ip; 
1471
1472                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1473                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1474                         /* Do not use information about IP addresses hosted
1475                          * on other nodes, it may not be accurate */
1476                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1477                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1478                         } else {
1479                                 tmp_ip->pnn = -1;
1480                         }
1481                         tmp_ip->addr = public_ips->ips[j].addr;
1482                         tmp_ip->next = NULL;
1483
1484                         trbt_insertarray32_callback(ctdb->ip_tree,
1485                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1486                                 add_ip_callback,
1487                                 tmp_ip);
1488                 }
1489         }
1490
1491         ip_list = NULL;
1492         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1493
1494         return ip_list;
1495 }
1496
1497 /* 
1498  * This is the length of the longtest common prefix between the IPs.
1499  * It is calculated by XOR-ing the 2 IPs together and counting the
1500  * number of leading zeroes.  The implementation means that all
1501  * addresses end up being 128 bits long.
1502  *
1503  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1504  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1505  * lots of nodes and IP addresses?
1506  */
1507 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1508 {
1509         uint32_t ip1_k[IP_KEYLEN];
1510         uint32_t *t;
1511         int i;
1512         uint32_t x;
1513
1514         uint32_t distance = 0;
1515
1516         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1517         t = ip_key(ip2);
1518         for (i=0; i<IP_KEYLEN; i++) {
1519                 x = ip1_k[i] ^ t[i];
1520                 if (x == 0) {
1521                         distance += 32;
1522                 } else {
1523                         /* Count number of leading zeroes. 
1524                          * FIXME? This could be optimised...
1525                          */
1526                         while ((x & (1 << 31)) == 0) {
1527                                 x <<= 1;
1528                                 distance += 1;
1529                         }
1530                 }
1531         }
1532
1533         return distance;
1534 }
1535
1536 /* Calculate the IP distance for the given IP relative to IPs on the
1537    given node.  The ips argument is generally the all_ips variable
1538    used in the main part of the algorithm.
1539  */
1540 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1541                                   struct ctdb_public_ip_list *ips,
1542                                   int pnn)
1543 {
1544         struct ctdb_public_ip_list *t;
1545         uint32_t d;
1546
1547         uint32_t sum = 0;
1548
1549         for (t=ips; t != NULL; t=t->next) {
1550                 if (t->pnn != pnn) {
1551                         continue;
1552                 }
1553
1554                 /* Optimisation: We never calculate the distance
1555                  * between an address and itself.  This allows us to
1556                  * calculate the effect of removing an address from a
1557                  * node by simply calculating the distance between
1558                  * that address and all of the exitsing addresses.
1559                  * Moreover, we assume that we're only ever dealing
1560                  * with addresses from all_ips so we can identify an
1561                  * address via a pointer rather than doing a more
1562                  * expensive address comparison. */
1563                 if (&(t->addr) == ip) {
1564                         continue;
1565                 }
1566
1567                 d = ip_distance(ip, &(t->addr));
1568                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1569         }
1570
1571         return sum;
1572 }
1573
1574 /* Return the LCP2 imbalance metric for addresses currently assigned
1575    to the given node.
1576  */
1577 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1578 {
1579         struct ctdb_public_ip_list *t;
1580
1581         uint32_t imbalance = 0;
1582
1583         for (t=all_ips; t!=NULL; t=t->next) {
1584                 if (t->pnn != pnn) {
1585                         continue;
1586                 }
1587                 /* Pass the rest of the IPs rather than the whole
1588                    all_ips input list.
1589                 */
1590                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1591         }
1592
1593         return imbalance;
1594 }
1595
1596 /* Allocate any unassigned IPs just by looping through the IPs and
1597  * finding the best node for each.
1598  */
1599 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1600                                       struct ctdb_ipflags *ipflags,
1601                                       struct ctdb_public_ip_list *all_ips)
1602 {
1603         struct ctdb_public_ip_list *tmp_ip;
1604
1605         /* loop over all ip's and find a physical node to cover for 
1606            each unassigned ip.
1607         */
1608         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1609                 if (tmp_ip->pnn == -1) {
1610                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1611                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1612                                         ctdb_addr_to_str(&tmp_ip->addr)));
1613                         }
1614                 }
1615         }
1616 }
1617
1618 /* Basic non-deterministic rebalancing algorithm.
1619  */
1620 static void basic_failback(struct ctdb_context *ctdb,
1621                            struct ctdb_ipflags *ipflags,
1622                            struct ctdb_public_ip_list *all_ips,
1623                            int num_ips)
1624 {
1625         int i, numnodes;
1626         int maxnode, maxnum, minnode, minnum, num, retries;
1627         struct ctdb_public_ip_list *tmp_ip;
1628
1629         numnodes = talloc_array_length(ipflags);
1630         retries = 0;
1631
1632 try_again:
1633         maxnum=0;
1634         minnum=0;
1635
1636         /* for each ip address, loop over all nodes that can serve
1637            this ip and make sure that the difference between the node
1638            serving the most and the node serving the least ip's are
1639            not greater than 1.
1640         */
1641         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1642                 if (tmp_ip->pnn == -1) {
1643                         continue;
1644                 }
1645
1646                 /* Get the highest and lowest number of ips's served by any 
1647                    valid node which can serve this ip.
1648                 */
1649                 maxnode = -1;
1650                 minnode = -1;
1651                 for (i=0; i<numnodes; i++) {
1652                         /* only check nodes that can actually serve this ip */
1653                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1654                                 /* no it couldnt   so skip to the next node */
1655                                 continue;
1656                         }
1657
1658                         num = node_ip_coverage(ctdb, i, all_ips);
1659                         if (maxnode == -1) {
1660                                 maxnode = i;
1661                                 maxnum  = num;
1662                         } else {
1663                                 if (num > maxnum) {
1664                                         maxnode = i;
1665                                         maxnum  = num;
1666                                 }
1667                         }
1668                         if (minnode == -1) {
1669                                 minnode = i;
1670                                 minnum  = num;
1671                         } else {
1672                                 if (num < minnum) {
1673                                         minnode = i;
1674                                         minnum  = num;
1675                                 }
1676                         }
1677                 }
1678                 if (maxnode == -1) {
1679                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1680                                 ctdb_addr_to_str(&tmp_ip->addr)));
1681
1682                         continue;
1683                 }
1684
1685                 /* if the spread between the smallest and largest coverage by
1686                    a node is >=2 we steal one of the ips from the node with
1687                    most coverage to even things out a bit.
1688                    try to do this a limited number of times since we dont
1689                    want to spend too much time balancing the ip coverage.
1690                 */
1691                 if ( (maxnum > minnum+1)
1692                      && (retries < (num_ips + 5)) ){
1693                         struct ctdb_public_ip_list *tmp;
1694
1695                         /* Reassign one of maxnode's VNNs */
1696                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1697                                 if (tmp->pnn == maxnode) {
1698                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1699                                         retries++;
1700                                         goto try_again;;
1701                                 }
1702                         }
1703                 }
1704         }
1705 }
1706
1707 struct ctdb_rebalancenodes {
1708         struct ctdb_rebalancenodes *next;
1709         uint32_t pnn;
1710 };
1711 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1712
1713
1714 /* set this flag to force the node to be rebalanced even if it just didnt
1715    become healthy again.
1716 */
1717 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1718 {
1719         struct ctdb_rebalancenodes *rebalance;
1720
1721         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1722                 if (rebalance->pnn == pnn) {
1723                         return;
1724                 }
1725         }
1726
1727         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1728         rebalance->pnn = pnn;
1729         rebalance->next = force_rebalance_list;
1730         force_rebalance_list = rebalance;
1731 }
1732
1733 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1734  * that we can unit test it.
1735  */
1736 static void lcp2_init(struct ctdb_context *tmp_ctx,
1737                       struct ctdb_ipflags *ipflags,
1738                       struct ctdb_public_ip_list *all_ips,
1739                       uint32_t **lcp2_imbalances,
1740                       bool **rebalance_candidates)
1741 {
1742         int i, numnodes;
1743         struct ctdb_public_ip_list *tmp_ip;
1744
1745         numnodes = talloc_array_length(ipflags);
1746
1747         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1748         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1749         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1750         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1751
1752         for (i=0; i<numnodes; i++) {
1753                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1754                 /* First step: assume all nodes are candidates */
1755                 (*rebalance_candidates)[i] = true;
1756         }
1757
1758         /* 2nd step: if a node has IPs assigned then it must have been
1759          * healthy before, so we remove it from consideration.  This
1760          * is overkill but is all we have because we don't maintain
1761          * state between takeover runs.  An alternative would be to
1762          * keep state and invalidate it every time the recovery master
1763          * changes.
1764          */
1765         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1766                 if (tmp_ip->pnn != -1) {
1767                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1768                 }
1769         }
1770
1771         /* 3rd step: if a node is forced to re-balance then
1772            we allow failback onto the node */
1773         while (force_rebalance_list != NULL) {
1774                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1775
1776                 if (force_rebalance_list->pnn <= numnodes) {
1777                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1778                 }
1779
1780                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1781                 talloc_free(force_rebalance_list);
1782                 force_rebalance_list = next;
1783         }
1784 }
1785
1786 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1787  * the IP/node combination that will cost the least.
1788  */
1789 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1790                                      struct ctdb_ipflags *ipflags,
1791                                      struct ctdb_public_ip_list *all_ips,
1792                                      uint32_t *lcp2_imbalances)
1793 {
1794         struct ctdb_public_ip_list *tmp_ip;
1795         int dstnode, numnodes;
1796
1797         int minnode;
1798         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1799         struct ctdb_public_ip_list *minip;
1800
1801         bool should_loop = true;
1802         bool have_unassigned = true;
1803
1804         numnodes = talloc_array_length(ipflags);
1805
1806         while (have_unassigned && should_loop) {
1807                 should_loop = false;
1808
1809                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1810                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1811
1812                 minnode = -1;
1813                 mindsum = 0;
1814                 minip = NULL;
1815
1816                 /* loop over each unassigned ip. */
1817                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1818                         if (tmp_ip->pnn != -1) {
1819                                 continue;
1820                         }
1821
1822                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1823                                 /* only check nodes that can actually takeover this ip */
1824                                 if (!can_node_takeover_ip(ctdb, dstnode,
1825                                                           ipflags[dstnode],
1826                                                           tmp_ip)) {
1827                                         /* no it couldnt   so skip to the next node */
1828                                         continue;
1829                                 }
1830
1831                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1832                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1833                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1834                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1835                                                    dstnode,
1836                                                    dstimbl - lcp2_imbalances[dstnode]));
1837
1838
1839                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1840                                         minnode = dstnode;
1841                                         minimbl = dstimbl;
1842                                         mindsum = dstdsum;
1843                                         minip = tmp_ip;
1844                                         should_loop = true;
1845                                 }
1846                         }
1847                 }
1848
1849                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1850
1851                 /* If we found one then assign it to the given node. */
1852                 if (minnode != -1) {
1853                         minip->pnn = minnode;
1854                         lcp2_imbalances[minnode] = minimbl;
1855                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1856                                           ctdb_addr_to_str(&(minip->addr)),
1857                                           minnode,
1858                                           mindsum));
1859                 }
1860
1861                 /* There might be a better way but at least this is clear. */
1862                 have_unassigned = false;
1863                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1864                         if (tmp_ip->pnn == -1) {
1865                                 have_unassigned = true;
1866                         }
1867                 }
1868         }
1869
1870         /* We know if we have an unassigned addresses so we might as
1871          * well optimise.
1872          */
1873         if (have_unassigned) {
1874                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1875                         if (tmp_ip->pnn == -1) {
1876                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1877                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1878                         }
1879                 }
1880         }
1881 }
1882
1883 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1884  * to move IPs from, determines the best IP/destination node
1885  * combination to move from the source node.
1886  */
1887 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1888                                     struct ctdb_ipflags *ipflags,
1889                                     struct ctdb_public_ip_list *all_ips,
1890                                     int srcnode,
1891                                     uint32_t candimbl,
1892                                     uint32_t *lcp2_imbalances,
1893                                     bool *rebalance_candidates)
1894 {
1895         int dstnode, mindstnode, numnodes;
1896         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1897         uint32_t minsrcimbl, mindstimbl;
1898         struct ctdb_public_ip_list *minip;
1899         struct ctdb_public_ip_list *tmp_ip;
1900
1901         /* Find an IP and destination node that best reduces imbalance. */
1902         minip = NULL;
1903         minsrcimbl = 0;
1904         mindstnode = -1;
1905         mindstimbl = 0;
1906
1907         numnodes = talloc_array_length(ipflags);
1908
1909         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1910         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1911
1912         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1913                 /* Only consider addresses on srcnode. */
1914                 if (tmp_ip->pnn != srcnode) {
1915                         continue;
1916                 }
1917
1918                 /* What is this IP address costing the source node? */
1919                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1920                 srcimbl = candimbl - srcdsum;
1921
1922                 /* Consider this IP address would cost each potential
1923                  * destination node.  Destination nodes are limited to
1924                  * those that are newly healthy, since we don't want
1925                  * to do gratuitous failover of IPs just to make minor
1926                  * balance improvements.
1927                  */
1928                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1929                         if (!rebalance_candidates[dstnode]) {
1930                                 continue;
1931                         }
1932
1933                         /* only check nodes that can actually takeover this ip */
1934                         if (!can_node_takeover_ip(ctdb, dstnode,
1935                                                   ipflags[dstnode], tmp_ip)) {
1936                                 /* no it couldnt   so skip to the next node */
1937                                 continue;
1938                         }
1939
1940                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1941                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1942                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1943                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1944                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1945                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1946
1947                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1948                             ((mindstnode == -1) ||                              \
1949                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1950
1951                                 minip = tmp_ip;
1952                                 minsrcimbl = srcimbl;
1953                                 mindstnode = dstnode;
1954                                 mindstimbl = dstimbl;
1955                         }
1956                 }
1957         }
1958         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1959
1960         if (mindstnode != -1) {
1961                 /* We found a move that makes things better... */
1962                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1963                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1964                                   ctdb_addr_to_str(&(minip->addr)),
1965                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1966
1967
1968                 lcp2_imbalances[srcnode] = srcimbl;
1969                 lcp2_imbalances[mindstnode] = mindstimbl;
1970                 minip->pnn = mindstnode;
1971
1972                 return true;
1973         }
1974
1975         return false;
1976         
1977 }
1978
/* Pairs a node number with its LCP2 imbalance so the two can be
 * sorted together.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.
 *
 * Orders elements by descending imbalance: returns -1 when a has the
 * larger imbalance, 1 when b has, and 0 when they are equal.  The pnn
 * field does not take part in the comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1997
1998 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1999  * node with the highest LCP2 imbalance, and then determines the best
2000  * IP/destination node combination to move from the source node.
2001  */
2002 static void lcp2_failback(struct ctdb_context *ctdb,
2003                           struct ctdb_ipflags *ipflags,
2004                           struct ctdb_public_ip_list *all_ips,
2005                           uint32_t *lcp2_imbalances,
2006                           bool *rebalance_candidates)
2007 {
2008         int i, num_rebalance_candidates, numnodes;
2009         struct lcp2_imbalance_pnn * lips;
2010         bool again;
2011
2012         numnodes = talloc_array_length(ipflags);
2013
2014 try_again:
2015
2016         /* It is only worth continuing if we have suitable target
2017          * nodes to transfer IPs to.  This check is much cheaper than
2018          * continuing on...
2019          */
2020         num_rebalance_candidates = 0;
2021         for (i=0; i<numnodes; i++) {
2022                 if (rebalance_candidates[i]) {
2023                         num_rebalance_candidates++;
2024                 }
2025         }
2026         if (num_rebalance_candidates == 0) {
2027                 return;
2028         }
2029
2030         /* Put the imbalances and nodes into an array, sort them and
2031          * iterate through candidates.  Usually the 1st one will be
2032          * used, so this doesn't cost much...
2033          */
2034         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2035         for (i=0; i<numnodes; i++) {
2036                 lips[i].imbalance = lcp2_imbalances[i];
2037                 lips[i].pnn = i;
2038         }
2039         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2040               lcp2_cmp_imbalance_pnn);
2041
2042         again = false;
2043         for (i=0; i<numnodes; i++) {
2044                 /* This means that all nodes had 0 or 1 addresses, so
2045                  * can't be imbalanced.
2046                  */
2047                 if (lips[i].imbalance == 0) {
2048                         break;
2049                 }
2050
2051                 if (lcp2_failback_candidate(ctdb,
2052                                             ipflags,
2053                                             all_ips,
2054                                             lips[i].pnn,
2055                                             lips[i].imbalance,
2056                                             lcp2_imbalances,
2057                                             rebalance_candidates)) {
2058                         again = true;
2059                         break;
2060                 }
2061         }
2062
2063         talloc_free(lips);
2064         if (again) {
2065                 goto try_again;
2066         }
2067 }
2068
2069 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2070                                     struct ctdb_ipflags *ipflags,
2071                                     struct ctdb_public_ip_list *all_ips)
2072 {
2073         struct ctdb_public_ip_list *tmp_ip;
2074
2075         /* verify that the assigned nodes can serve that public ip
2076            and set it to -1 if not
2077         */
2078         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2079                 if (tmp_ip->pnn == -1) {
2080                         continue;
2081                 }
2082                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2083                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2084                         /* this node can not serve this ip. */
2085                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2086                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2087                                            tmp_ip->pnn));
2088                         tmp_ip->pnn = -1;
2089                 }
2090         }
2091 }
2092
2093 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2094                                        struct ctdb_ipflags *ipflags,
2095                                        struct ctdb_public_ip_list *all_ips)
2096 {
2097         struct ctdb_public_ip_list *tmp_ip;
2098         int i, numnodes;
2099
2100         numnodes = talloc_array_length(ipflags);
2101
2102         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2103        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2104         *  always be allocated the same way for a specific set of
2105         *  available/unavailable nodes.
2106         */
2107
2108         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2109                 tmp_ip->pnn = i % numnodes;
2110         }
2111
2112         /* IP failback doesn't make sense with deterministic
2113          * IPs, since the modulo step above implicitly fails
2114          * back IPs to their "home" node.
2115          */
2116         if (1 == ctdb->tunable.no_ip_failback) {
2117                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2118         }
2119
2120         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2121
2122         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2123
2124         /* No failback here! */
2125 }
2126
2127 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2128                                           struct ctdb_ipflags *ipflags,
2129                                           struct ctdb_public_ip_list *all_ips)
2130 {
2131         /* This should be pushed down into basic_failback. */
2132         struct ctdb_public_ip_list *tmp_ip;
2133         int num_ips = 0;
2134         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2135                 num_ips++;
2136         }
2137
2138         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2139
2140         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2141
2142         /* If we don't want IPs to fail back then don't rebalance IPs. */
2143         if (1 == ctdb->tunable.no_ip_failback) {
2144                 return;
2145         }
2146
2147         /* Now, try to make sure the ip adresses are evenly distributed
2148            across the nodes.
2149         */
2150         basic_failback(ctdb, ipflags, all_ips, num_ips);
2151 }
2152
2153 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2154                           struct ctdb_ipflags *ipflags,
2155                           struct ctdb_public_ip_list *all_ips)
2156 {
2157         uint32_t *lcp2_imbalances;
2158         bool *rebalance_candidates;
2159
2160         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2161
2162         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2163
2164         lcp2_init(tmp_ctx, ipflags, all_ips,
2165                   &lcp2_imbalances, &rebalance_candidates);
2166
2167         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2168
2169         /* If we don't want IPs to fail back then don't rebalance IPs. */
2170         if (1 == ctdb->tunable.no_ip_failback) {
2171                 goto finished;
2172         }
2173
2174         /* Now, try to make sure the ip adresses are evenly distributed
2175            across the nodes.
2176         */
2177         lcp2_failback(ctdb, ipflags, all_ips,
2178                       lcp2_imbalances, rebalance_candidates);
2179
2180 finished:
2181         talloc_free(tmp_ctx);
2182 }
2183
2184 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2185 {
2186         int i, num_healthy;
2187
2188         /* Count how many completely healthy nodes we have */
2189         num_healthy = 0;
2190         for (i=0;i<nodemap->num;i++) {
2191                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2192                         num_healthy++;
2193                 }
2194         }
2195
2196         return num_healthy == 0;
2197 }
2198
2199 /* The calculation part of the IP allocation algorithm. */
2200 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2201                                    struct ctdb_ipflags *ipflags,
2202                                    struct ctdb_public_ip_list **all_ips_p)
2203 {
2204         /* since nodes only know about those public addresses that
2205            can be served by that particular node, no single node has
2206            a full list of all public addresses that exist in the cluster.
2207            Walk over all node structures and create a merged list of
2208            all public addresses that exist in the cluster.
2209
2210            keep the tree of ips around as ctdb->ip_tree
2211         */
2212         *all_ips_p = create_merged_ip_list(ctdb);
2213
2214         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2215                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p);
2216         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2217                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2218         } else {
2219                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2220         }
2221
2222         /* at this point ->pnn is the node which will own each IP
2223            or -1 if there is no node that can cover this ip
2224         */
2225
2226         return;
2227 }
2228
2229 struct get_tunable_callback_data {
2230         const char *tunable;
2231         uint32_t *out;
2232         bool fatal;
2233 };
2234
2235 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2236                                  int32_t res, TDB_DATA outdata,
2237                                  void *callback)
2238 {
2239         struct get_tunable_callback_data *cd =
2240                 (struct get_tunable_callback_data *)callback;
2241         int size;
2242
2243         if (res != 0) {
2244                 /* Already handled in fail callback */
2245                 return;
2246         }
2247
2248         if (outdata.dsize != sizeof(uint32_t)) {
2249                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2250                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2251                                  (int)outdata.dsize));
2252                 cd->fatal = true;
2253                 return;
2254         }
2255
2256         size = talloc_array_length(cd->out);
2257         if (pnn >= size) {
2258                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2259                                  cd->tunable, pnn, size));
2260                 return;
2261         }
2262
2263                 
2264         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2265 }
2266
2267 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2268                                        int32_t res, TDB_DATA outdata,
2269                                        void *callback)
2270 {
2271         struct get_tunable_callback_data *cd =
2272                 (struct get_tunable_callback_data *)callback;
2273
2274         switch (res) {
2275         case -ETIME:
2276                 DEBUG(DEBUG_ERR,
2277                       ("Timed out getting tunable \"%s\" from node %d\n",
2278                        cd->tunable, pnn));
2279                 cd->fatal = true;
2280                 break;
2281         case -EINVAL:
2282         case -1:
2283                 DEBUG(DEBUG_WARNING,
2284                       ("Tunable \"%s\" not implemented on node %d\n",
2285                        cd->tunable, pnn));
2286                 break;
2287         default:
2288                 DEBUG(DEBUG_ERR,
2289                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2290                        cd->tunable, pnn));
2291                 cd->fatal = true;
2292         }
2293 }
2294
2295 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2296                                         TALLOC_CTX *tmp_ctx,
2297                                         struct ctdb_node_map *nodemap,
2298                                         const char *tunable,
2299                                         uint32_t default_value)
2300 {
2301         TDB_DATA data;
2302         struct ctdb_control_get_tunable *t;
2303         uint32_t *nodes;
2304         uint32_t *tvals;
2305         struct get_tunable_callback_data callback_data;
2306         int i;
2307
2308         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2309         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2310         for (i=0; i<nodemap->num; i++) {
2311                 tvals[i] = default_value;
2312         }
2313                 
2314         callback_data.out = tvals;
2315         callback_data.tunable = tunable;
2316         callback_data.fatal = false;
2317
2318         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2319         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2320         t = (struct ctdb_control_get_tunable *)data.dptr;
2321         t->length = strlen(tunable)+1;
2322         memcpy(t->name, tunable, t->length);
2323         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2324         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2325                                       nodes, 0, TAKEOVER_TIMEOUT(),
2326                                       false, data,
2327                                       get_tunable_callback,
2328                                       get_tunable_fail_callback,
2329                                       &callback_data) != 0) {
2330                 if (callback_data.fatal) {
2331                         talloc_free(tvals);
2332                         tvals = NULL;
2333                 }
2334         }
2335         talloc_free(nodes);
2336         talloc_free(data.dptr);
2337
2338         return tvals;
2339 }
2340
2341 /* Set internal flags for IP allocation:
2342  *   Clear ip flags
2343  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2344  *   Set NOIPHOST ip flag for each INACTIVE node
2345  *   if all nodes are disabled:
2346  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2347  *   else
2348  *     Set NOIPHOST ip flags for disabled nodes
2349  */
2350 static struct ctdb_ipflags *
2351 set_ipflags_internal(struct ctdb_context *ctdb,
2352                      TALLOC_CTX *tmp_ctx,
2353                      struct ctdb_node_map *nodemap,
2354                      uint32_t *tval_noiptakeover,
2355                      uint32_t *tval_noiphostonalldisabled)
2356 {
2357         int i;
2358         struct ctdb_ipflags *ipflags;
2359
2360         /* Clear IP flags - implicit due to talloc_zero */
2361         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2362         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2363
2364         for (i=0;i<nodemap->num;i++) {
2365                 /* Can not take IPs on node with NoIPTakeover set */
2366                 if (tval_noiptakeover[i] != 0) {
2367                         ipflags[i].noiptakeover = true;
2368                 }
2369
2370                 /* Can not host IPs on INACTIVE node */
2371                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2372                         ipflags[i].noiphost = true;
2373                 }
2374         }
2375
2376         if (all_nodes_are_disabled(nodemap)) {
2377                 /* If all nodes are disabled, can not host IPs on node
2378                  * with NoIPHostOnAllDisabled set
2379                  */
2380                 for (i=0;i<nodemap->num;i++) {
2381                         if (tval_noiphostonalldisabled[i] != 0) {
2382                                 ipflags[i].noiphost = true;
2383                         }
2384                 }
2385         } else {
2386                 /* If some nodes are not disabled, then can not host
2387                  * IPs on DISABLED node
2388                  */
2389                 for (i=0;i<nodemap->num;i++) {
2390                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2391                                 ipflags[i].noiphost = true;
2392                         }
2393                 }
2394         }
2395
2396         return ipflags;
2397 }
2398
2399 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2400                                         TALLOC_CTX *tmp_ctx,
2401                                         struct ctdb_node_map *nodemap)
2402 {
2403         uint32_t *tval_noiptakeover;
2404         uint32_t *tval_noiphostonalldisabled;
2405         struct ctdb_ipflags *ipflags;
2406
2407         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2408                                                    "NoIPTakeover", 0);
2409         if (tval_noiptakeover == NULL) {
2410                 return NULL;
2411         }
2412
2413         tval_noiphostonalldisabled =
2414                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2415                                        "NoIPHostOnAllDisabled", 0);
2416         if (tval_noiphostonalldisabled == NULL) {
2417                 /* Caller frees tmp_ctx */
2418                 return NULL;
2419         }
2420
2421         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2422                                        tval_noiptakeover,
2423                                        tval_noiphostonalldisabled);
2424
2425         talloc_free(tval_noiptakeover);
2426         talloc_free(tval_noiphostonalldisabled);
2427
2428         return ipflags;
2429 }
2430
2431 /*
2432   make any IP alias changes for public addresses that are necessary 
2433  */
2434 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2435                       client_async_callback fail_callback, void *callback_data)
2436 {
2437         int i;
2438         struct ctdb_public_ip ip;
2439         struct ctdb_public_ipv4 ipv4;
2440         uint32_t *nodes;
2441         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2442         TDB_DATA data;
2443         struct timeval timeout;
2444         struct client_async_data *async_data;
2445         struct ctdb_client_control_state *state;
2446         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2447         uint32_t disable_timeout;
2448         struct ctdb_ipflags *ipflags;
2449
2450         /*
2451          * ip failover is completely disabled, just send out the 
2452          * ipreallocated event.
2453          */
2454         if (ctdb->tunable.disable_ip_failover != 0) {
2455                 goto ipreallocated;
2456         }
2457
2458         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2459         if (ipflags == NULL) {
2460                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2461                 talloc_free(tmp_ctx);
2462                 return -1;
2463         }
2464
2465         ZERO_STRUCT(ip);
2466
2467         /* Do the IP reassignment calculations */
2468         ctdb_takeover_run_core(ctdb, ipflags, &all_ips);
2469
2470         /* The IP flags need to be cleared because they should never
2471          * be seen outside the IP allocation code.
2472          */
2473
2474         /* The recovery daemon does regular sanity checks of the IPs.
2475          * However, sometimes it is overzealous and thinks changes are
2476          * required when they're already underway.  This stops the
2477          * checks for a while before we start moving IPs.
2478          */
2479         disable_timeout = ctdb->tunable.takeover_timeout;
2480         data.dptr  = (uint8_t*)&disable_timeout;
2481         data.dsize = sizeof(disable_timeout);
2482         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2483                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2484                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2485         }
2486
2487         /* now tell all nodes to delete any alias that they should not
2488            have.  This will be a NOOP on nodes that don't currently
2489            hold the given alias */
2490         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2491         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2492
2493         async_data->fail_callback = fail_callback;
2494         async_data->callback_data = callback_data;
2495
2496         for (i=0;i<nodemap->num;i++) {
2497                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2498                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2499                         continue;
2500                 }
2501
2502                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2503                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2504                                 /* This node should be serving this
2505                                    vnn so dont tell it to release the ip
2506                                 */
2507                                 continue;
2508                         }
2509                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2510                                 ipv4.pnn = tmp_ip->pnn;
2511                                 ipv4.sin = tmp_ip->addr.ip;
2512
2513                                 timeout = TAKEOVER_TIMEOUT();
2514                                 data.dsize = sizeof(ipv4);
2515                                 data.dptr  = (uint8_t *)&ipv4;
2516                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2517                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2518                                                 data, async_data,
2519                                                 &timeout, NULL);
2520                         } else {
2521                                 ip.pnn  = tmp_ip->pnn;
2522                                 ip.addr = tmp_ip->addr;
2523
2524                                 timeout = TAKEOVER_TIMEOUT();
2525                                 data.dsize = sizeof(ip);
2526                                 data.dptr  = (uint8_t *)&ip;
2527                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2528                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2529                                                 data, async_data,
2530                                                 &timeout, NULL);
2531                         }
2532
2533                         if (state == NULL) {
2534                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2535                                 talloc_free(tmp_ctx);
2536                                 return -1;
2537                         }
2538                 
2539                         ctdb_client_async_add(async_data, state);
2540                 }
2541         }
2542         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2543                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2544                 talloc_free(tmp_ctx);
2545                 return -1;
2546         }
2547         talloc_free(async_data);
2548
2549
2550         /* tell all nodes to get their own IPs */
2551         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2552         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2553
2554         async_data->fail_callback = fail_callback;
2555         async_data->callback_data = callback_data;
2556
2557         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2558                 if (tmp_ip->pnn == -1) {
2559                         /* this IP won't be taken over */
2560                         continue;
2561                 }
2562
2563                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2564                         ipv4.pnn = tmp_ip->pnn;
2565                         ipv4.sin = tmp_ip->addr.ip;
2566
2567                         timeout = TAKEOVER_TIMEOUT();
2568                         data.dsize = sizeof(ipv4);
2569                         data.dptr  = (uint8_t *)&ipv4;
2570                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2571                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2572                                         data, async_data,
2573                                         &timeout, NULL);
2574                 } else {
2575                         ip.pnn  = tmp_ip->pnn;
2576                         ip.addr = tmp_ip->addr;
2577
2578                         timeout = TAKEOVER_TIMEOUT();
2579                         data.dsize = sizeof(ip);
2580                         data.dptr  = (uint8_t *)&ip;
2581                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2582                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2583                                         data, async_data,
2584                                         &timeout, NULL);
2585                 }
2586                 if (state == NULL) {
2587                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2588                         talloc_free(tmp_ctx);
2589                         return -1;
2590                 }
2591                 
2592                 ctdb_client_async_add(async_data, state);
2593         }
2594         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2595                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2596                 talloc_free(tmp_ctx);
2597                 return -1;
2598         }
2599
2600 ipreallocated:
2601         /* 
2602          * Tell all nodes to run eventscripts to process the
2603          * "ipreallocated" event.  This can do a lot of things,
2604          * including restarting services to reconfigure them if public
2605          * IPs have moved.  Once upon a time this event only used to
2606          * update natwg.
2607          */
2608         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2609         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2610                                       nodes, 0, TAKEOVER_TIMEOUT(),
2611                                       false, tdb_null,
2612                                       NULL, fail_callback,
2613                                       callback_data) != 0) {
2614                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2615         }
2616
2617         talloc_free(tmp_ctx);
2618         return 0;
2619 }
2620
2621
2622 /*
2623   destroy a ctdb_client_ip structure
2624  */
2625 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2626 {
2627         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2628                 ctdb_addr_to_str(&ip->addr),
2629                 ntohs(ip->addr.ip.sin_port),
2630                 ip->client_id));
2631
2632         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2633         return 0;
2634 }
2635
2636 /*
2637   called by a client to inform us of a TCP connection that it is managing
2638   that should tickled with an ACK when IP takeover is done
2639   we handle both the old ipv4 style of packets as well as the new ipv4/6
2640   pdus.
2641  */
2642 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2643                                 TDB_DATA indata)
2644 {
2645         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2646         struct ctdb_control_tcp *old_addr = NULL;
2647         struct ctdb_control_tcp_addr new_addr;
2648         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2649         struct ctdb_tcp_list *tcp;
2650         struct ctdb_tcp_connection t;
2651         int ret;
2652         TDB_DATA data;
2653         struct ctdb_client_ip *ip;
2654         struct ctdb_vnn *vnn;
2655         ctdb_sock_addr addr;
2656
2657         switch (indata.dsize) {
2658         case sizeof(struct ctdb_control_tcp):
2659                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2660                 ZERO_STRUCT(new_addr);
2661                 tcp_sock = &new_addr;
2662                 tcp_sock->src.ip  = old_addr->src;
2663                 tcp_sock->dest.ip = old_addr->dest;
2664                 break;
2665         case sizeof(struct ctdb_control_tcp_addr):
2666                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2667                 break;
2668         default:
2669                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2670                                  "to ctdb_control_tcp_client. size was %d but "
2671                                  "only allowed sizes are %lu and %lu\n",
2672                                  (int)indata.dsize,
2673                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2674                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2675                 return -1;
2676         }
2677
2678         addr = tcp_sock->src;
2679         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2680         addr = tcp_sock->dest;
2681         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2682
2683         ZERO_STRUCT(addr);
2684         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2685         vnn = find_public_ip_vnn(ctdb, &addr);
2686         if (vnn == NULL) {
2687                 switch (addr.sa.sa_family) {
2688                 case AF_INET:
2689                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2690                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2691                                         ctdb_addr_to_str(&addr)));
2692                         }
2693                         break;
2694                 case AF_INET6:
2695                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2696                                 ctdb_addr_to_str(&addr)));
2697                         break;
2698                 default:
2699                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2700                 }
2701
2702                 return 0;
2703         }
2704
2705         if (vnn->pnn != ctdb->pnn) {
2706                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2707                         ctdb_addr_to_str(&addr),
2708                         client_id, client->pid));
2709                 /* failing this call will tell smbd to die */
2710                 return -1;
2711         }
2712
2713         ip = talloc(client, struct ctdb_client_ip);
2714         CTDB_NO_MEMORY(ctdb, ip);
2715
2716         ip->ctdb      = ctdb;
2717         ip->addr      = addr;
2718         ip->client_id = client_id;
2719         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2720         DLIST_ADD(ctdb->client_ip_list, ip);
2721
2722         tcp = talloc(client, struct ctdb_tcp_list);
2723         CTDB_NO_MEMORY(ctdb, tcp);
2724
2725         tcp->connection.src_addr = tcp_sock->src;
2726         tcp->connection.dst_addr = tcp_sock->dest;
2727
2728         DLIST_ADD(client->tcp_list, tcp);
2729
2730         t.src_addr = tcp_sock->src;
2731         t.dst_addr = tcp_sock->dest;
2732
2733         data.dptr = (uint8_t *)&t;
2734         data.dsize = sizeof(t);
2735
2736         switch (addr.sa.sa_family) {
2737         case AF_INET:
2738                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2739                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2740                         ctdb_addr_to_str(&tcp_sock->src),
2741                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2742                 break;
2743         case AF_INET6:
2744                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2745                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2746                         ctdb_addr_to_str(&tcp_sock->src),
2747                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2748                 break;
2749         default:
2750                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2751         }
2752
2753
2754         /* tell all nodes about this tcp connection */
2755         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2756                                        CTDB_CONTROL_TCP_ADD,
2757                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2758         if (ret != 0) {
2759                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2760                 return -1;
2761         }
2762
2763         return 0;
2764 }
2765
2766 /*
2767   find a tcp address on a list
2768  */
2769 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2770                                            struct ctdb_tcp_connection *tcp)
2771 {
2772         int i;
2773
2774         if (array == NULL) {
2775                 return NULL;
2776         }
2777
2778         for (i=0;i<array->num;i++) {
2779                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2780                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2781                         return &array->connections[i];
2782                 }
2783         }
2784         return NULL;
2785 }
2786
2787
2788
2789 /*
2790   called by a daemon to inform us of a TCP connection that one of its
2791   clients managing that should tickled with an ACK when IP takeover is
2792   done
2793  */
2794 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2795 {
2796         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2797         struct ctdb_tcp_array *tcparray;
2798         struct ctdb_tcp_connection tcp;
2799         struct ctdb_vnn *vnn;
2800
2801         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2802         if (vnn == NULL) {
2803                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2804                         ctdb_addr_to_str(&p->dst_addr)));
2805
2806                 return -1;
2807         }
2808
2809
2810         tcparray = vnn->tcp_array;
2811
2812         /* If this is the first tickle */
2813         if (tcparray == NULL) {
2814                 tcparray = talloc_size(ctdb->nodes, 
2815                         offsetof(struct ctdb_tcp_array, connections) +
2816                         sizeof(struct ctdb_tcp_connection) * 1);
2817                 CTDB_NO_MEMORY(ctdb, tcparray);
2818                 vnn->tcp_array = tcparray;
2819
2820                 tcparray->num = 0;
2821                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2822                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2823
2824                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2825                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2826                 tcparray->num++;
2827
2828                 if (tcp_update_needed) {
2829                         vnn->tcp_update_needed = true;
2830                 }
2831                 return 0;
2832         }
2833
2834
2835         /* Do we already have this tickle ?*/
2836         tcp.src_addr = p->src_addr;
2837         tcp.dst_addr = p->dst_addr;
2838         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2839                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2840                         ctdb_addr_to_str(&tcp.dst_addr),
2841                         ntohs(tcp.dst_addr.ip.sin_port),
2842                         vnn->pnn));
2843                 return 0;
2844         }
2845
2846         /* A new tickle, we must add it to the array */
2847         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2848                                         struct ctdb_tcp_connection,
2849                                         tcparray->num+1);
2850         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2851
2852         vnn->tcp_array = tcparray;
2853         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2854         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2855         tcparray->num++;
2856                                 
2857         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2858                 ctdb_addr_to_str(&tcp.dst_addr),
2859                 ntohs(tcp.dst_addr.ip.sin_port),
2860                 vnn->pnn));
2861
2862         if (tcp_update_needed) {
2863                 vnn->tcp_update_needed = true;
2864         }
2865
2866         return 0;
2867 }
2868
2869
2870 /*
2871   called by a daemon to inform us of a TCP connection that one of its
2872   clients managing that should tickled with an ACK when IP takeover is
2873   done
2874  */
2875 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2876 {
2877         struct ctdb_tcp_connection *tcpp;
2878         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2879
2880         if (vnn == NULL) {
2881                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2882                         ctdb_addr_to_str(&conn->dst_addr)));
2883                 return;
2884         }
2885
2886         /* if the array is empty we cant remove it
2887            and we dont need to do anything
2888          */
2889         if (vnn->tcp_array == NULL) {
2890                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2891                         ctdb_addr_to_str(&conn->dst_addr),
2892                         ntohs(conn->dst_addr.ip.sin_port)));
2893                 return;
2894         }
2895
2896
2897         /* See if we know this connection
2898            if we dont know this connection  then we dont need to do anything
2899          */
2900         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2901         if (tcpp == NULL) {
2902                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2903                         ctdb_addr_to_str(&conn->dst_addr),
2904                         ntohs(conn->dst_addr.ip.sin_port)));
2905                 return;
2906         }
2907
2908
2909         /* We need to remove this entry from the array.
2910            Instead of allocating a new array and copying data to it
2911            we cheat and just copy the last entry in the existing array
2912            to the entry that is to be removed and just shring the 
2913            ->num field
2914          */
2915         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2916         vnn->tcp_array->num--;
2917
2918         /* If we deleted the last entry we also need to remove the entire array
2919          */
2920         if (vnn->tcp_array->num == 0) {
2921                 talloc_free(vnn->tcp_array);
2922                 vnn->tcp_array = NULL;
2923         }               
2924
2925         vnn->tcp_update_needed = true;
2926
2927         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2928                 ctdb_addr_to_str(&conn->src_addr),
2929                 ntohs(conn->src_addr.ip.sin_port)));
2930 }
2931
2932
2933 /*
2934   called by a daemon to inform us of a TCP connection that one of its
2935   clients used are no longer needed in the tickle database
2936  */
2937 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2938 {
2939         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2940
2941         ctdb_remove_tcp_connection(ctdb, conn);
2942
2943         return 0;
2944 }
2945
2946
/*
  called when a daemon restarts - intended to send all tickles for all
  public addresses we are serving immediately to the new node.

  Currently a stub: both arguments are ignored and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2956
2957
2958 /*
2959   called when a client structure goes away - hook to remove
2960   elements from the tcp_list in all daemons
2961  */
2962 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2963 {
2964         while (client->tcp_list) {
2965                 struct ctdb_tcp_list *tcp = client->tcp_list;
2966                 DLIST_REMOVE(client->tcp_list, tcp);
2967                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2968         }
2969 }
2970
2971
2972 /*
2973   release all IPs on shutdown
2974  */
2975 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2976 {
2977         struct ctdb_vnn *vnn;
2978
2979         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2980                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2981                         ctdb_vnn_unassign_iface(ctdb, vnn);
2982                         continue;
2983                 }
2984                 if (!vnn->iface) {
2985                         continue;
2986                 }
2987                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2988                                   ctdb_vnn_iface_string(vnn),
2989                                   ctdb_addr_to_str(&vnn->public_address),
2990                                   vnn->public_netmask_bits);
2991                 release_kill_clients(ctdb, &vnn->public_address);
2992                 ctdb_vnn_unassign_iface(ctdb, vnn);
2993         }
2994 }
2995
2996
2997 /*
2998   get list of public IPs
2999  */
3000 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3001                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3002 {
3003         int i, num, len;
3004         struct ctdb_all_public_ips *ips;
3005         struct ctdb_vnn *vnn;
3006         bool only_available = false;
3007
3008         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3009                 only_available = true;
3010         }
3011
3012         /* count how many public ip structures we have */
3013         num = 0;
3014         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3015                 num++;
3016         }
3017
3018         len = offsetof(struct ctdb_all_public_ips, ips) + 
3019                 num*sizeof(struct ctdb_public_ip);
3020         ips = talloc_zero_size(outdata, len);
3021         CTDB_NO_MEMORY(ctdb, ips);
3022
3023         i = 0;
3024         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3025                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3026                         continue;
3027                 }
3028                 ips->ips[i].pnn  = vnn->pnn;
3029                 ips->ips[i].addr = vnn->public_address;
3030                 i++;
3031         }
3032         ips->num = i;
3033         len = offsetof(struct ctdb_all_public_ips, ips) +
3034                 i*sizeof(struct ctdb_public_ip);
3035
3036         outdata->dsize = len;
3037         outdata->dptr  = (uint8_t *)ips;
3038
3039         return 0;
3040 }
3041
3042
3043 /*
3044   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3045  */
3046 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3047                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3048 {
3049         int i, num, len;
3050         struct ctdb_all_public_ipsv4 *ips;
3051         struct ctdb_vnn *vnn;
3052
3053         /* count how many public ip structures we have */
3054         num = 0;
3055         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3056                 if (vnn->public_address.sa.sa_family != AF_INET) {
3057                         continue;
3058                 }
3059                 num++;
3060         }
3061
3062         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3063                 num*sizeof(struct ctdb_public_ipv4);
3064         ips = talloc_zero_size(outdata, len);
3065         CTDB_NO_MEMORY(ctdb, ips);
3066
3067         outdata->dsize = len;
3068         outdata->dptr  = (uint8_t *)ips;
3069
3070         ips->num = num;
3071         i = 0;
3072         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3073                 if (vnn->public_address.sa.sa_family != AF_INET) {
3074                         continue;
3075                 }
3076                 ips->ips[i].pnn = vnn->pnn;
3077                 ips->ips[i].sin = vnn->public_address.ip;
3078                 i++;
3079         }
3080
3081         return 0;
3082 }
3083
3084 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3085                                         struct ctdb_req_control *c,
3086                                         TDB_DATA indata,
3087                                         TDB_DATA *outdata)
3088 {
3089         int i, num, len;
3090         ctdb_sock_addr *addr;
3091         struct ctdb_control_public_ip_info *info;
3092         struct ctdb_vnn *vnn;
3093
3094         addr = (ctdb_sock_addr *)indata.dptr;
3095
3096         vnn = find_public_ip_vnn(ctdb, addr);
3097         if (vnn == NULL) {
3098                 /* if it is not a public ip   it could be our 'single ip' */
3099                 if (ctdb->single_ip_vnn) {
3100                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3101                                 vnn = ctdb->single_ip_vnn;
3102                         }
3103                 }
3104         }
3105         if (vnn == NULL) {
3106                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3107                                  "'%s'not a public address\n",
3108                                  ctdb_addr_to_str(addr)));
3109                 return -1;
3110         }
3111
3112         /* count how many public ip structures we have */
3113         num = 0;
3114         for (;vnn->ifaces[num];) {
3115                 num++;
3116         }
3117
3118         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3119                 num*sizeof(struct ctdb_control_iface_info);
3120         info = talloc_zero_size(outdata, len);
3121         CTDB_NO_MEMORY(ctdb, info);
3122
3123         info->ip.addr = vnn->public_address;
3124         info->ip.pnn = vnn->pnn;
3125         info->active_idx = 0xFFFFFFFF;
3126
3127         for (i=0; vnn->ifaces[i]; i++) {
3128                 struct ctdb_iface *cur;
3129
3130                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3131                 if (cur == NULL) {
3132                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3133                                            vnn->ifaces[i]));
3134                         return -1;
3135                 }
3136                 if (vnn->iface == cur) {
3137                         info->active_idx = i;
3138                 }
3139                 strcpy(info->ifaces[i].name, cur->name);
3140                 info->ifaces[i].link_state = cur->link_up;
3141                 info->ifaces[i].references = cur->references;
3142         }
3143         info->num = i;
3144         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3145                 i*sizeof(struct ctdb_control_iface_info);
3146
3147         outdata->dsize = len;
3148         outdata->dptr  = (uint8_t *)info;
3149
3150         return 0;
3151 }
3152
3153 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3154                                 struct ctdb_req_control *c,
3155                                 TDB_DATA *outdata)
3156 {
3157         int i, num, len;
3158         struct ctdb_control_get_ifaces *ifaces;
3159         struct ctdb_iface *cur;
3160
3161         /* count how many public ip structures we have */
3162         num = 0;
3163         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3164                 num++;
3165         }
3166
3167         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3168                 num*sizeof(struct ctdb_control_iface_info);
3169         ifaces = talloc_zero_size(outdata, len);
3170         CTDB_NO_MEMORY(ctdb, ifaces);
3171
3172         i = 0;
3173         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3174                 strcpy(ifaces->ifaces[i].name, cur->name);
3175                 ifaces->ifaces[i].link_state = cur->link_up;
3176                 ifaces->ifaces[i].references = cur->references;
3177                 i++;
3178         }
3179         ifaces->num = i;
3180         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3181                 i*sizeof(struct ctdb_control_iface_info);
3182
3183         outdata->dsize = len;
3184         outdata->dptr  = (uint8_t *)ifaces;
3185
3186         return 0;
3187 }
3188
3189 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3190                                     struct ctdb_req_control *c,
3191                                     TDB_DATA indata)
3192 {
3193         struct ctdb_control_iface_info *info;
3194         struct ctdb_iface *iface;
3195         bool link_up = false;
3196
3197         info = (struct ctdb_control_iface_info *)indata.dptr;
3198
3199         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3200                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3201                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3202                                   len, len, info->name));
3203                 return -1;
3204         }
3205
3206         switch (info->link_state) {
3207         case 0:
3208                 link_up = false;
3209                 break;
3210         case 1:
3211                 link_up = true;
3212                 break;
3213         default:
3214                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3215                                   (unsigned int)info->link_state));
3216                 return -1;
3217         }
3218
3219         if (info->references != 0) {
3220                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3221                                   (unsigned int)info->references));
3222                 return -1;
3223         }
3224
3225         iface = ctdb_find_iface(ctdb, info->name);
3226         if (iface == NULL) {
3227                 return -1;
3228         }
3229
3230         if (link_up == iface->link_up) {
3231                 return 0;
3232         }
3233
3234         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3235               ("iface[%s] has changed it's link status %s => %s\n",
3236                iface->name,
3237                iface->link_up?"up":"down",
3238                link_up?"up":"down"));
3239
3240         iface->link_up = link_up;
3241         return 0;
3242 }
3243
3244
3245 /* 
3246    structure containing the listening socket and the list of tcp connections
3247    that the ctdb daemon is to kill
3248 */
3249 struct ctdb_kill_tcp {
3250         struct ctdb_vnn *vnn;
3251         struct ctdb_context *ctdb;
3252         int capture_fd;
3253         struct fd_event *fde;
3254         trbt_tree_t *connections;
3255         void *private_data;
3256 };
3257
3258 /*
3259   a tcp connection that is to be killed
3260  */
3261 struct ctdb_killtcp_con {
3262         ctdb_sock_addr src_addr;
3263         ctdb_sock_addr dst_addr;
3264         int count;
3265         struct ctdb_kill_tcp *killtcp;
3266 };
3267
3268 /* this function is used to create a key to represent this socketpair
3269    in the killtcp tree.
3270    this key is used to insert and lookup matching socketpairs that are
3271    to be tickled and RST
3272 */
3273 #define KILLTCP_KEYLEN  10
3274 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3275 {
3276         static uint32_t key[KILLTCP_KEYLEN];
3277
3278         bzero(key, sizeof(key));
3279
3280         if (src->sa.sa_family != dst->sa.sa_family) {
3281                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3282                 return key;
3283         }
3284         
3285         switch (src->sa.sa_family) {
3286         case AF_INET:
3287                 key[0]  = dst->ip.sin_addr.s_addr;
3288                 key[1]  = src->ip.sin_addr.s_addr;
3289                 key[2]  = dst->ip.sin_port;
3290                 key[3]  = src->ip.sin_port;
3291                 break;
3292         case AF_INET6: {
3293                 uint32_t *dst6_addr32 =
3294                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3295                 uint32_t *src6_addr32 =
3296                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3297                 key[0]  = dst6_addr32[3];
3298                 key[1]  = src6_addr32[3];
3299                 key[2]  = dst6_addr32[2];
3300                 key[3]  = src6_addr32[2];
3301                 key[4]  = dst6_addr32[1];
3302                 key[5]  = src6_addr32[1];
3303                 key[6]  = dst6_addr32[0];
3304                 key[7]  = src6_addr32[0];
3305                 key[8]  = dst->ip6.sin6_port;
3306                 key[9]  = src->ip6.sin6_port;
3307                 break;
3308         }
3309         default:
3310                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3311                 return key;
3312         }
3313
3314         return key;
3315 }
3316
3317 /*
3318   called when we get a read event on the raw socket
3319  */
3320 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3321                                 uint16_t flags, void *private_data)
3322 {
3323         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3324         struct ctdb_killtcp_con *con;
3325         ctdb_sock_addr src, dst;
3326         uint32_t ack_seq, seq;
3327
3328         if (!(flags & EVENT_FD_READ)) {
3329                 return;
3330         }
3331
3332         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3333                                 killtcp->private_data,
3334                                 &src, &dst,
3335                                 &ack_seq, &seq) != 0) {
3336                 /* probably a non-tcp ACK packet */
3337                 return;
3338         }
3339
3340         /* check if we have this guy in our list of connections
3341            to kill
3342         */
3343         con = trbt_lookuparray32(killtcp->connections, 
3344                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3345         if (con == NULL) {
3346                 /* no this was some other packet we can just ignore */
3347                 return;
3348         }
3349
3350         /* This one has been tickled !
3351            now reset him and remove him from the list.
3352          */
3353         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3354                 ntohs(con->dst_addr.ip.sin_port),
3355                 ctdb_addr_to_str(&con->src_addr),
3356                 ntohs(con->src_addr.ip.sin_port)));
3357
3358         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3359         talloc_free(con);
3360 }
3361
3362
3363 /* when traversing the list of all tcp connections to send tickle acks to
3364    (so that we can capture the ack coming back and kill the connection
3365     by a RST)
3366    this callback is called for each connection we are currently trying to kill
3367 */
3368 static int tickle_connection_traverse(void *param, void *data)
3369 {
3370         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3371
3372         /* have tried too many times, just give up */
3373         if (con->count >= 5) {
3374                 /* can't delete in traverse: reparent to delete_cons */
3375                 talloc_steal(param, con);
3376                 return 0;
3377         }
3378
3379         /* othervise, try tickling it again */
3380         con->count++;
3381         ctdb_sys_send_tcp(
3382                 (ctdb_sock_addr *)&con->dst_addr,
3383                 (ctdb_sock_addr *)&con->src_addr,
3384                 0, 0, 0);
3385         return 0;
3386 }
3387
3388
3389 /* 
3390    called every second until all sentenced connections have been reset
3391  */
3392 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3393                                               struct timeval t, void *private_data)
3394 {
3395         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3396         void *delete_cons = talloc_new(NULL);
3397
3398         /* loop over all connections sending tickle ACKs */
3399         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3400
3401         /* now we've finished traverse, it's safe to do deletion. */
3402         talloc_free(delete_cons);
3403
3404         /* If there are no more connections to kill we can remove the
3405            entire killtcp structure
3406          */
3407         if ( (killtcp->connections == NULL) || 
3408              (killtcp->connections->root == NULL) ) {
3409                 talloc_free(killtcp);
3410                 return;
3411         }
3412
3413         /* try tickling them again in a seconds time
3414          */
3415         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3416                         ctdb_tickle_sentenced_connections, killtcp);
3417 }
3418
3419 /*
3420   destroy the killtcp structure
3421  */
3422 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3423 {
3424         struct ctdb_vnn *tmpvnn;
3425
3426         /* verify that this vnn is still active */
3427         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3428                 if (tmpvnn == killtcp->vnn) {
3429                         break;
3430                 }
3431         }
3432
3433         if (tmpvnn == NULL) {
3434                 return 0;
3435         }
3436
3437         if (killtcp->vnn->killtcp != killtcp) {
3438                 return 0;
3439         }
3440
3441         killtcp->vnn->killtcp = NULL;
3442
3443         return 0;
3444 }
3445
3446
/* insert callback for the killtcp tree.

   parm is the new connection structure, data whatever is already
   stored under the same key.  The new one always wins: parm is
   returned and data is ignored.

   the old one is deliberately not freed here, see the talloc
   parenting done by the tree code
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
3458
3459 /*
3460   add a tcp socket to the list of connections we want to RST
3461  */
3462 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3463                                        ctdb_sock_addr *s,
3464                                        ctdb_sock_addr *d)
3465 {
3466         ctdb_sock_addr src, dst;
3467         struct ctdb_kill_tcp *killtcp;
3468         struct ctdb_killtcp_con *con;
3469         struct ctdb_vnn *vnn;
3470
3471         ctdb_canonicalize_ip(s, &src);
3472         ctdb_canonicalize_ip(d, &dst);
3473
3474         vnn = find_public_ip_vnn(ctdb, &dst);
3475         if (vnn == NULL) {
3476                 vnn = find_public_ip_vnn(ctdb, &src);
3477         }
3478         if (vnn == NULL) {
3479                 /* if it is not a public ip   it could be our 'single ip' */
3480                 if (ctdb->single_ip_vnn) {
3481                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3482                                 vnn = ctdb->single_ip_vnn;
3483                         }
3484                 }
3485         }
3486         if (vnn == NULL) {
3487                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3488                 return -1;
3489         }
3490
3491         killtcp = vnn->killtcp;
3492         
3493         /* If this is the first connection to kill we must allocate
3494            a new structure
3495          */
3496         if (killtcp == NULL) {
3497                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3498                 CTDB_NO_MEMORY(ctdb, killtcp);
3499
3500                 killtcp->vnn         = vnn;
3501                 killtcp->ctdb        = ctdb;
3502                 killtcp->capture_fd  = -1;
3503                 killtcp->connections = trbt_create(killtcp, 0);
3504
3505                 vnn->killtcp         = killtcp;
3506                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3507         }
3508
3509
3510
3511         /* create a structure that describes this connection we want to
3512            RST and store it in killtcp->connections
3513         */
3514         con = talloc(killtcp, struct ctdb_killtcp_con);
3515         CTDB_NO_MEMORY(ctdb, con);
3516         con->src_addr = src;
3517         con->dst_addr = dst;
3518         con->count    = 0;
3519         con->killtcp  = killtcp;
3520
3521
3522         trbt_insertarray32_callback(killtcp->connections,
3523                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3524                         add_killtcp_callback, con);
3525
3526         /* 
3527            If we dont have a socket to listen on yet we must create it
3528          */
3529         if (killtcp->capture_fd == -1) {
3530                 const char *iface = ctdb_vnn_iface_string(vnn);
3531                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3532                 if (killtcp->capture_fd == -1) {
3533                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3534                                           "socket on iface '%s' for killtcp (%s)\n",
3535                                           iface, strerror(errno)));
3536                         goto failed;
3537                 }
3538         }
3539
3540
3541         if (killtcp->fde == NULL) {
3542                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3543                                             EVENT_FD_READ,
3544                                             capture_tcp_handler, killtcp);
3545                 tevent_fd_set_auto_close(killtcp->fde);
3546
3547                 /* We also need to set up some events to tickle all these connections
3548                    until they are all reset
3549                 */
3550                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3551                                 ctdb_tickle_sentenced_connections, killtcp);
3552         }
3553
3554         /* tickle him once now */
3555         ctdb_sys_send_tcp(
3556                 &con->dst_addr,
3557                 &con->src_addr,
3558                 0, 0, 0);
3559
3560         return 0;
3561
3562 failed:
3563         talloc_free(vnn->killtcp);
3564         vnn->killtcp = NULL;
3565         return -1;
3566 }
3567
3568 /*
3569   kill a TCP connection.
3570  */
3571 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3572 {
3573         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3574
3575         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3576 }
3577
3578 /*
3579   called by a daemon to inform us of the entire list of TCP tickles for
3580   a particular public address.
3581   this control should only be sent by the node that is currently serving
3582   that public address.
3583  */
3584 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3585 {
3586         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3587         struct ctdb_tcp_array *tcparray;
3588         struct ctdb_vnn *vnn;
3589
3590         /* We must at least have tickles.num or else we cant verify the size
3591            of the received data blob
3592          */
3593         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3594                                         tickles.connections)) {
3595                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3596                 return -1;
3597         }
3598
3599         /* verify that the size of data matches what we expect */
3600         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3601                                 tickles.connections)
3602                          + sizeof(struct ctdb_tcp_connection)
3603                                  * list->tickles.num) {
3604                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3605                 return -1;
3606         }       
3607
3608         vnn = find_public_ip_vnn(ctdb, &list->addr);
3609         if (vnn == NULL) {
3610                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3611                         ctdb_addr_to_str(&list->addr)));
3612
3613                 return 1;
3614         }
3615
3616         /* remove any old ticklelist we might have */
3617         talloc_free(vnn->tcp_array);
3618         vnn->tcp_array = NULL;
3619
3620         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3621         CTDB_NO_MEMORY(ctdb, tcparray);
3622
3623         tcparray->num = list->tickles.num;
3624
3625         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3626         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3627
3628         memcpy(tcparray->connections, &list->tickles.connections[0], 
3629                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3630
3631         /* We now have a new fresh tickle list array for this vnn */
3632         vnn->tcp_array = talloc_steal(vnn, tcparray);
3633         
3634         return 0;
3635 }
3636
3637 /*
3638   called to return the full list of tickles for the puclic address associated 
3639   with the provided vnn
3640  */
3641 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3642 {
3643         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3644         struct ctdb_control_tcp_tickle_list *list;
3645         struct ctdb_tcp_array *tcparray;
3646         int num;
3647         struct ctdb_vnn *vnn;
3648
3649         vnn = find_public_ip_vnn(ctdb, addr);
3650         if (vnn == NULL) {
3651                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3652                         ctdb_addr_to_str(addr)));
3653
3654                 return 1;
3655         }
3656
3657         tcparray = vnn->tcp_array;
3658         if (tcparray) {
3659                 num = tcparray->num;
3660         } else {
3661                 num = 0;
3662         }
3663
3664         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3665                                 tickles.connections)
3666                         + sizeof(struct ctdb_tcp_connection) * num;
3667
3668         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3669         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3670         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3671
3672         list->addr = *addr;
3673         list->tickles.num = num;
3674         if (num) {
3675                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3676                         sizeof(struct ctdb_tcp_connection) * num);
3677         }
3678
3679         return 0;
3680 }
3681
3682
3683 /*
3684   set the list of all tcp tickles for a public address
3685  */
3686 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3687                               struct timeval timeout, uint32_t destnode, 
3688                               ctdb_sock_addr *addr,
3689                               struct ctdb_tcp_array *tcparray)
3690 {
3691         int ret, num;
3692         TDB_DATA data;
3693         struct ctdb_control_tcp_tickle_list *list;
3694
3695         if (tcparray) {
3696                 num = tcparray->num;
3697         } else {
3698                 num = 0;
3699         }
3700
3701         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3702                                 tickles.connections) +
3703                         sizeof(struct ctdb_tcp_connection) * num;
3704         data.dptr = talloc_size(ctdb, data.dsize);
3705         CTDB_NO_MEMORY(ctdb, data.dptr);
3706
3707         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3708         list->addr = *addr;
3709         list->tickles.num = num;
3710         if (tcparray) {
3711                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3712         }
3713
3714         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3715                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3716                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3717         if (ret != 0) {
3718                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3719                 return -1;
3720         }
3721
3722         talloc_free(data.dptr);
3723
3724         return ret;
3725 }
3726
3727
3728 /*
3729   perform tickle updates if required
3730  */
3731 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3732                                 struct timed_event *te, 
3733                                 struct timeval t, void *private_data)
3734 {
3735         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3736         int ret;
3737         struct ctdb_vnn *vnn;
3738
3739         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3740                 /* we only send out updates for public addresses that 
3741                    we have taken over
3742                  */
3743                 if (ctdb->pnn != vnn->pnn) {
3744                         continue;
3745                 }
3746                 /* We only send out the updates if we need to */
3747                 if (!vnn->tcp_update_needed) {
3748                         continue;
3749                 }
3750                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3751                                 TAKEOVER_TIMEOUT(),
3752                                 CTDB_BROADCAST_CONNECTED,
3753                                 &vnn->public_address,
3754                                 vnn->tcp_array);
3755                 if (ret != 0) {
3756                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3757                                 ctdb_addr_to_str(&vnn->public_address)));
3758                 }
3759         }
3760
3761         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3762                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3763                              ctdb_update_tcp_tickles, ctdb);
3764 }               
3765         
3766
3767 /*
3768   start periodic update of tcp tickles
3769  */
3770 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3771 {
3772         ctdb->tickle_update_context = talloc_new(ctdb);
3773
3774         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3775                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3776                              ctdb_update_tcp_tickles, ctdb);
3777 }
3778
3779
3780
3781
3782 struct control_gratious_arp {
3783         struct ctdb_context *ctdb;
3784         ctdb_sock_addr addr;
3785         const char *iface;
3786         int count;
3787 };
3788
3789 /*
3790   send a control_gratuitous arp
3791  */
3792 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3793                                   struct timeval t, void *private_data)
3794 {
3795         int ret;
3796         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3797                                                         struct control_gratious_arp);
3798
3799         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3800         if (ret != 0) {
3801                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3802                                  arp->iface, strerror(errno)));
3803         }
3804
3805
3806         arp->count++;
3807         if (arp->count == CTDB_ARP_REPEAT) {
3808                 talloc_free(arp);
3809                 return;
3810         }
3811
3812         event_add_timed(arp->ctdb->ev, arp, 
3813                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3814                         send_gratious_arp, arp);
3815 }
3816
3817
3818 /*
3819   send a gratious arp 
3820  */
3821 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3822 {
3823         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3824         struct control_gratious_arp *arp;
3825
3826         /* verify the size of indata */
3827         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3828                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3829                                  (unsigned)indata.dsize, 
3830                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3831                 return -1;
3832         }
3833         if (indata.dsize != 
3834                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3835                 + gratious_arp->len ) ){
3836
3837                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3838                         "but should be %u bytes\n", 
3839                          (unsigned)indata.dsize, 
3840                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3841                 return -1;
3842         }
3843
3844
3845         arp = talloc(ctdb, struct control_gratious_arp);
3846         CTDB_NO_MEMORY(ctdb, arp);
3847
3848         arp->ctdb  = ctdb;
3849         arp->addr   = gratious_arp->addr;
3850         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3851         CTDB_NO_MEMORY(ctdb, arp->iface);
3852         arp->count = 0;
3853         
3854         event_add_timed(arp->ctdb->ev, arp, 
3855                         timeval_zero(), send_gratious_arp, arp);
3856
3857         return 0;
3858 }
3859
3860 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3861 {
3862         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3863         int ret;
3864
3865         /* verify the size of indata */
3866         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3867                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3868                 return -1;
3869         }
3870         if (indata.dsize != 
3871                 ( offsetof(struct ctdb_control_ip_iface, iface)
3872                 + pub->len ) ){
3873
3874                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3875                         "but should be %u bytes\n", 
3876                          (unsigned)indata.dsize, 
3877                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3878                 return -1;
3879         }
3880
3881         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
3882
3883         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3884
3885         if (ret != 0) {
3886                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3887                 return -1;
3888         }
3889
3890         return 0;
3891 }
3892
/*
  called when the releaseip event started by
  ctdb_control_del_public_address() has finished.

  private_data is the talloc context that was passed in when the
  event was started; it is freed here whatever the status was.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        (void)ctdb;
        (void)status;

        talloc_free(private_data);
}
3901
3902 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3903 {
3904         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3905         struct ctdb_vnn *vnn;
3906         int ret;
3907
3908         /* verify the size of indata */
3909         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3910                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3911                 return -1;
3912         }
3913         if (indata.dsize != 
3914                 ( offsetof(struct ctdb_control_ip_iface, iface)
3915                 + pub->len ) ){
3916
3917                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3918                         "but should be %u bytes\n", 
3919                          (unsigned)indata.dsize, 
3920                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3921                 return -1;
3922         }
3923
3924         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
3925
3926         /* walk over all public addresses until we find a match */
3927         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3928                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3929                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3930
3931                         DLIST_REMOVE(ctdb->vnn, vnn);
3932                         talloc_steal(mem_ctx, vnn);
3933                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3934                         if (vnn->pnn != ctdb->pnn) {
3935                                 if (vnn->iface != NULL) {
3936                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3937                                 }
3938                                 talloc_free(mem_ctx);
3939                                 return 0;
3940                         }
3941                         vnn->pnn = -1;
3942
3943                         ret = ctdb_event_script_callback(ctdb, 
3944                                          mem_ctx, delete_ip_callback, mem_ctx,
3945                                          false,
3946                                          CTDB_EVENT_RELEASE_IP,
3947                                          "%s %s %u",
3948                                          ctdb_vnn_iface_string(vnn),
3949                                          ctdb_addr_to_str(&vnn->public_address),
3950                                          vnn->public_netmask_bits);
3951                         if (vnn->iface != NULL) {
3952                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3953                         }
3954                         if (ret != 0) {
3955                                 return -1;
3956                         }
3957                         return 0;
3958                 }
3959         }
3960
3961         return -1;
3962 }
3963
3964
/*
  State kept while an "ipreallocated" event script is running, so that the
  completion callback can answer the control that requested it.
 */
struct ipreallocated_callback_state {
        /* the control request that is answered once the event has finished */
        struct ctdb_req_control *c;
};
3968
3969 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3970                                         int status, void *p)
3971 {
3972         struct ipreallocated_callback_state *state =
3973                 talloc_get_type(p, struct ipreallocated_callback_state);
3974
3975         if (status != 0) {
3976                 DEBUG(DEBUG_ERR,
3977                       (" \"ipreallocated\" event script failed (status %d)\n",
3978                        status));
3979                 if (status == -ETIME) {
3980                         ctdb_ban_self(ctdb);
3981                 }
3982         }
3983
3984         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3985         talloc_free(state);
3986 }
3987
3988 /* A control to run the ipreallocated event */
3989 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3990                                    struct ctdb_req_control *c,
3991                                    bool *async_reply)
3992 {
3993         int ret;
3994         struct ipreallocated_callback_state *state;
3995
3996         state = talloc(ctdb, struct ipreallocated_callback_state);
3997         CTDB_NO_MEMORY(ctdb, state);
3998
3999         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4000
4001         ret = ctdb_event_script_callback(ctdb, state,
4002                                          ctdb_ipreallocated_callback, state,
4003                                          false, CTDB_EVENT_IPREALLOCATED,
4004                                          "%s", "");
4005
4006         if (ret != 0) {
4007                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4008                 talloc_free(state);
4009                 return -1;
4010         }
4011
4012         /* tell the control that we will be reply asynchronously */
4013         state->c    = talloc_steal(state, c);
4014         *async_reply = true;
4015
4016         return 0;
4017 }
4018
4019
4020 /* This function is called from the recovery daemon to verify that a remote
4021    node has the expected ip allocation.
4022    This is verified against ctdb->ip_tree
4023 */
4024 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
4025 {
4026         struct ctdb_public_ip_list *tmp_ip; 
4027         int i;
4028
4029         if (ctdb->ip_tree == NULL) {
4030                 /* dont know the expected allocation yet, assume remote node
4031                    is correct. */
4032                 return 0;
4033         }
4034
4035         if (ips == NULL) {
4036                 return 0;
4037         }
4038
4039         for (i=0; i<ips->num; i++) {
4040                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4041                 if (tmp_ip == NULL) {
4042                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4043                         return -1;
4044                 }
4045
4046                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4047                         continue;
4048                 }
4049
4050                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4051                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
4052                         return -1;
4053                 }
4054         }
4055
4056         return 0;
4057 }
4058
4059 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4060 {
4061         struct ctdb_public_ip_list *tmp_ip; 
4062
4063         if (ctdb->ip_tree == NULL) {
4064                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4065                 return -1;
4066         }
4067
4068         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4069         if (tmp_ip == NULL) {
4070                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4071                 return -1;
4072         }
4073
4074         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4075         tmp_ip->pnn = ip->pnn;
4076
4077         return 0;
4078 }
4079
4080
/*
  Bookkeeping for one in-flight "reload public ips" request.  The work is
  done by a forked child; this handle lives in the parent and ties together
  the child, the pipe it reports on and the control waiting for the result.
 */
struct ctdb_reloadips_handle {
        /* owning ctdb context */
        struct ctdb_context *ctdb;
        /* control request to answer when the handle is destroyed, or NULL */
        struct ctdb_req_control *c;
        /* status sent in the reply; -1 until the child reports a result */
        int status;
        /* pipe from the child: fd[0] is read by the parent, fd[1] is
           written by the child */
        int fd[2];
        /* pid of the forked child */
        pid_t child;
        /* fd event watching fd[0] for the child's result */
        struct fd_event *fde;
};
4089
4090 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4091 {
4092         if (h == h->ctdb->reload_ips) {
4093                 h->ctdb->reload_ips = NULL;
4094         }
4095         if (h->c != NULL) {
4096                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4097                 h->c = NULL;
4098         }
4099         ctdb_kill(h->ctdb, h->child, SIGKILL);
4100         return 0;
4101 }
4102
4103 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4104                                 struct timed_event *te,
4105                                 struct timeval t, void *private_data)
4106 {
4107         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4108
4109         talloc_free(h);
4110 }       
4111
4112 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4113                              uint16_t flags, void *private_data)
4114 {
4115         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4116
4117         char res;
4118         int ret;
4119
4120         ret = read(h->fd[0], &res, 1);
4121         if (ret < 1 || res != 0) {
4122                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4123                 res = 1;
4124         }
4125         h->status = res;
4126
4127         talloc_free(h);
4128 }
4129
4130 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4131 {
4132         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4133         struct ctdb_all_public_ips *ips;
4134         struct ctdb_vnn *vnn;
4135         int i, ret;
4136
4137         /* read the ip allocation from the local node */
4138         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4139         if (ret != 0) {
4140                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4141                 talloc_free(mem_ctx);
4142                 return -1;
4143         }
4144
4145         /* re-read the public ips file */
4146         ctdb->vnn = NULL;
4147         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4148                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4149                 talloc_free(mem_ctx);
4150                 return -1;
4151         }               
4152
4153
4154         /* check the previous list of ips and scan for ips that have been
4155            dropped.
4156          */
4157         for (i = 0; i < ips->num; i++) {
4158                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4159                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4160                                 break;
4161                         }
4162                 }
4163
4164                 /* we need to delete this ip, no longer available on this node */
4165                 if (vnn == NULL) {
4166                         struct ctdb_control_ip_iface pub;
4167
4168                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4169                         pub.addr  = ips->ips[i].addr;
4170                         pub.mask  = 0;
4171                         pub.len   = 0;
4172
4173                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4174                         if (ret != 0) {
4175                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4176                                 return -1;
4177                         }
4178                 }
4179         }
4180
4181
4182         /* loop over all new ones and check the ones we need to add */
4183         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4184                 for (i = 0; i < ips->num; i++) {
4185                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4186                                 break;
4187                         }
4188                 }
4189                 if (i == ips->num) {
4190                         struct ctdb_control_ip_iface pub;
4191                         const char *ifaces = NULL;
4192                         int iface = 0;
4193
4194                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4195
4196                         pub.addr  = vnn->public_address;
4197                         pub.mask  = vnn->public_netmask_bits;
4198
4199
4200                         ifaces = vnn->ifaces[0];
4201                         iface = 1;
4202                         while (vnn->ifaces[iface] != NULL) {
4203                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4204                                 iface++;
4205                         }
4206                         pub.len   = strlen(ifaces)+1;
4207                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4208
4209                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4210                         if (ret != 0) {
4211                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4212                                 return -1;
4213                         }
4214                 }
4215         }
4216
4217         return 0;
4218 }
4219
4220 /* This control is sent to force the node to re-read the public addresses file
4221    and drop any addresses we should nnot longer host, and add new addresses
4222    that we are now able to host
4223 */
4224 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4225 {
4226         struct ctdb_reloadips_handle *h;
4227         pid_t parent = getpid();
4228
4229         if (ctdb->reload_ips != NULL) {
4230                 talloc_free(ctdb->reload_ips);
4231                 ctdb->reload_ips = NULL;
4232         }
4233
4234         h = talloc(ctdb, struct ctdb_reloadips_handle);
4235         CTDB_NO_MEMORY(ctdb, h);
4236         h->ctdb     = ctdb;
4237         h->c        = NULL;
4238         h->status   = -1;
4239         
4240         if (pipe(h->fd) == -1) {
4241                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4242                 talloc_free(h);
4243                 return -1;
4244         }
4245
4246         h->child = ctdb_fork(ctdb);
4247         if (h->child == (pid_t)-1) {
4248                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4249                 close(h->fd[0]);
4250                 close(h->fd[1]);
4251                 talloc_free(h);
4252                 return -1;
4253         }
4254
4255         /* child process */
4256         if (h->child == 0) {
4257                 signed char res = 0;
4258
4259                 close(h->fd[0]);
4260                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4261
4262                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4263                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4264                         res = -1;
4265                 } else {
4266                         res = ctdb_reloadips_child(ctdb);
4267                         if (res != 0) {
4268                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4269                         }
4270                 }
4271
4272                 write(h->fd[1], &res, 1);
4273                 /* make sure we die when our parent dies */
4274                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4275                         sleep(5);
4276                 }
4277                 _exit(0);
4278         }
4279
4280         h->c             = talloc_steal(h, c);
4281
4282         close(h->fd[1]);
4283         set_close_on_exec(h->fd[0]);
4284
4285         talloc_set_destructor(h, ctdb_reloadips_destructor);
4286
4287
4288         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4289                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4290                         (void *)h);
4291         tevent_fd_set_auto_close(h->fde);
4292
4293         event_add_timed(ctdb->ev, h,
4294                         timeval_current_ofs(120, 0),
4295                         ctdb_reloadips_timeout_event, h);
4296
4297         /* we reply later */
4298         *async_reply = true;
4299         return 0;
4300 }