f263fce296c7b0970a29829481802d6286bcf064
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/* Timeout for takeover operations, built from the takeover_timeout
 * tunable via timeval_current_ofs().  The expansion references a variable
 * named "ctdb", which must be in scope wherever the macro is used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument to timeval_current_ofs() when the gratuitous ARP timer
 * is re-armed (presumably seconds - confirm against timeval_current_ofs). */
#define CTDB_ARP_INTERVAL 1
/* Number of runs of the ARP/tickle timer handler before it frees its state. */
#define CTDB_ARP_REPEAT   3
49
/*
 * Per-node flags consulted by the IP allocation algorithms.
 */
struct ctdb_ipflags {
        /* presumably: node must not take over further IPs - confirm with users */
        bool noiptakeover;
        /* presumably: node must not host any IP at all - confirm with users */
        bool noiphost;
};
55
/*
 * Selects which IP allocation algorithm is used.  The numeric values are
 * spelled out; they match what the compiler assigns by default.
 */
enum ipalloc_algorithm {
        IPALLOC_DETERMINISTIC    = 0,
        IPALLOC_NONDETERMINISTIC = 1,
        IPALLOC_LCP2             = 2,
};
61
62 struct ipalloc_state {
63         uint32_t num;
64
65         /* Arrays with data for each node */
66         struct ctdb_public_ip_list_old **known_public_ips;
67         struct ctdb_public_ip_list_old **available_public_ips;
68
69         enum ipalloc_algorithm algorithm;
70         uint32_t no_ip_failback;
71 };
72
/*
 * One network interface known to this daemon.  Entries are chained into
 * the ctdb->ifaces doubly linked list.
 */
struct ctdb_interface {
        /* list linkage (DLIST_ADD / DLIST_REMOVE) */
        struct ctdb_interface *prev;
        struct ctdb_interface *next;

        /* interface name, used as the lookup key */
        const char *name;
        /* set to true when the entry is created */
        bool link_up;
        /* number of VNNs currently assigned to this interface */
        uint32_t references;
};
79
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
81 {
82         if (vnn->iface) {
83                 return vnn->iface->name;
84         }
85
86         return "__none__";
87 }
88
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
90 {
91         struct ctdb_interface *i;
92
93         /* Verify that we don't have an entry for this ip yet */
94         for (i=ctdb->ifaces;i;i=i->next) {
95                 if (strcmp(i->name, iface) == 0) {
96                         return 0;
97                 }
98         }
99
100         /* create a new structure for this interface */
101         i = talloc_zero(ctdb, struct ctdb_interface);
102         CTDB_NO_MEMORY_FATAL(ctdb, i);
103         i->name = talloc_strdup(i, iface);
104         CTDB_NO_MEMORY(ctdb, i->name);
105
106         i->link_up = true;
107
108         DLIST_ADD(ctdb->ifaces, i);
109
110         return 0;
111 }
112
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
114                                         const char *name)
115 {
116         int n;
117
118         for (n = 0; vnn->ifaces[n] != NULL; n++) {
119                 if (strcmp(name, vnn->ifaces[n]) == 0) {
120                         return true;
121                 }
122         }
123
124         return false;
125 }
126
127 /* If any interfaces now have no possible IPs then delete them.  This
128  * implementation is naive (i.e. simple) rather than clever
129  * (i.e. complex).  Given that this is run on delip and that operation
130  * is rare, this doesn't need to be efficient - it needs to be
131  * foolproof.  One alternative is reference counting, where the logic
132  * is distributed and can, therefore, be broken in multiple places.
133  * Another alternative is to build a red-black tree of interfaces that
134  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135  * once) and then walking ctdb->ifaces once and deleting those not in
136  * the tree.  Let's go to one of those if the naive implementation
137  * causes problems...  :-)
138  */
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140                                         struct ctdb_vnn *vnn)
141 {
142         struct ctdb_interface *i, *next;
143
144         /* For each interface, check if there's an IP using it. */
145         for (i = ctdb->ifaces; i != NULL; i = next) {
146                 struct ctdb_vnn *tv;
147                 bool found;
148                 next = i->next;
149
150                 /* Only consider interfaces named in the given VNN. */
151                 if (!vnn_has_interface_with_name(vnn, i->name)) {
152                         continue;
153                 }
154
155                 /* Is the "single IP" on this interface? */
156                 if ((ctdb->single_ip_vnn != NULL) &&
157                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159                         /* Found, next interface please... */
160                         continue;
161                 }
162                 /* Search for a vnn with this interface. */
163                 found = false;
164                 for (tv=ctdb->vnn; tv; tv=tv->next) {
165                         if (vnn_has_interface_with_name(tv, i->name)) {
166                                 found = true;
167                                 break;
168                         }
169                 }
170
171                 if (!found) {
172                         /* None of the VNNs are using this interface. */
173                         DLIST_REMOVE(ctdb->ifaces, i);
174                         talloc_free(i);
175                 }
176         }
177 }
178
179
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
181                                               const char *iface)
182 {
183         struct ctdb_interface *i;
184
185         for (i=ctdb->ifaces;i;i=i->next) {
186                 if (strcmp(i->name, iface) == 0) {
187                         return i;
188                 }
189         }
190
191         return NULL;
192 }
193
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195                                                   struct ctdb_vnn *vnn)
196 {
197         int i;
198         struct ctdb_interface *cur = NULL;
199         struct ctdb_interface *best = NULL;
200
201         for (i=0; vnn->ifaces[i]; i++) {
202
203                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
204                 if (cur == NULL) {
205                         continue;
206                 }
207
208                 if (!cur->link_up) {
209                         continue;
210                 }
211
212                 if (best == NULL) {
213                         best = cur;
214                         continue;
215                 }
216
217                 if (cur->references < best->references) {
218                         best = cur;
219                         continue;
220                 }
221         }
222
223         return best;
224 }
225
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227                                      struct ctdb_vnn *vnn)
228 {
229         struct ctdb_interface *best = NULL;
230
231         if (vnn->iface) {
232                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                                    "still assigned to iface '%s'\n",
234                                    ctdb_addr_to_str(&vnn->public_address),
235                                    ctdb_vnn_iface_string(vnn)));
236                 return 0;
237         }
238
239         best = ctdb_vnn_best_iface(ctdb, vnn);
240         if (best == NULL) {
241                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242                                   "cannot assign to iface any iface\n",
243                                   ctdb_addr_to_str(&vnn->public_address)));
244                 return -1;
245         }
246
247         vnn->iface = best;
248         best->references++;
249         vnn->pnn = ctdb->pnn;
250
251         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252                            "now assigned to iface '%s' refs[%d]\n",
253                            ctdb_addr_to_str(&vnn->public_address),
254                            ctdb_vnn_iface_string(vnn),
255                            best->references));
256         return 0;
257 }
258
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260                                     struct ctdb_vnn *vnn)
261 {
262         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263                            "now unassigned (old iface '%s' refs[%d])\n",
264                            ctdb_addr_to_str(&vnn->public_address),
265                            ctdb_vnn_iface_string(vnn),
266                            vnn->iface?vnn->iface->references:0));
267         if (vnn->iface) {
268                 vnn->iface->references--;
269         }
270         vnn->iface = NULL;
271         if (vnn->pnn == ctdb->pnn) {
272                 vnn->pnn = -1;
273         }
274 }
275
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277                                struct ctdb_vnn *vnn)
278 {
279         int i;
280
281         /* Nodes that are not RUNNING can not host IPs */
282         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
283                 return false;
284         }
285
286         if (vnn->delete_pending) {
287                 return false;
288         }
289
290         if (vnn->iface && vnn->iface->link_up) {
291                 return true;
292         }
293
294         for (i=0; vnn->ifaces[i]; i++) {
295                 struct ctdb_interface *cur;
296
297                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
298                 if (cur == NULL) {
299                         continue;
300                 }
301
302                 if (cur->link_up) {
303                         return true;
304                 }
305         }
306
307         return false;
308 }
309
310 struct ctdb_takeover_arp {
311         struct ctdb_context *ctdb;
312         uint32_t count;
313         ctdb_sock_addr addr;
314         struct ctdb_tcp_array *tcparray;
315         struct ctdb_vnn *vnn;
316 };
317
318
319 /*
320   lists of tcp endpoints
321  */
322 struct ctdb_tcp_list {
323         struct ctdb_tcp_list *prev, *next;
324         struct ctdb_connection connection;
325 };
326
327 /*
328   list of clients to kill on IP release
329  */
330 struct ctdb_client_ip {
331         struct ctdb_client_ip *prev, *next;
332         struct ctdb_context *ctdb;
333         ctdb_sock_addr addr;
334         uint32_t client_id;
335 };
336
337
338 /*
339   send a gratuitous arp
340  */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342                                   struct tevent_timer *te,
343                                   struct timeval t, void *private_data)
344 {
345         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
346                                                         struct ctdb_takeover_arp);
347         int i, ret;
348         struct ctdb_tcp_array *tcparray;
349         const char *iface = ctdb_vnn_iface_string(arp->vnn);
350
351         ret = ctdb_sys_send_arp(&arp->addr, iface);
352         if (ret != 0) {
353                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354                                   iface, strerror(errno)));
355         }
356
357         tcparray = arp->tcparray;
358         if (tcparray) {
359                 for (i=0;i<tcparray->num;i++) {
360                         struct ctdb_connection *tcon;
361
362                         tcon = &tcparray->connections[i];
363                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
365                                 ctdb_addr_to_str(&tcon->src),
366                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
367                         ret = ctdb_sys_send_tcp(
368                                 &tcon->src,
369                                 &tcon->dst,
370                                 0, 0, 0);
371                         if (ret != 0) {
372                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373                                         ctdb_addr_to_str(&tcon->src)));
374                         }
375                 }
376         }
377
378         arp->count++;
379
380         if (arp->count == CTDB_ARP_REPEAT) {
381                 talloc_free(arp);
382                 return;
383         }
384
385         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387                          ctdb_control_send_arp, arp);
388 }
389
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391                                        struct ctdb_vnn *vnn)
392 {
393         struct ctdb_takeover_arp *arp;
394         struct ctdb_tcp_array *tcparray;
395
396         if (!vnn->takeover_ctx) {
397                 vnn->takeover_ctx = talloc_new(vnn);
398                 if (!vnn->takeover_ctx) {
399                         return -1;
400                 }
401         }
402
403         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
404         if (!arp) {
405                 return -1;
406         }
407
408         arp->ctdb = ctdb;
409         arp->addr = vnn->public_address;
410         arp->vnn  = vnn;
411
412         tcparray = vnn->tcp_array;
413         if (tcparray) {
414                 /* add all of the known tcp connections for this IP to the
415                    list of tcp connections to send tickle acks for */
416                 arp->tcparray = talloc_steal(arp, tcparray);
417
418                 vnn->tcp_array = NULL;
419                 vnn->tcp_update_needed = true;
420         }
421
422         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423                          timeval_zero(), ctdb_control_send_arp, arp);
424
425         return 0;
426 }
427
428 struct takeover_callback_state {
429         struct ctdb_req_control_old *c;
430         ctdb_sock_addr *addr;
431         struct ctdb_vnn *vnn;
432 };
433
/*
 * State for an in-progress "takeip" event script run.
 */
struct ctdb_do_takeip_state {
        /* control request answered by ctdb_do_takeip_callback() */
        struct ctdb_req_control_old *c;
        /* VNN being taken over; its update_in_flight flag is cleared
         * by the state's destructor */
        struct ctdb_vnn *vnn;
};
438
439 /*
440   called when takeip event finishes
441  */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
443                                     void *private_data)
444 {
445         struct ctdb_do_takeip_state *state =
446                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
447         int32_t ret;
448         TDB_DATA data;
449
450         if (status != 0) {
451                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
452         
453                 if (status == -ETIME) {
454                         ctdb_ban_self(ctdb);
455                 }
456                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457                                  ctdb_addr_to_str(&state->vnn->public_address),
458                                  ctdb_vnn_iface_string(state->vnn)));
459                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
460
461                 node->flags |= NODE_FLAGS_UNHEALTHY;
462                 talloc_free(state);
463                 return;
464         }
465
466         if (ctdb->do_checkpublicip) {
467
468         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469         if (ret != 0) {
470                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471                 talloc_free(state);
472                 return;
473         }
474
475         }
476
477         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478         data.dsize = strlen((char *)data.dptr) + 1;
479         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
480
481         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
482
483
484         /* the control succeeded */
485         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
486         talloc_free(state);
487         return;
488 }
489
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
491 {
492         state->vnn->update_in_flight = false;
493         return 0;
494 }
495
496 /*
497   take over an ip address
498  */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500                               struct ctdb_req_control_old *c,
501                               struct ctdb_vnn *vnn)
502 {
503         int ret;
504         struct ctdb_do_takeip_state *state;
505
506         if (vnn->update_in_flight) {
507                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508                                     "update for this IP already in flight\n",
509                                     ctdb_addr_to_str(&vnn->public_address),
510                                     vnn->public_netmask_bits));
511                 return -1;
512         }
513
514         ret = ctdb_vnn_assign_iface(ctdb, vnn);
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517                                  "assign a usable interface\n",
518                                  ctdb_addr_to_str(&vnn->public_address),
519                                  vnn->public_netmask_bits));
520                 return -1;
521         }
522
523         state = talloc(vnn, struct ctdb_do_takeip_state);
524         CTDB_NO_MEMORY(ctdb, state);
525
526         state->c = talloc_steal(ctdb, c);
527         state->vnn   = vnn;
528
529         vnn->update_in_flight = true;
530         talloc_set_destructor(state, ctdb_takeip_destructor);
531
532         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533                             ctdb_addr_to_str(&vnn->public_address),
534                             vnn->public_netmask_bits,
535                             ctdb_vnn_iface_string(vnn)));
536
537         ret = ctdb_event_script_callback(ctdb,
538                                          state,
539                                          ctdb_do_takeip_callback,
540                                          state,
541                                          CTDB_EVENT_TAKE_IP,
542                                          "%s %s %u",
543                                          ctdb_vnn_iface_string(vnn),
544                                          ctdb_addr_to_str(&vnn->public_address),
545                                          vnn->public_netmask_bits);
546
547         if (ret != 0) {
548                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549                         ctdb_addr_to_str(&vnn->public_address),
550                         ctdb_vnn_iface_string(vnn)));
551                 talloc_free(state);
552                 return -1;
553         }
554
555         return 0;
556 }
557
/*
 * State for an in-progress "updateip" event script run, i.e. moving a
 * public address from one interface to another.
 */
struct ctdb_do_updateip_state {
        /* control request answered by ctdb_do_updateip_callback() */
        struct ctdb_req_control_old *c;
        /* interface the address is being moved away from; restored by
         * the callback if the move fails */
        struct ctdb_interface *old;
        /* VNN being moved */
        struct ctdb_vnn *vnn;
};
563
564 /*
565   called when updateip event finishes
566  */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
568                                       void *private_data)
569 {
570         struct ctdb_do_updateip_state *state =
571                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
572         int32_t ret;
573
574         if (status != 0) {
575                 if (status == -ETIME) {
576                         ctdb_ban_self(ctdb);
577                 }
578                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579                         ctdb_addr_to_str(&state->vnn->public_address),
580                         state->old->name,
581                         ctdb_vnn_iface_string(state->vnn)));
582
583                 /*
584                  * All we can do is reset the old interface
585                  * and let the next run fix it
586                  */
587                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588                 state->vnn->iface = state->old;
589                 state->vnn->iface->references++;
590
591                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
592                 talloc_free(state);
593                 return;
594         }
595
596         if (ctdb->do_checkpublicip) {
597
598         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
599         if (ret != 0) {
600                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
601                 talloc_free(state);
602                 return;
603         }
604
605         }
606
607         /* the control succeeded */
608         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
609         talloc_free(state);
610         return;
611 }
612
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
614 {
615         state->vnn->update_in_flight = false;
616         return 0;
617 }
618
619 /*
620   update (move) an ip address
621  */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623                                 struct ctdb_req_control_old *c,
624                                 struct ctdb_vnn *vnn)
625 {
626         int ret;
627         struct ctdb_do_updateip_state *state;
628         struct ctdb_interface *old = vnn->iface;
629         const char *new_name;
630
631         if (vnn->update_in_flight) {
632                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633                                     "update for this IP already in flight\n",
634                                     ctdb_addr_to_str(&vnn->public_address),
635                                     vnn->public_netmask_bits));
636                 return -1;
637         }
638
639         ctdb_vnn_unassign_iface(ctdb, vnn);
640         ret = ctdb_vnn_assign_iface(ctdb, vnn);
641         if (ret != 0) {
642                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643                                  "assin a usable interface (old iface '%s')\n",
644                                  ctdb_addr_to_str(&vnn->public_address),
645                                  vnn->public_netmask_bits,
646                                  old->name));
647                 return -1;
648         }
649
650         new_name = ctdb_vnn_iface_string(vnn);
651         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652                 /* A benign update from one interface onto itself.
653                  * no need to run the eventscripts in this case, just return
654                  * success.
655                  */
656                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
657                 return 0;
658         }
659
660         state = talloc(vnn, struct ctdb_do_updateip_state);
661         CTDB_NO_MEMORY(ctdb, state);
662
663         state->c = talloc_steal(ctdb, c);
664         state->old = old;
665         state->vnn = vnn;
666
667         vnn->update_in_flight = true;
668         talloc_set_destructor(state, ctdb_updateip_destructor);
669
670         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671                             "interface %s to %s\n",
672                             ctdb_addr_to_str(&vnn->public_address),
673                             vnn->public_netmask_bits,
674                             old->name,
675                             new_name));
676
677         ret = ctdb_event_script_callback(ctdb,
678                                          state,
679                                          ctdb_do_updateip_callback,
680                                          state,
681                                          CTDB_EVENT_UPDATE_IP,
682                                          "%s %s %s %u",
683                                          state->old->name,
684                                          new_name,
685                                          ctdb_addr_to_str(&vnn->public_address),
686                                          vnn->public_netmask_bits);
687         if (ret != 0) {
688                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689                                  ctdb_addr_to_str(&vnn->public_address),
690                                  old->name, new_name));
691                 talloc_free(state);
692                 return -1;
693         }
694
695         return 0;
696 }
697
698 /*
699   Find the vnn of the node that has a public ip address
700   returns -1 if the address is not known as a public address
701  */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
703 {
704         struct ctdb_vnn *vnn;
705
706         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707                 if (ctdb_same_ip(&vnn->public_address, addr)) {
708                         return vnn;
709                 }
710         }
711
712         return NULL;
713 }
714
715 /*
716   take over an ip address
717  */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719                                  struct ctdb_req_control_old *c,
720                                  TDB_DATA indata,
721                                  bool *async_reply)
722 {
723         int ret;
724         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725         struct ctdb_vnn *vnn;
726         bool have_ip = false;
727         bool do_updateip = false;
728         bool do_takeip = false;
729         struct ctdb_interface *best_iface = NULL;
730
731         if (pip->pnn != ctdb->pnn) {
732                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733                                  "with pnn %d, but we're node %d\n",
734                                  ctdb_addr_to_str(&pip->addr),
735                                  pip->pnn, ctdb->pnn));
736                 return -1;
737         }
738
739         /* update out vnn list */
740         vnn = find_public_ip_vnn(ctdb, &pip->addr);
741         if (vnn == NULL) {
742                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743                         ctdb_addr_to_str(&pip->addr)));
744                 return 0;
745         }
746
747         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748                 have_ip = ctdb_sys_have_ip(&pip->addr);
749         }
750         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751         if (best_iface == NULL) {
752                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753                                  "a usable interface (old %s, have_ip %d)\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  vnn->public_netmask_bits,
756                                  ctdb_vnn_iface_string(vnn),
757                                  have_ip));
758                 return -1;
759         }
760
761         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
763                 have_ip = false;
764         }
765
766
767         if (vnn->iface == NULL && have_ip) {
768                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770                                  ctdb_addr_to_str(&vnn->public_address)));
771                 return 0;
772         }
773
774         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776                                   "and we have it on iface[%s], but it was assigned to node %d"
777                                   "and we are node %d, banning ourself\n",
778                                  ctdb_addr_to_str(&vnn->public_address),
779                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
780                 ctdb_ban_self(ctdb);
781                 return -1;
782         }
783
784         if (vnn->pnn == -1 && have_ip) {
785                 vnn->pnn = ctdb->pnn;
786                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787                                   "and we already have it on iface[%s], update local daemon\n",
788                                  ctdb_addr_to_str(&vnn->public_address),
789                                   ctdb_vnn_iface_string(vnn)));
790                 return 0;
791         }
792
793         if (vnn->iface) {
794                 if (vnn->iface != best_iface) {
795                         if (!vnn->iface->link_up) {
796                                 do_updateip = true;
797                         } else if (vnn->iface->references > (best_iface->references + 1)) {
798                                 /* only move when the rebalance gains something */
799                                         do_updateip = true;
800                         }
801                 }
802         }
803
804         if (!have_ip) {
805                 if (do_updateip) {
806                         ctdb_vnn_unassign_iface(ctdb, vnn);
807                         do_updateip = false;
808                 }
809                 do_takeip = true;
810         }
811
812         if (do_takeip) {
813                 ret = ctdb_do_takeip(ctdb, c, vnn);
814                 if (ret != 0) {
815                         return -1;
816                 }
817         } else if (do_updateip) {
818                 ret = ctdb_do_updateip(ctdb, c, vnn);
819                 if (ret != 0) {
820                         return -1;
821                 }
822         } else {
823                 /*
824                  * The interface is up and the kernel known the ip
825                  * => do nothing
826                  */
827                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828                         ctdb_addr_to_str(&pip->addr),
829                         vnn->public_netmask_bits,
830                         ctdb_vnn_iface_string(vnn)));
831                 return 0;
832         }
833
834         /* tell ctdb_control.c that we will be replying asynchronously */
835         *async_reply = true;
836
837         return 0;
838 }
839
840 /*
841   kill any clients that are registered with a IP that is being released
842  */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
844 {
845         struct ctdb_client_ip *ip;
846
847         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848                 ctdb_addr_to_str(addr)));
849
850         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851                 ctdb_sock_addr tmp_addr;
852
853                 tmp_addr = ip->addr;
854                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
855                         ip->client_id,
856                         ctdb_addr_to_str(&ip->addr)));
857
858                 if (ctdb_same_ip(&tmp_addr, addr)) {
859                         struct ctdb_client *client = reqid_find(ctdb->idr,
860                                                                 ip->client_id,
861                                                                 struct ctdb_client);
862                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
863                                 ip->client_id,
864                                 ctdb_addr_to_str(&ip->addr),
865                                 client->pid));
866
867                         if (client->pid != 0) {
868                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869                                         (unsigned)client->pid,
870                                         ctdb_addr_to_str(addr),
871                                         ip->client_id));
872                                 kill(client->pid, SIGKILL);
873                         }
874                 }
875         }
876 }
877
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
879 {
880         DLIST_REMOVE(ctdb->vnn, vnn);
881         ctdb_vnn_unassign_iface(ctdb, vnn);
882         ctdb_remove_orphaned_ifaces(ctdb, vnn);
883         talloc_free(vnn);
884 }
885
886 /*
887   called when releaseip event finishes
888  */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
890                                 void *private_data)
891 {
892         struct takeover_callback_state *state = 
893                 talloc_get_type(private_data, struct takeover_callback_state);
894         TDB_DATA data;
895
896         if (status == -ETIME) {
897                 ctdb_ban_self(ctdb);
898         }
899
900         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901                 if  (ctdb_sys_have_ip(state->addr)) {
902                         DEBUG(DEBUG_ERR,
903                               ("IP %s still hosted during release IP callback, failing\n",
904                                ctdb_addr_to_str(state->addr)));
905                         ctdb_request_control_reply(ctdb, state->c,
906                                                    NULL, -1, NULL);
907                         talloc_free(state);
908                         return;
909                 }
910         }
911
912         /* send a message to all clients of this node telling them
913            that the cluster has been reconfigured and they should
914            release any sockets on this IP */
915         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917         data.dsize = strlen((char *)data.dptr)+1;
918
919         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
920
921         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
922
923         /* kill clients that have registered with this IP */
924         release_kill_clients(ctdb, state->addr);
925
926         ctdb_vnn_unassign_iface(ctdb, state->vnn);
927
928         /* Process the IP if it has been marked for deletion */
929         if (state->vnn->delete_pending) {
930                 do_delete_ip(ctdb, state->vnn);
931                 state->vnn = NULL;
932         }
933
934         /* the control succeeded */
935         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
936         talloc_free(state);
937 }
938
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
940 {
941         if (state->vnn != NULL) {
942                 state->vnn->update_in_flight = false;
943         }
944         return 0;
945 }
946
947 /*
948   release an ip address
949  */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
951                                 struct ctdb_req_control_old *c,
952                                 TDB_DATA indata, 
953                                 bool *async_reply)
954 {
955         int ret;
956         struct takeover_callback_state *state;
957         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958         struct ctdb_vnn *vnn;
959         char *iface;
960
961         /* update our vnn list */
962         vnn = find_public_ip_vnn(ctdb, &pip->addr);
963         if (vnn == NULL) {
964                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965                         ctdb_addr_to_str(&pip->addr)));
966                 return 0;
967         }
968         vnn->pnn = pip->pnn;
969
970         /* stop any previous arps */
971         talloc_free(vnn->takeover_ctx);
972         vnn->takeover_ctx = NULL;
973
974         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975          * lazy multicast to drop an IP from any node that isn't the
976          * intended new node.  The following causes makes ctdbd ignore
977          * a release for any address it doesn't host.
978          */
979         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980                 if (!ctdb_sys_have_ip(&pip->addr)) {
981                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982                                 ctdb_addr_to_str(&pip->addr),
983                                 vnn->public_netmask_bits,
984                                 ctdb_vnn_iface_string(vnn)));
985                         ctdb_vnn_unassign_iface(ctdb, vnn);
986                         return 0;
987                 }
988         } else {
989                 if (vnn->iface == NULL) {
990                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991                                            ctdb_addr_to_str(&pip->addr),
992                                            vnn->public_netmask_bits));
993                         return 0;
994                 }
995         }
996
997         /* There is a potential race between take_ip and us because we
998          * update the VNN via a callback that run when the
999          * eventscripts have been run.  Avoid the race by allowing one
1000          * update to be in flight at a time.
1001          */
1002         if (vnn->update_in_flight) {
1003                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004                                     "update for this IP already in flight\n",
1005                                     ctdb_addr_to_str(&vnn->public_address),
1006                                     vnn->public_netmask_bits));
1007                 return -1;
1008         }
1009
1010         iface = strdup(ctdb_vnn_iface_string(vnn));
1011
1012         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1013                 ctdb_addr_to_str(&pip->addr),
1014                 vnn->public_netmask_bits,
1015                 iface,
1016                 pip->pnn));
1017
1018         state = talloc(ctdb, struct takeover_callback_state);
1019         if (state == NULL) {
1020                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021                                __FILE__, __LINE__);
1022                 free(iface);
1023                 return -1;
1024         }
1025
1026         state->c = talloc_steal(state, c);
1027         state->addr = talloc(state, ctdb_sock_addr);       
1028         if (state->addr == NULL) {
1029                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030                                __FILE__, __LINE__);
1031                 free(iface);
1032                 talloc_free(state);
1033                 return -1;
1034         }
1035         *state->addr = pip->addr;
1036         state->vnn   = vnn;
1037
1038         vnn->update_in_flight = true;
1039         talloc_set_destructor(state, ctdb_releaseip_destructor);
1040
1041         ret = ctdb_event_script_callback(ctdb, 
1042                                          state, release_ip_callback, state,
1043                                          CTDB_EVENT_RELEASE_IP,
1044                                          "%s %s %u",
1045                                          iface,
1046                                          ctdb_addr_to_str(&pip->addr),
1047                                          vnn->public_netmask_bits);
1048         free(iface);
1049         if (ret != 0) {
1050                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051                         ctdb_addr_to_str(&pip->addr),
1052                         ctdb_vnn_iface_string(vnn)));
1053                 talloc_free(state);
1054                 return -1;
1055         }
1056
1057         /* tell the control that we will be reply asynchronously */
1058         *async_reply = true;
1059         return 0;
1060 }
1061
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063                                    ctdb_sock_addr *addr,
1064                                    unsigned mask, const char *ifaces,
1065                                    bool check_address)
1066 {
1067         struct ctdb_vnn      *vnn;
1068         uint32_t num = 0;
1069         char *tmp;
1070         const char *iface;
1071         int i;
1072         int ret;
1073
1074         tmp = strdup(ifaces);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 if (!ctdb_sys_check_iface_exists(iface)) {
1077                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1078                         free(tmp);
1079                         return -1;
1080                 }
1081         }
1082         free(tmp);
1083
1084         /* Verify that we don't have an entry for this ip yet */
1085         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1088                                 ctdb_addr_to_str(addr)));
1089                         return -1;
1090                 }               
1091         }
1092
1093         /* create a new vnn structure for this ip address */
1094         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097         tmp = talloc_strdup(vnn, ifaces);
1098         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1104                 num++;
1105         }
1106         talloc_free(tmp);
1107         vnn->ifaces[num] = NULL;
1108         vnn->public_address      = *addr;
1109         vnn->public_netmask_bits = mask;
1110         vnn->pnn                 = -1;
1111         if (check_address) {
1112                 if (ctdb_sys_have_ip(addr)) {
1113                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114                         vnn->pnn = ctdb->pnn;
1115                 }
1116         }
1117
1118         for (i=0; vnn->ifaces[i]; i++) {
1119                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1120                 if (ret != 0) {
1121                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122                                            "for public_address[%s]\n",
1123                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1124                         talloc_free(vnn);
1125                         return -1;
1126                 }
1127         }
1128
1129         DLIST_ADD(ctdb->vnn, vnn);
1130
1131         return 0;
1132 }
1133
1134 /*
1135   setup the public address lists from a file
1136 */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1138 {
1139         char **lines;
1140         int nlines;
1141         int i;
1142
1143         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144         if (lines == NULL) {
1145                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1146                 return -1;
1147         }
1148         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1149                 nlines--;
1150         }
1151
1152         for (i=0;i<nlines;i++) {
1153                 unsigned mask;
1154                 ctdb_sock_addr addr;
1155                 const char *addrstr;
1156                 const char *ifaces;
1157                 char *tok, *line;
1158
1159                 line = lines[i];
1160                 while ((*line == ' ') || (*line == '\t')) {
1161                         line++;
1162                 }
1163                 if (*line == '#') {
1164                         continue;
1165                 }
1166                 if (strcmp(line, "") == 0) {
1167                         continue;
1168                 }
1169                 tok = strtok(line, " \t");
1170                 addrstr = tok;
1171                 tok = strtok(NULL, " \t");
1172                 if (tok == NULL) {
1173                         if (NULL == ctdb->default_public_interface) {
1174                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1175                                          i+1));
1176                                 talloc_free(lines);
1177                                 return -1;
1178                         }
1179                         ifaces = ctdb->default_public_interface;
1180                 } else {
1181                         ifaces = tok;
1182                 }
1183
1184                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1186                         talloc_free(lines);
1187                         return -1;
1188                 }
1189                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1191                         talloc_free(lines);
1192                         return -1;
1193                 }
1194         }
1195
1196
1197         talloc_free(lines);
1198         return 0;
1199 }
1200
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1202                               const char *iface,
1203                               const char *ip)
1204 {
1205         struct ctdb_vnn *svnn;
1206         struct ctdb_interface *cur = NULL;
1207         bool ok;
1208         int ret;
1209
1210         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211         CTDB_NO_MEMORY(ctdb, svnn);
1212
1213         svnn->ifaces = talloc_array(svnn, const char *, 2);
1214         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217         svnn->ifaces[1] = NULL;
1218
1219         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1220         if (!ok) {
1221                 talloc_free(svnn);
1222                 return -1;
1223         }
1224
1225         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1226         if (ret != 0) {
1227                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228                                    "for single_ip[%s]\n",
1229                                    svnn->ifaces[0],
1230                                    ctdb_addr_to_str(&svnn->public_address)));
1231                 talloc_free(svnn);
1232                 return -1;
1233         }
1234
1235         /* assume the single public ip interface is initially "good" */
1236         cur = ctdb_find_iface(ctdb, iface);
1237         if (cur == NULL) {
1238                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1239                 return -1;
1240         }
1241         cur->link_up = true;
1242
1243         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1244         if (ret != 0) {
1245                 talloc_free(svnn);
1246                 return -1;
1247         }
1248
1249         ctdb->single_ip_vnn = svnn;
1250         return 0;
1251 }
1252
/* One public address as handled by the IP allocation code in this
 * file.  Entries are chained into a singly linked list via "next". */
struct public_ip_list {
	struct public_ip_list *next;	/* following entry, NULL ends the list */
	uint32_t pnn;			/* node the address is assigned to; this file stores -1 here for "unassigned" */
	ctdb_sock_addr addr;		/* the public address itself */
};
1258
1259 /* Given a physical node, return the number of
1260    public addresses that is currently assigned to this node.
1261 */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1263 {
1264         int num=0;
1265
1266         for (;ips;ips=ips->next) {
1267                 if (ips->pnn == pnn) {
1268                         num++;
1269                 }
1270         }
1271         return num;
1272 }
1273
1274
1275 /* Can the given node host the given IP: is the public IP known to the
1276  * node and is NOIPHOST unset?
1277 */
1278 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1279                              struct ctdb_ipflags ipflags,
1280                              struct public_ip_list *ip)
1281 {
1282         struct ctdb_public_ip_list_old *public_ips;
1283         int i;
1284
1285         if (ipflags.noiphost) {
1286                 return false;
1287         }
1288
1289         public_ips = ctdb->ipalloc_state->available_public_ips[pnn];
1290
1291         if (public_ips == NULL) {
1292                 return false;
1293         }
1294
1295         for (i=0; i<public_ips->num; i++) {
1296                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1297                         /* yes, this node can serve this public ip */
1298                         return true;
1299                 }
1300         }
1301
1302         return false;
1303 }
1304
1305 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1306                                  struct ctdb_ipflags ipflags,
1307                                  struct public_ip_list *ip)
1308 {
1309         if (ipflags.noiptakeover) {
1310                 return false;
1311         }
1312
1313         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1314 }
1315
1316 /* search the node lists list for a node to takeover this ip.
1317    pick the node that currently are serving the least number of ips
1318    so that the ips get spread out evenly.
1319 */
1320 static int find_takeover_node(struct ctdb_context *ctdb,
1321                               struct ctdb_ipflags *ipflags,
1322                               struct public_ip_list *ip,
1323                               struct public_ip_list *all_ips)
1324 {
1325         int pnn, min=0, num;
1326         int i, numnodes;
1327
1328         numnodes = talloc_array_length(ipflags);
1329         pnn    = -1;
1330         for (i=0; i<numnodes; i++) {
1331                 /* verify that this node can serve this ip */
1332                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1333                         /* no it couldnt   so skip to the next node */
1334                         continue;
1335                 }
1336
1337                 num = node_ip_coverage(i, all_ips);
1338                 /* was this the first node we checked ? */
1339                 if (pnn == -1) {
1340                         pnn = i;
1341                         min  = num;
1342                 } else {
1343                         if (num < min) {
1344                                 pnn = i;
1345                                 min  = num;
1346                         }
1347                 }
1348         }       
1349         if (pnn == -1) {
1350                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1351                         ctdb_addr_to_str(&ip->addr)));
1352
1353                 return -1;
1354         }
1355
1356         ip->pnn = pnn;
1357         return 0;
1358 }
1359
1360 #define IP_KEYLEN       4
1361 static uint32_t *ip_key(ctdb_sock_addr *ip)
1362 {
1363         static uint32_t key[IP_KEYLEN];
1364
1365         bzero(key, sizeof(key));
1366
1367         switch (ip->sa.sa_family) {
1368         case AF_INET:
1369                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1370                 break;
1371         case AF_INET6: {
1372                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1373                 key[0]  = htonl(s6_a32[0]);
1374                 key[1]  = htonl(s6_a32[1]);
1375                 key[2]  = htonl(s6_a32[2]);
1376                 key[3]  = htonl(s6_a32[3]);
1377                 break;
1378         }
1379         default:
1380                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1381                 return key;
1382         }
1383
1384         return key;
1385 }
1386
1387 static void *add_ip_callback(void *parm, void *data)
1388 {
1389         struct public_ip_list *this_ip = parm;
1390         struct public_ip_list *prev_ip = data;
1391
1392         if (prev_ip == NULL) {
1393                 return parm;
1394         }
1395         if (this_ip->pnn == -1) {
1396                 this_ip->pnn = prev_ip->pnn;
1397         }
1398
1399         return parm;
1400 }
1401
1402 static int getips_count_callback(void *param, void *data)
1403 {
1404         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1405         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1406
1407         new_ip->next = *ip_list;
1408         *ip_list     = new_ip;
1409         return 0;
1410 }
1411
1412 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1413                                        struct ctdb_public_ip_list_old *ips,
1414                                        uint32_t pnn);
1415
1416 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1417                                          struct ipalloc_state *ipalloc_state,
1418                                          struct ctdb_node_map_old *nodemap)
1419 {
1420         int j;
1421         int ret;
1422
1423         if (ipalloc_state->num != nodemap->num) {
1424                 DEBUG(DEBUG_ERR,
1425                       (__location__
1426                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1427                        ipalloc_state->num, nodemap->num));
1428                 return -1;
1429         }
1430
1431         for (j=0; j<nodemap->num; j++) {
1432                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1433                         continue;
1434                 }
1435
1436                 /* Retrieve the list of known public IPs from the node */
1437                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1438                                         TAKEOVER_TIMEOUT(),
1439                                         j,
1440                                         ctdb->nodes,
1441                                         0,
1442                                         &ipalloc_state->known_public_ips[j]);
1443                 if (ret != 0) {
1444                         DEBUG(DEBUG_ERR,
1445                               ("Failed to read known public IPs from node: %u\n",
1446                                j));
1447                         return -1;
1448                 }
1449
1450                 if (ctdb->do_checkpublicip) {
1451                         verify_remote_ip_allocation(ctdb,
1452                                                     ipalloc_state->known_public_ips[j],
1453                                                     j);
1454                 }
1455
1456                 /* Retrieve the list of available public IPs from the node */
1457                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1458                                         TAKEOVER_TIMEOUT(),
1459                                         j,
1460                                         ctdb->nodes,
1461                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1462                                         &ipalloc_state->available_public_ips[j]);
1463                 if (ret != 0) {
1464                         DEBUG(DEBUG_ERR,
1465                               ("Failed to read available public IPs from node: %u\n",
1466                                j));
1467                         return -1;
1468                 }
1469         }
1470
1471         return 0;
1472 }
1473
1474 static struct public_ip_list *
1475 create_merged_ip_list(struct ctdb_context *ctdb)
1476 {
1477         int i, j;
1478         struct public_ip_list *ip_list;
1479         struct ctdb_public_ip_list_old *public_ips;
1480
1481         if (ctdb->ip_tree != NULL) {
1482                 talloc_free(ctdb->ip_tree);
1483                 ctdb->ip_tree = NULL;
1484         }
1485         ctdb->ip_tree = trbt_create(ctdb, 0);
1486
1487         for (i=0;i<ctdb->num_nodes;i++) {
1488                 public_ips = ctdb->ipalloc_state->known_public_ips[i];
1489
1490                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1491                         continue;
1492                 }
1493
1494                 /* there were no public ips for this node */
1495                 if (public_ips == NULL) {
1496                         continue;
1497                 }               
1498
1499                 for (j=0;j<public_ips->num;j++) {
1500                         struct public_ip_list *tmp_ip;
1501
1502                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1503                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1504                         /* Do not use information about IP addresses hosted
1505                          * on other nodes, it may not be accurate */
1506                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1507                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1508                         } else {
1509                                 tmp_ip->pnn = -1;
1510                         }
1511                         tmp_ip->addr = public_ips->ips[j].addr;
1512                         tmp_ip->next = NULL;
1513
1514                         trbt_insertarray32_callback(ctdb->ip_tree,
1515                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1516                                 add_ip_callback,
1517                                 tmp_ip);
1518                 }
1519         }
1520
1521         ip_list = NULL;
1522         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1523
1524         return ip_list;
1525 }
1526
1527 /* 
1528  * This is the length of the longtest common prefix between the IPs.
1529  * It is calculated by XOR-ing the 2 IPs together and counting the
1530  * number of leading zeroes.  The implementation means that all
1531  * addresses end up being 128 bits long.
1532  *
1533  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1534  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1535  * lots of nodes and IP addresses?
1536  */
1537 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1538 {
1539         uint32_t ip1_k[IP_KEYLEN];
1540         uint32_t *t;
1541         int i;
1542         uint32_t x;
1543
1544         uint32_t distance = 0;
1545
1546         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1547         t = ip_key(ip2);
1548         for (i=0; i<IP_KEYLEN; i++) {
1549                 x = ip1_k[i] ^ t[i];
1550                 if (x == 0) {
1551                         distance += 32;
1552                 } else {
1553                         /* Count number of leading zeroes. 
1554                          * FIXME? This could be optimised...
1555                          */
1556                         while ((x & (1 << 31)) == 0) {
1557                                 x <<= 1;
1558                                 distance += 1;
1559                         }
1560                 }
1561         }
1562
1563         return distance;
1564 }
1565
1566 /* Calculate the IP distance for the given IP relative to IPs on the
1567    given node.  The ips argument is generally the all_ips variable
1568    used in the main part of the algorithm.
1569  */
1570 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1571                                   struct public_ip_list *ips,
1572                                   int pnn)
1573 {
1574         struct public_ip_list *t;
1575         uint32_t d;
1576
1577         uint32_t sum = 0;
1578
1579         for (t=ips; t != NULL; t=t->next) {
1580                 if (t->pnn != pnn) {
1581                         continue;
1582                 }
1583
1584                 /* Optimisation: We never calculate the distance
1585                  * between an address and itself.  This allows us to
1586                  * calculate the effect of removing an address from a
1587                  * node by simply calculating the distance between
1588                  * that address and all of the exitsing addresses.
1589                  * Moreover, we assume that we're only ever dealing
1590                  * with addresses from all_ips so we can identify an
1591                  * address via a pointer rather than doing a more
1592                  * expensive address comparison. */
1593                 if (&(t->addr) == ip) {
1594                         continue;
1595                 }
1596
1597                 d = ip_distance(ip, &(t->addr));
1598                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1599         }
1600
1601         return sum;
1602 }
1603
1604 /* Return the LCP2 imbalance metric for addresses currently assigned
1605    to the given node.
1606  */
1607 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1608 {
1609         struct public_ip_list *t;
1610
1611         uint32_t imbalance = 0;
1612
1613         for (t=all_ips; t!=NULL; t=t->next) {
1614                 if (t->pnn != pnn) {
1615                         continue;
1616                 }
1617                 /* Pass the rest of the IPs rather than the whole
1618                    all_ips input list.
1619                 */
1620                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1621         }
1622
1623         return imbalance;
1624 }
1625
1626 /* Allocate any unassigned IPs just by looping through the IPs and
1627  * finding the best node for each.
1628  */
1629 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1630                                       struct ctdb_ipflags *ipflags,
1631                                       struct public_ip_list *all_ips)
1632 {
1633         struct public_ip_list *tmp_ip;
1634
1635         /* loop over all ip's and find a physical node to cover for 
1636            each unassigned ip.
1637         */
1638         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1639                 if (tmp_ip->pnn == -1) {
1640                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1641                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1642                                         ctdb_addr_to_str(&tmp_ip->addr)));
1643                         }
1644                 }
1645         }
1646 }
1647
1648 /* Basic non-deterministic rebalancing algorithm.
1649  */
1650 static void basic_failback(struct ctdb_context *ctdb,
1651                            struct ctdb_ipflags *ipflags,
1652                            struct public_ip_list *all_ips,
1653                            int num_ips)
1654 {
1655         int i, numnodes;
1656         int maxnode, maxnum, minnode, minnum, num, retries;
1657         struct public_ip_list *tmp_ip;
1658
1659         numnodes = talloc_array_length(ipflags);
1660         retries = 0;
1661
1662 try_again:
1663         maxnum=0;
1664         minnum=0;
1665
1666         /* for each ip address, loop over all nodes that can serve
1667            this ip and make sure that the difference between the node
1668            serving the most and the node serving the least ip's are
1669            not greater than 1.
1670         */
1671         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1672                 if (tmp_ip->pnn == -1) {
1673                         continue;
1674                 }
1675
1676                 /* Get the highest and lowest number of ips's served by any 
1677                    valid node which can serve this ip.
1678                 */
1679                 maxnode = -1;
1680                 minnode = -1;
1681                 for (i=0; i<numnodes; i++) {
1682                         /* only check nodes that can actually serve this ip */
1683                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1684                                 /* no it couldnt   so skip to the next node */
1685                                 continue;
1686                         }
1687
1688                         num = node_ip_coverage(i, all_ips);
1689                         if (maxnode == -1) {
1690                                 maxnode = i;
1691                                 maxnum  = num;
1692                         } else {
1693                                 if (num > maxnum) {
1694                                         maxnode = i;
1695                                         maxnum  = num;
1696                                 }
1697                         }
1698                         if (minnode == -1) {
1699                                 minnode = i;
1700                                 minnum  = num;
1701                         } else {
1702                                 if (num < minnum) {
1703                                         minnode = i;
1704                                         minnum  = num;
1705                                 }
1706                         }
1707                 }
1708                 if (maxnode == -1) {
1709                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1710                                 ctdb_addr_to_str(&tmp_ip->addr)));
1711
1712                         continue;
1713                 }
1714
1715                 /* if the spread between the smallest and largest coverage by
1716                    a node is >=2 we steal one of the ips from the node with
1717                    most coverage to even things out a bit.
1718                    try to do this a limited number of times since we dont
1719                    want to spend too much time balancing the ip coverage.
1720                 */
1721                 if ( (maxnum > minnum+1)
1722                      && (retries < (num_ips + 5)) ){
1723                         struct public_ip_list *tmp;
1724
1725                         /* Reassign one of maxnode's VNNs */
1726                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1727                                 if (tmp->pnn == maxnode) {
1728                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1729                                         retries++;
1730                                         goto try_again;;
1731                                 }
1732                         }
1733                 }
1734         }
1735 }
1736
1737 static void lcp2_init(struct ctdb_context *tmp_ctx,
1738                       struct ctdb_ipflags *ipflags,
1739                       struct public_ip_list *all_ips,
1740                       uint32_t *force_rebalance_nodes,
1741                       uint32_t **lcp2_imbalances,
1742                       bool **rebalance_candidates)
1743 {
1744         int i, numnodes;
1745         struct public_ip_list *tmp_ip;
1746
1747         numnodes = talloc_array_length(ipflags);
1748
1749         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1750         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1751         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1752         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1753
1754         for (i=0; i<numnodes; i++) {
1755                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1756                 /* First step: assume all nodes are candidates */
1757                 (*rebalance_candidates)[i] = true;
1758         }
1759
1760         /* 2nd step: if a node has IPs assigned then it must have been
1761          * healthy before, so we remove it from consideration.  This
1762          * is overkill but is all we have because we don't maintain
1763          * state between takeover runs.  An alternative would be to
1764          * keep state and invalidate it every time the recovery master
1765          * changes.
1766          */
1767         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1768                 if (tmp_ip->pnn != -1) {
1769                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1770                 }
1771         }
1772
1773         /* 3rd step: if a node is forced to re-balance then
1774            we allow failback onto the node */
1775         if (force_rebalance_nodes == NULL) {
1776                 return;
1777         }
1778         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1779                 uint32_t pnn = force_rebalance_nodes[i];
1780                 if (pnn >= numnodes) {
1781                         DEBUG(DEBUG_ERR,
1782                               (__location__ "unknown node %u\n", pnn));
1783                         continue;
1784                 }
1785
1786                 DEBUG(DEBUG_NOTICE,
1787                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1788                 (*rebalance_candidates)[pnn] = true;
1789         }
1790 }
1791
1792 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1793  * the IP/node combination that will cost the least.
1794  */
1795 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1796                                      struct ctdb_ipflags *ipflags,
1797                                      struct public_ip_list *all_ips,
1798                                      uint32_t *lcp2_imbalances)
1799 {
1800         struct public_ip_list *tmp_ip;
1801         int dstnode, numnodes;
1802
1803         int minnode;
1804         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1805         struct public_ip_list *minip;
1806
1807         bool should_loop = true;
1808         bool have_unassigned = true;
1809
1810         numnodes = talloc_array_length(ipflags);
1811
1812         while (have_unassigned && should_loop) {
1813                 should_loop = false;
1814
1815                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1816                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1817
1818                 minnode = -1;
1819                 mindsum = 0;
1820                 minip = NULL;
1821
1822                 /* loop over each unassigned ip. */
1823                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1824                         if (tmp_ip->pnn != -1) {
1825                                 continue;
1826                         }
1827
1828                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1829                                 /* only check nodes that can actually takeover this ip */
1830                                 if (!can_node_takeover_ip(ctdb, dstnode,
1831                                                           ipflags[dstnode],
1832                                                           tmp_ip)) {
1833                                         /* no it couldnt   so skip to the next node */
1834                                         continue;
1835                                 }
1836
1837                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1838                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1839                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1840                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1841                                                    dstnode,
1842                                                    dstimbl - lcp2_imbalances[dstnode]));
1843
1844
1845                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1846                                         minnode = dstnode;
1847                                         minimbl = dstimbl;
1848                                         mindsum = dstdsum;
1849                                         minip = tmp_ip;
1850                                         should_loop = true;
1851                                 }
1852                         }
1853                 }
1854
1855                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1856
1857                 /* If we found one then assign it to the given node. */
1858                 if (minnode != -1) {
1859                         minip->pnn = minnode;
1860                         lcp2_imbalances[minnode] = minimbl;
1861                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1862                                           ctdb_addr_to_str(&(minip->addr)),
1863                                           minnode,
1864                                           mindsum));
1865                 }
1866
1867                 /* There might be a better way but at least this is clear. */
1868                 have_unassigned = false;
1869                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1870                         if (tmp_ip->pnn == -1) {
1871                                 have_unassigned = true;
1872                         }
1873                 }
1874         }
1875
1876         /* We know if we have an unassigned addresses so we might as
1877          * well optimise.
1878          */
1879         if (have_unassigned) {
1880                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1881                         if (tmp_ip->pnn == -1) {
1882                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1883                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1884                         }
1885                 }
1886         }
1887 }
1888
1889 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1890  * to move IPs from, determines the best IP/destination node
1891  * combination to move from the source node.
1892  */
1893 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1894                                     struct ctdb_ipflags *ipflags,
1895                                     struct public_ip_list *all_ips,
1896                                     int srcnode,
1897                                     uint32_t *lcp2_imbalances,
1898                                     bool *rebalance_candidates)
1899 {
1900         int dstnode, mindstnode, numnodes;
1901         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1902         uint32_t minsrcimbl, mindstimbl;
1903         struct public_ip_list *minip;
1904         struct public_ip_list *tmp_ip;
1905
1906         /* Find an IP and destination node that best reduces imbalance. */
1907         srcimbl = 0;
1908         minip = NULL;
1909         minsrcimbl = 0;
1910         mindstnode = -1;
1911         mindstimbl = 0;
1912
1913         numnodes = talloc_array_length(ipflags);
1914
1915         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1916         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1917                            srcnode, lcp2_imbalances[srcnode]));
1918
1919         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1920                 /* Only consider addresses on srcnode. */
1921                 if (tmp_ip->pnn != srcnode) {
1922                         continue;
1923                 }
1924
1925                 /* What is this IP address costing the source node? */
1926                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1927                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1928
1929                 /* Consider this IP address would cost each potential
1930                  * destination node.  Destination nodes are limited to
1931                  * those that are newly healthy, since we don't want
1932                  * to do gratuitous failover of IPs just to make minor
1933                  * balance improvements.
1934                  */
1935                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1936                         if (!rebalance_candidates[dstnode]) {
1937                                 continue;
1938                         }
1939
1940                         /* only check nodes that can actually takeover this ip */
1941                         if (!can_node_takeover_ip(ctdb, dstnode,
1942                                                   ipflags[dstnode], tmp_ip)) {
1943                                 /* no it couldnt   so skip to the next node */
1944                                 continue;
1945                         }
1946
1947                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1948                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1949                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1950                                            srcnode, -srcdsum,
1951                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1952                                            dstnode, dstdsum));
1953
1954                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1955                             (dstdsum < srcdsum) &&                      \
1956                             ((mindstnode == -1) ||                              \
1957                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1958
1959                                 minip = tmp_ip;
1960                                 minsrcimbl = srcimbl;
1961                                 mindstnode = dstnode;
1962                                 mindstimbl = dstimbl;
1963                         }
1964                 }
1965         }
1966         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1967
1968         if (mindstnode != -1) {
1969                 /* We found a move that makes things better... */
1970                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1971                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1972                                   ctdb_addr_to_str(&(minip->addr)),
1973                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1974
1975
1976                 lcp2_imbalances[srcnode] = minsrcimbl;
1977                 lcp2_imbalances[mindstnode] = mindstimbl;
1978                 minip->pnn = mindstnode;
1979
1980                 return true;
1981         }
1982
1983         return false;
1984         
1985 }
1986
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.
 *
 * Orders elements by DESCENDING imbalance: returns -1 when a has the
 * larger imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1
	 * and can not overflow.
	 */
	return (lhs->imbalance < rhs->imbalance) -
	       (lhs->imbalance > rhs->imbalance);
}
2005
2006 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2007  * node with the highest LCP2 imbalance, and then determines the best
2008  * IP/destination node combination to move from the source node.
2009  */
2010 static void lcp2_failback(struct ctdb_context *ctdb,
2011                           struct ctdb_ipflags *ipflags,
2012                           struct public_ip_list *all_ips,
2013                           uint32_t *lcp2_imbalances,
2014                           bool *rebalance_candidates)
2015 {
2016         int i, numnodes;
2017         struct lcp2_imbalance_pnn * lips;
2018         bool again;
2019
2020         numnodes = talloc_array_length(ipflags);
2021
2022 try_again:
2023         /* Put the imbalances and nodes into an array, sort them and
2024          * iterate through candidates.  Usually the 1st one will be
2025          * used, so this doesn't cost much...
2026          */
2027         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2028         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2029         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2030         for (i=0; i<numnodes; i++) {
2031                 lips[i].imbalance = lcp2_imbalances[i];
2032                 lips[i].pnn = i;
2033                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2034         }
2035         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2036               lcp2_cmp_imbalance_pnn);
2037
2038         again = false;
2039         for (i=0; i<numnodes; i++) {
2040                 /* This means that all nodes had 0 or 1 addresses, so
2041                  * can't be imbalanced.
2042                  */
2043                 if (lips[i].imbalance == 0) {
2044                         break;
2045                 }
2046
2047                 if (lcp2_failback_candidate(ctdb,
2048                                             ipflags,
2049                                             all_ips,
2050                                             lips[i].pnn,
2051                                             lcp2_imbalances,
2052                                             rebalance_candidates)) {
2053                         again = true;
2054                         break;
2055                 }
2056         }
2057
2058         talloc_free(lips);
2059         if (again) {
2060                 goto try_again;
2061         }
2062 }
2063
2064 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2065                                     struct ctdb_ipflags *ipflags,
2066                                     struct public_ip_list *all_ips)
2067 {
2068         struct public_ip_list *tmp_ip;
2069
2070         /* verify that the assigned nodes can serve that public ip
2071            and set it to -1 if not
2072         */
2073         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2074                 if (tmp_ip->pnn == -1) {
2075                         continue;
2076                 }
2077                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2078                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2079                         /* this node can not serve this ip. */
2080                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2081                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2082                                            tmp_ip->pnn));
2083                         tmp_ip->pnn = -1;
2084                 }
2085         }
2086 }
2087
2088 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2089                                        struct ctdb_ipflags *ipflags,
2090                                        struct public_ip_list *all_ips)
2091 {
2092         struct public_ip_list *tmp_ip;
2093         int i, numnodes;
2094
2095         numnodes = talloc_array_length(ipflags);
2096
2097         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2098        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2099         *  always be allocated the same way for a specific set of
2100         *  available/unavailable nodes.
2101         */
2102
2103         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2104                 tmp_ip->pnn = i % numnodes;
2105         }
2106
2107         /* IP failback doesn't make sense with deterministic
2108          * IPs, since the modulo step above implicitly fails
2109          * back IPs to their "home" node.
2110          */
2111         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2112                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2113         }
2114
2115         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2116
2117         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2118
2119         /* No failback here! */
2120 }
2121
2122 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2123                                           struct ctdb_ipflags *ipflags,
2124                                           struct public_ip_list *all_ips)
2125 {
2126         /* This should be pushed down into basic_failback. */
2127         struct public_ip_list *tmp_ip;
2128         int num_ips = 0;
2129         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2130                 num_ips++;
2131         }
2132
2133         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2134
2135         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2136
2137         /* If we don't want IPs to fail back then don't rebalance IPs. */
2138         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2139                 return;
2140         }
2141
2142         /* Now, try to make sure the ip adresses are evenly distributed
2143            across the nodes.
2144         */
2145         basic_failback(ctdb, ipflags, all_ips, num_ips);
2146 }
2147
2148 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2149                           struct ctdb_ipflags *ipflags,
2150                           struct public_ip_list *all_ips,
2151                           uint32_t *force_rebalance_nodes)
2152 {
2153         uint32_t *lcp2_imbalances;
2154         bool *rebalance_candidates;
2155         int numnodes, num_rebalance_candidates, i;
2156
2157         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2158
2159         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2160
2161         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2162                   &lcp2_imbalances, &rebalance_candidates);
2163
2164         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2165
2166         /* If we don't want IPs to fail back then don't rebalance IPs. */
2167         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2168                 goto finished;
2169         }
2170
2171         /* It is only worth continuing if we have suitable target
2172          * nodes to transfer IPs to.  This check is much cheaper than
2173          * continuing on...
2174          */
2175         numnodes = talloc_array_length(ipflags);
2176         num_rebalance_candidates = 0;
2177         for (i=0; i<numnodes; i++) {
2178                 if (rebalance_candidates[i]) {
2179                         num_rebalance_candidates++;
2180                 }
2181         }
2182         if (num_rebalance_candidates == 0) {
2183                 goto finished;
2184         }
2185
2186         /* Now, try to make sure the ip adresses are evenly distributed
2187            across the nodes.
2188         */
2189         lcp2_failback(ctdb, ipflags, all_ips,
2190                       lcp2_imbalances, rebalance_candidates);
2191
2192 finished:
2193         talloc_free(tmp_ctx);
2194 }
2195
2196 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2197 {
2198         int i;
2199
2200         for (i=0;i<nodemap->num;i++) {
2201                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2202                         /* Found one completely healthy node */
2203                         return false;
2204                 }
2205         }
2206
2207         return true;
2208 }
2209
2210 /* The calculation part of the IP allocation algorithm. */
2211 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2212                                    struct ctdb_ipflags *ipflags,
2213                                    struct public_ip_list *all_ips,
2214                                    uint32_t *force_rebalance_nodes)
2215 {
2216         switch (ctdb->ipalloc_state->algorithm) {
2217         case IPALLOC_LCP2:
2218                 ip_alloc_lcp2(ctdb, ipflags, all_ips, force_rebalance_nodes);
2219                 break;
2220         case IPALLOC_DETERMINISTIC:
2221                 ip_alloc_deterministic_ips(ctdb, ipflags, all_ips);
2222                 break;
2223         case IPALLOC_NONDETERMINISTIC:
2224                 ip_alloc_nondeterministic_ips(ctdb, ipflags, all_ips);
2225                break;
2226         }
2227
2228         /* at this point ->pnn is the node which will own each IP
2229            or -1 if there is no node that can cover this ip
2230         */
2231
2232         return;
2233 }
2234
2235 struct get_tunable_callback_data {
2236         const char *tunable;
2237         uint32_t *out;
2238         bool fatal;
2239 };
2240
2241 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2242                                  int32_t res, TDB_DATA outdata,
2243                                  void *callback)
2244 {
2245         struct get_tunable_callback_data *cd =
2246                 (struct get_tunable_callback_data *)callback;
2247         int size;
2248
2249         if (res != 0) {
2250                 /* Already handled in fail callback */
2251                 return;
2252         }
2253
2254         if (outdata.dsize != sizeof(uint32_t)) {
2255                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2256                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2257                                  (int)outdata.dsize));
2258                 cd->fatal = true;
2259                 return;
2260         }
2261
2262         size = talloc_array_length(cd->out);
2263         if (pnn >= size) {
2264                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2265                                  cd->tunable, pnn, size));
2266                 return;
2267         }
2268
2269                 
2270         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2271 }
2272
2273 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2274                                        int32_t res, TDB_DATA outdata,
2275                                        void *callback)
2276 {
2277         struct get_tunable_callback_data *cd =
2278                 (struct get_tunable_callback_data *)callback;
2279
2280         switch (res) {
2281         case -ETIME:
2282                 DEBUG(DEBUG_ERR,
2283                       ("Timed out getting tunable \"%s\" from node %d\n",
2284                        cd->tunable, pnn));
2285                 cd->fatal = true;
2286                 break;
2287         case -EINVAL:
2288         case -1:
2289                 DEBUG(DEBUG_WARNING,
2290                       ("Tunable \"%s\" not implemented on node %d\n",
2291                        cd->tunable, pnn));
2292                 break;
2293         default:
2294                 DEBUG(DEBUG_ERR,
2295                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2296                        cd->tunable, pnn));
2297                 cd->fatal = true;
2298         }
2299 }
2300
2301 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2302                                         TALLOC_CTX *tmp_ctx,
2303                                         struct ctdb_node_map_old *nodemap,
2304                                         const char *tunable,
2305                                         uint32_t default_value)
2306 {
2307         TDB_DATA data;
2308         struct ctdb_control_get_tunable *t;
2309         uint32_t *nodes;
2310         uint32_t *tvals;
2311         struct get_tunable_callback_data callback_data;
2312         int i;
2313
2314         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2315         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2316         for (i=0; i<nodemap->num; i++) {
2317                 tvals[i] = default_value;
2318         }
2319                 
2320         callback_data.out = tvals;
2321         callback_data.tunable = tunable;
2322         callback_data.fatal = false;
2323
2324         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2325         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2326         t = (struct ctdb_control_get_tunable *)data.dptr;
2327         t->length = strlen(tunable)+1;
2328         memcpy(t->name, tunable, t->length);
2329         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2330         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2331                                       nodes, 0, TAKEOVER_TIMEOUT(),
2332                                       false, data,
2333                                       get_tunable_callback,
2334                                       get_tunable_fail_callback,
2335                                       &callback_data) != 0) {
2336                 if (callback_data.fatal) {
2337                         talloc_free(tvals);
2338                         tvals = NULL;
2339                 }
2340         }
2341         talloc_free(nodes);
2342         talloc_free(data.dptr);
2343
2344         return tvals;
2345 }
2346
2347 /* Set internal flags for IP allocation:
2348  *   Clear ip flags
2349  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2350  *   Set NOIPHOST ip flag for each INACTIVE node
2351  *   if all nodes are disabled:
2352  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2353  *   else
2354  *     Set NOIPHOST ip flags for disabled nodes
2355  */
2356 static struct ctdb_ipflags *
2357 set_ipflags_internal(struct ctdb_context *ctdb,
2358                      TALLOC_CTX *tmp_ctx,
2359                      struct ctdb_node_map_old *nodemap,
2360                      uint32_t *tval_noiptakeover,
2361                      uint32_t *tval_noiphostonalldisabled)
2362 {
2363         int i;
2364         struct ctdb_ipflags *ipflags;
2365
2366         /* Clear IP flags - implicit due to talloc_zero */
2367         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2368         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2369
2370         for (i=0;i<nodemap->num;i++) {
2371                 /* Can not take IPs on node with NoIPTakeover set */
2372                 if (tval_noiptakeover[i] != 0) {
2373                         ipflags[i].noiptakeover = true;
2374                 }
2375
2376                 /* Can not host IPs on INACTIVE node */
2377                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2378                         ipflags[i].noiphost = true;
2379                 }
2380         }
2381
2382         if (all_nodes_are_disabled(nodemap)) {
2383                 /* If all nodes are disabled, can not host IPs on node
2384                  * with NoIPHostOnAllDisabled set
2385                  */
2386                 for (i=0;i<nodemap->num;i++) {
2387                         if (tval_noiphostonalldisabled[i] != 0) {
2388                                 ipflags[i].noiphost = true;
2389                         }
2390                 }
2391         } else {
2392                 /* If some nodes are not disabled, then can not host
2393                  * IPs on DISABLED node
2394                  */
2395                 for (i=0;i<nodemap->num;i++) {
2396                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2397                                 ipflags[i].noiphost = true;
2398                         }
2399                 }
2400         }
2401
2402         return ipflags;
2403 }
2404
2405 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2406                                         TALLOC_CTX *tmp_ctx,
2407                                         struct ctdb_node_map_old *nodemap)
2408 {
2409         uint32_t *tval_noiptakeover;
2410         uint32_t *tval_noiphostonalldisabled;
2411         struct ctdb_ipflags *ipflags;
2412
2413
2414         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2415                                                    "NoIPTakeover", 0);
2416         if (tval_noiptakeover == NULL) {
2417                 return NULL;
2418         }
2419
2420         tval_noiphostonalldisabled =
2421                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2422                                        "NoIPHostOnAllDisabled", 0);
2423         if (tval_noiphostonalldisabled == NULL) {
2424                 /* Caller frees tmp_ctx */
2425                 return NULL;
2426         }
2427
2428         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2429                                        tval_noiptakeover,
2430                                        tval_noiphostonalldisabled);
2431
2432         talloc_free(tval_noiptakeover);
2433         talloc_free(tval_noiphostonalldisabled);
2434
2435         return ipflags;
2436 }
2437
2438 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2439                                                  TALLOC_CTX *mem_ctx)
2440 {
2441         struct ipalloc_state *ipalloc_state =
2442                 talloc_zero(mem_ctx, struct ipalloc_state);
2443         if (ipalloc_state == NULL) {
2444                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2445                 return NULL;
2446         }
2447
2448         ipalloc_state->num = ctdb->num_nodes;
2449         ipalloc_state->known_public_ips =
2450                 talloc_zero_array(ipalloc_state,
2451                                   struct ctdb_public_ip_list_old *,
2452                                   ipalloc_state->num);
2453         if (ipalloc_state->known_public_ips == NULL) {
2454                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2455                 talloc_free(ipalloc_state);
2456                 return NULL;
2457         }
2458         ipalloc_state->available_public_ips =
2459                 talloc_zero_array(ipalloc_state,
2460                                   struct ctdb_public_ip_list_old *,
2461                                   ipalloc_state->num);
2462         if (ipalloc_state->available_public_ips == NULL) {
2463                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2464                 talloc_free(ipalloc_state);
2465                 return NULL;
2466         }
2467
2468         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2469                 ipalloc_state->algorithm = IPALLOC_LCP2;
2470         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2471                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2472         } else {
2473                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2474         }
2475
2476         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2477
2478         return ipalloc_state;
2479 }
2480
2481 struct iprealloc_callback_data { /* state handed to iprealloc_fail_callback() */
2482         bool *retry_nodes; /* talloc array; entry [pnn] is set when that node should be retried */
2483         int retry_count; /* incremented each time a node is flagged for retry */
2484         client_async_callback fail_callback; /* called on -ETIME failures; may be NULL */
2485         void *fail_callback_data; /* passed through to fail_callback */
2486         struct ctdb_node_map_old *nodemap; /* consulted for the INACTIVE flag of the failing node */
2487 };
2488
2489 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2490                                         int32_t res, TDB_DATA outdata,
2491                                         void *callback)
2492 {
2493         int numnodes;
2494         struct iprealloc_callback_data *cd =
2495                 (struct iprealloc_callback_data *)callback;
2496
2497         numnodes = talloc_array_length(cd->retry_nodes);
2498         if (pnn > numnodes) {
2499                 DEBUG(DEBUG_ERR,
2500                       ("ipreallocated failure from node %d, "
2501                        "but only %d nodes in nodemap\n",
2502                        pnn, numnodes));
2503                 return;
2504         }
2505
2506         /* Can't run the "ipreallocated" event on a INACTIVE node */
2507         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2508                 DEBUG(DEBUG_WARNING,
2509                       ("ipreallocated failed on inactive node %d, ignoring\n",
2510                        pnn));
2511                 return;
2512         }
2513
2514         switch (res) {
2515         case -ETIME:
2516                 /* If the control timed out then that's a real error,
2517                  * so call the real fail callback
2518                  */
2519                 if (cd->fail_callback) {
2520                         cd->fail_callback(ctdb, pnn, res, outdata,
2521                                           cd->fail_callback_data);
2522                 } else {
2523                         DEBUG(DEBUG_WARNING,
2524                               ("iprealloc timed out but no callback registered\n"));
2525                 }
2526                 break;
2527         default:
2528                 /* If not a timeout then either the ipreallocated
2529                  * eventscript (or some setup) failed.  This might
2530                  * have failed because the IPREALLOCATED control isn't
2531                  * implemented - right now there is no way of knowing
2532                  * because the error codes are all folded down to -1.
2533                  * Consider retrying using EVENTSCRIPT control...
2534                  */
2535                 DEBUG(DEBUG_WARNING,
2536                       ("ipreallocated failure from node %d, flagging retry\n",
2537                        pnn));
2538                 cd->retry_nodes[pnn] = true;
2539                 cd->retry_count++;
2540         }
2541 }
2542
2543 struct takeover_callback_data { /* state handed to takeover_run_fail_callback() */
2544         bool *node_failed; /* entry [i] is set once a failure for nodemap->nodes[i] has been reported */
2545         client_async_callback fail_callback; /* called once per failing node */
2546         void *fail_callback_data; /* passed through to fail_callback */
2547         struct ctdb_node_map_old *nodemap; /* used to map a PNN to its index in node_failed */
2548 };
2549
2550 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2551                                        uint32_t node_pnn, int32_t res,
2552                                        TDB_DATA outdata, void *callback_data)
2553 {
2554         struct takeover_callback_data *cd =
2555                 talloc_get_type_abort(callback_data,
2556                                       struct takeover_callback_data);
2557         int i;
2558
2559         for (i = 0; i < cd->nodemap->num; i++) {
2560                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2561                         break;
2562                 }
2563         }
2564
2565         if (i == cd->nodemap->num) {
2566                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2567                 return;
2568         }
2569
2570         if (!cd->node_failed[i]) {
2571                 cd->node_failed[i] = true;
2572                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2573                                   cd->fail_callback_data);
2574         }
2575 }
2576
2577 /*
2578   make any IP alias changes for public addresses that are necessary 
2579  */
2580 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2581                       uint32_t *force_rebalance_nodes,
2582                       client_async_callback fail_callback, void *callback_data)
2583 {
2584         int i, j, ret;
2585         struct ctdb_public_ip ip;
2586         uint32_t *nodes;
2587         struct public_ip_list *all_ips, *tmp_ip;
2588         TDB_DATA data;
2589         struct timeval timeout;
2590         struct client_async_data *async_data;
2591         struct ctdb_client_control_state *state;
2592         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2593         struct ctdb_ipflags *ipflags;
2594         struct ipalloc_state *ipalloc_state;
2595         struct takeover_callback_data *takeover_data;
2596         struct iprealloc_callback_data iprealloc_data;
2597         bool *retry_data;
2598         bool can_host_ips;
2599
2600         /*
2601          * ip failover is completely disabled, just send out the 
2602          * ipreallocated event.
2603          */
2604         if (ctdb->tunable.disable_ip_failover != 0) {
2605                 goto ipreallocated;
2606         }
2607
2608         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2609         if (ipalloc_state == NULL) {
2610                 talloc_free(tmp_ctx);
2611                 return -1;
2612         }
2613         ctdb->ipalloc_state = ipalloc_state;
2614
2615         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2616         if (ipflags == NULL) {
2617                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2618                 talloc_free(tmp_ctx);
2619                 return -1;
2620         }
2621
2622         /* Fetch known/available public IPs from each active node */
2623         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2624         if (ret != 0) {
2625                 talloc_free(tmp_ctx);
2626                 return -1;
2627         }
2628
2629         /* Short-circuit IP allocation if no node has available IPs */
2630         can_host_ips = false;
2631         for (i=0; i < ipalloc_state->num; i++) {
2632                 if (ipalloc_state->available_public_ips[i] != NULL) {
2633                         can_host_ips = true;
2634                 }
2635         }
2636         if (!can_host_ips) {
2637                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2638                 return 0;
2639         }
2640
2641         /* since nodes only know about those public addresses that
2642            can be served by that particular node, no single node has
2643            a full list of all public addresses that exist in the cluster.
2644            Walk over all node structures and create a merged list of
2645            all public addresses that exist in the cluster.
2646
2647            keep the tree of ips around as ctdb->ip_tree
2648         */
2649         all_ips = create_merged_ip_list(ctdb);
2650
2651         /* Do the IP reassignment calculations */
2652         ctdb_takeover_run_core(ctdb, ipflags, all_ips, force_rebalance_nodes);
2653
2654         /* Now tell all nodes to release any public IPs should not
2655          * host.  This will be a NOOP on nodes that don't currently
2656          * hold the given IP.
2657          */
2658         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2659         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2660
2661         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2662                                                        bool, nodemap->num);
2663         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2664         takeover_data->fail_callback = fail_callback;
2665         takeover_data->fail_callback_data = callback_data;
2666         takeover_data->nodemap = nodemap;
2667
2668         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2669         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2670
2671         async_data->fail_callback = takeover_run_fail_callback;
2672         async_data->callback_data = takeover_data;
2673
2674         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2675
2676         /* Send a RELEASE_IP to all nodes that should not be hosting
2677          * each IP.  For each IP, all but one of these will be
2678          * redundant.  However, the redundant ones are used to tell
2679          * nodes which node should be hosting the IP so that commands
2680          * like "ctdb ip" can display a particular nodes idea of who
2681          * is hosting what. */
2682         for (i=0;i<nodemap->num;i++) {
2683                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2684                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2685                         continue;
2686                 }
2687
2688                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2689                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2690                                 /* This node should be serving this
2691                                    vnn so don't tell it to release the ip
2692                                 */
2693                                 continue;
2694                         }
2695                         ip.pnn  = tmp_ip->pnn;
2696                         ip.addr = tmp_ip->addr;
2697
2698                         timeout = TAKEOVER_TIMEOUT();
2699                         data.dsize = sizeof(ip);
2700                         data.dptr  = (uint8_t *)&ip;
2701                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2702                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2703                                                   data, async_data,
2704                                                   &timeout, NULL);
2705                         if (state == NULL) {
2706                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2707                                 talloc_free(tmp_ctx);
2708                                 return -1;
2709                         }
2710
2711                         ctdb_client_async_add(async_data, state);
2712                 }
2713         }
2714         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2715                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2716                 talloc_free(tmp_ctx);
2717                 return -1;
2718         }
2719         talloc_free(async_data);
2720
2721
2722         /* For each IP, send a TAKOVER_IP to the node that should be
2723          * hosting it.  Many of these will often be redundant (since
2724          * the allocation won't have changed) but they can be useful
2725          * to recover from inconsistencies. */
2726         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2727         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2728
2729         async_data->fail_callback = fail_callback;
2730         async_data->callback_data = callback_data;
2731
2732         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2733                 if (tmp_ip->pnn == -1) {
2734                         /* this IP won't be taken over */
2735                         continue;
2736                 }
2737
2738                 ip.pnn  = tmp_ip->pnn;
2739                 ip.addr = tmp_ip->addr;
2740
2741                 timeout = TAKEOVER_TIMEOUT();
2742                 data.dsize = sizeof(ip);
2743                 data.dptr  = (uint8_t *)&ip;
2744                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2745                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2746                                           data, async_data, &timeout, NULL);
2747                 if (state == NULL) {
2748                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2749                         talloc_free(tmp_ctx);
2750                         return -1;
2751                 }
2752
2753                 ctdb_client_async_add(async_data, state);
2754         }
2755         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2756                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2757                 talloc_free(tmp_ctx);
2758                 return -1;
2759         }
2760
2761 ipreallocated:
2762         /*
2763          * Tell all nodes to run eventscripts to process the
2764          * "ipreallocated" event.  This can do a lot of things,
2765          * including restarting services to reconfigure them if public
2766          * IPs have moved.  Once upon a time this event only used to
2767          * update natgw.
2768          */
2769         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2770         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2771         iprealloc_data.retry_nodes = retry_data;
2772         iprealloc_data.retry_count = 0;
2773         iprealloc_data.fail_callback = fail_callback;
2774         iprealloc_data.fail_callback_data = callback_data;
2775         iprealloc_data.nodemap = nodemap;
2776
2777         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2778         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2779                                         nodes, 0, TAKEOVER_TIMEOUT(),
2780                                         false, tdb_null,
2781                                         NULL, iprealloc_fail_callback,
2782                                         &iprealloc_data);
2783         if (ret != 0) {
2784                 /* If the control failed then we should retry to any
2785                  * nodes flagged by iprealloc_fail_callback using the
2786                  * EVENTSCRIPT control.  This is a best-effort at
2787                  * backward compatiblity when running a mixed cluster
2788                  * where some nodes have not yet been upgraded to
2789                  * support the IPREALLOCATED control.
2790                  */
2791                 DEBUG(DEBUG_WARNING,
2792                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2793
2794                 nodes = talloc_array(tmp_ctx, uint32_t,
2795                                      iprealloc_data.retry_count);
2796                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2797
2798                 j = 0;
2799                 for (i=0; i<nodemap->num; i++) {
2800                         if (iprealloc_data.retry_nodes[i]) {
2801                                 nodes[j] = i;
2802                                 j++;
2803                         }
2804                 }
2805
2806                 data.dptr  = discard_const("ipreallocated");
2807                 data.dsize = strlen((char *)data.dptr) + 1; 
2808                 ret = ctdb_client_async_control(ctdb,
2809                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2810                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2811                                                 false, data,
2812                                                 NULL, fail_callback,
2813                                                 callback_data);
2814                 if (ret != 0) {
2815                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2816                 }
2817         }
2818
2819         talloc_free(tmp_ctx);
2820         return ret;
2821 }
2822
2823
2824 /*
2825   destroy a ctdb_client_ip structure
2826  */
2827 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2828 {
2829         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2830                 ctdb_addr_to_str(&ip->addr),
2831                 ntohs(ip->addr.ip.sin_port),
2832                 ip->client_id));
2833
2834         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2835         return 0;
2836 }
2837
2838 /*
2839   called by a client to inform us of a TCP connection that it is managing
2840   that should tickled with an ACK when IP takeover is done
2841  */
2842 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2843                                 TDB_DATA indata)
2844 {
2845         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2846         struct ctdb_connection *tcp_sock = NULL;
2847         struct ctdb_tcp_list *tcp;
2848         struct ctdb_connection t;
2849         int ret;
2850         TDB_DATA data;
2851         struct ctdb_client_ip *ip;
2852         struct ctdb_vnn *vnn;
2853         ctdb_sock_addr addr;
2854
2855         /* If we don't have public IPs, tickles are useless */
2856         if (ctdb->vnn == NULL) {
2857                 return 0;
2858         }
2859
2860         tcp_sock = (struct ctdb_connection *)indata.dptr;
2861
2862         addr = tcp_sock->src;
2863         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2864         addr = tcp_sock->dst;
2865         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2866
2867         ZERO_STRUCT(addr);
2868         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2869         vnn = find_public_ip_vnn(ctdb, &addr);
2870         if (vnn == NULL) {
2871                 switch (addr.sa.sa_family) {
2872                 case AF_INET:
2873                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2874                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2875                                         ctdb_addr_to_str(&addr)));
2876                         }
2877                         break;
2878                 case AF_INET6:
2879                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2880                                 ctdb_addr_to_str(&addr)));
2881                         break;
2882                 default:
2883                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2884                 }
2885
2886                 return 0;
2887         }
2888
2889         if (vnn->pnn != ctdb->pnn) {
2890                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2891                         ctdb_addr_to_str(&addr),
2892                         client_id, client->pid));
2893                 /* failing this call will tell smbd to die */
2894                 return -1;
2895         }
2896
2897         ip = talloc(client, struct ctdb_client_ip);
2898         CTDB_NO_MEMORY(ctdb, ip);
2899
2900         ip->ctdb      = ctdb;
2901         ip->addr      = addr;
2902         ip->client_id = client_id;
2903         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2904         DLIST_ADD(ctdb->client_ip_list, ip);
2905
2906         tcp = talloc(client, struct ctdb_tcp_list);
2907         CTDB_NO_MEMORY(ctdb, tcp);
2908
2909         tcp->connection.src = tcp_sock->src;
2910         tcp->connection.dst = tcp_sock->dst;
2911
2912         DLIST_ADD(client->tcp_list, tcp);
2913
2914         t.src = tcp_sock->src;
2915         t.dst = tcp_sock->dst;
2916
2917         data.dptr = (uint8_t *)&t;
2918         data.dsize = sizeof(t);
2919
2920         switch (addr.sa.sa_family) {
2921         case AF_INET:
2922                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2923                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2924                         ctdb_addr_to_str(&tcp_sock->src),
2925                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2926                 break;
2927         case AF_INET6:
2928                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2929                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2930                         ctdb_addr_to_str(&tcp_sock->src),
2931                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2932                 break;
2933         default:
2934                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2935         }
2936
2937
2938         /* tell all nodes about this tcp connection */
2939         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2940                                        CTDB_CONTROL_TCP_ADD,
2941                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2942         if (ret != 0) {
2943                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2944                 return -1;
2945         }
2946
2947         return 0;
2948 }
2949
2950 /*
2951   find a tcp address on a list
2952  */
2953 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2954                                            struct ctdb_connection *tcp)
2955 {
2956         int i;
2957
2958         if (array == NULL) {
2959                 return NULL;
2960         }
2961
2962         for (i=0;i<array->num;i++) {
2963                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2964                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2965                         return &array->connections[i];
2966                 }
2967         }
2968         return NULL;
2969 }
2970
2971
2972
2973 /*
2974   called by a daemon to inform us of a TCP connection that one of its
2975   clients managing that should tickled with an ACK when IP takeover is
2976   done
2977  */
2978 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2979 {
2980         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2981         struct ctdb_tcp_array *tcparray;
2982         struct ctdb_connection tcp;
2983         struct ctdb_vnn *vnn;
2984
2985         /* If we don't have public IPs, tickles are useless */
2986         if (ctdb->vnn == NULL) {
2987                 return 0;
2988         }
2989
2990         vnn = find_public_ip_vnn(ctdb, &p->dst);
2991         if (vnn == NULL) {
2992                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2993                         ctdb_addr_to_str(&p->dst)));
2994
2995                 return -1;
2996         }
2997
2998
2999         tcparray = vnn->tcp_array;
3000
3001         /* If this is the first tickle */
3002         if (tcparray == NULL) {
3003                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3004                 CTDB_NO_MEMORY(ctdb, tcparray);
3005                 vnn->tcp_array = tcparray;
3006
3007                 tcparray->num = 0;
3008                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3009                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3010
3011                 tcparray->connections[tcparray->num].src = p->src;
3012                 tcparray->connections[tcparray->num].dst = p->dst;
3013                 tcparray->num++;
3014
3015                 if (tcp_update_needed) {
3016                         vnn->tcp_update_needed = true;
3017                 }
3018                 return 0;
3019         }
3020
3021
3022         /* Do we already have this tickle ?*/
3023         tcp.src = p->src;
3024         tcp.dst = p->dst;
3025         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3026                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3027                         ctdb_addr_to_str(&tcp.dst),
3028                         ntohs(tcp.dst.ip.sin_port),
3029                         vnn->pnn));
3030                 return 0;
3031         }
3032
3033         /* A new tickle, we must add it to the array */
3034         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3035                                         struct ctdb_connection,
3036                                         tcparray->num+1);
3037         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3038
3039         tcparray->connections[tcparray->num].src = p->src;
3040         tcparray->connections[tcparray->num].dst = p->dst;
3041         tcparray->num++;
3042
3043         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3044                 ctdb_addr_to_str(&tcp.dst),
3045                 ntohs(tcp.dst.ip.sin_port),
3046                 vnn->pnn));
3047
3048         if (tcp_update_needed) {
3049                 vnn->tcp_update_needed = true;
3050         }
3051
3052         return 0;
3053 }
3054
3055
3056 /*
3057   called by a daemon to inform us of a TCP connection that one of its
3058   clients managing that should tickled with an ACK when IP takeover is
3059   done
3060  */
3061 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3062 {
3063         struct ctdb_connection *tcpp;
3064         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3065
3066         if (vnn == NULL) {
3067                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3068                         ctdb_addr_to_str(&conn->dst)));
3069                 return;
3070         }
3071
3072         /* if the array is empty we cant remove it
3073            and we don't need to do anything
3074          */
3075         if (vnn->tcp_array == NULL) {
3076                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3077                         ctdb_addr_to_str(&conn->dst),
3078                         ntohs(conn->dst.ip.sin_port)));
3079                 return;
3080         }
3081
3082
3083         /* See if we know this connection
3084            if we don't know this connection  then we dont need to do anything
3085          */
3086         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3087         if (tcpp == NULL) {
3088                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3089                         ctdb_addr_to_str(&conn->dst),
3090                         ntohs(conn->dst.ip.sin_port)));
3091                 return;
3092         }
3093
3094
3095         /* We need to remove this entry from the array.
3096            Instead of allocating a new array and copying data to it
3097            we cheat and just copy the last entry in the existing array
3098            to the entry that is to be removed and just shring the 
3099            ->num field
3100          */
3101         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3102         vnn->tcp_array->num--;
3103
3104         /* If we deleted the last entry we also need to remove the entire array
3105          */
3106         if (vnn->tcp_array->num == 0) {
3107                 talloc_free(vnn->tcp_array);
3108                 vnn->tcp_array = NULL;
3109         }               
3110
3111         vnn->tcp_update_needed = true;
3112
3113         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3114                 ctdb_addr_to_str(&conn->src),
3115                 ntohs(conn->src.ip.sin_port)));
3116 }
3117
3118
3119 /*
3120   called by a daemon to inform us of a TCP connection that one of its
3121   clients used are no longer needed in the tickle database
3122  */
3123 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3124 {
3125         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3126
3127         /* If we don't have public IPs, tickles are useless */
3128         if (ctdb->vnn == NULL) {
3129                 return 0;
3130         }
3131
3132         ctdb_remove_connection(ctdb, conn);
3133
3134         return 0;
3135 }
3136
3137
3138 /*
3139   Called when another daemon starts - causes all tickles for all
3140   public addresses we are serving to be sent to the new node on the
3141   next check.  This actually causes the next scheduled call to
3142   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3143   doesn't require careful error handling.
3144  */
3145 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3146 {
3147         struct ctdb_vnn *vnn;
3148
3149         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3150                            (unsigned long) pnn));
3151
3152         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3153                 vnn->tcp_update_needed = true;
3154         }
3155
3156         return 0;
3157 }
3158
3159
3160 /*
3161   called when a client structure goes away - hook to remove
3162   elements from the tcp_list in all daemons
3163  */
3164 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3165 {
3166         while (client->tcp_list) {
3167                 struct ctdb_tcp_list *tcp = client->tcp_list;
3168                 DLIST_REMOVE(client->tcp_list, tcp);
3169                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3170         }
3171 }
3172
3173
3174 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3175 {
3176         struct ctdb_vnn *vnn;
3177         int count = 0;
3178
3179         if (ctdb->tunable.disable_ip_failover == 1) {
3180                 return;
3181         }
3182
3183         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3184                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3185                         ctdb_vnn_unassign_iface(ctdb, vnn);
3186                         continue;
3187                 }
3188                 if (!vnn->iface) {
3189                         continue;
3190                 }
3191
3192                 /* Don't allow multiple releases at once.  Some code,
3193                  * particularly ctdb_tickle_sentenced_connections() is
3194                  * not re-entrant */
3195                 if (vnn->update_in_flight) {
3196                         DEBUG(DEBUG_WARNING,
3197                               (__location__
3198                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3199                                     ctdb_addr_to_str(&vnn->public_address),
3200                                     vnn->public_netmask_bits,
3201                                     ctdb_vnn_iface_string(vnn)));
3202                         continue;
3203                 }
3204                 vnn->update_in_flight = true;
3205
3206                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3207                                     ctdb_addr_to_str(&vnn->public_address),
3208                                     vnn->public_netmask_bits,
3209                                     ctdb_vnn_iface_string(vnn)));
3210
3211                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3212                                   ctdb_vnn_iface_string(vnn),
3213                                   ctdb_addr_to_str(&vnn->public_address),
3214                                   vnn->public_netmask_bits);
3215                 release_kill_clients(ctdb, &vnn->public_address);
3216                 ctdb_vnn_unassign_iface(ctdb, vnn);
3217                 vnn->update_in_flight = false;
3218                 count++;
3219         }
3220
3221         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3222 }
3223
3224
3225 /*
3226   get list of public IPs
3227  */
3228 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3229                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3230 {
3231         int i, num, len;
3232         struct ctdb_public_ip_list_old *ips;
3233         struct ctdb_vnn *vnn;
3234         bool only_available = false;
3235
3236         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3237                 only_available = true;
3238         }
3239
3240         /* count how many public ip structures we have */
3241         num = 0;
3242         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3243                 num++;
3244         }
3245
3246         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3247                 num*sizeof(struct ctdb_public_ip);
3248         ips = talloc_zero_size(outdata, len);
3249         CTDB_NO_MEMORY(ctdb, ips);
3250
3251         i = 0;
3252         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3253                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3254                         continue;
3255                 }
3256                 ips->ips[i].pnn  = vnn->pnn;
3257                 ips->ips[i].addr = vnn->public_address;
3258                 i++;
3259         }
3260         ips->num = i;
3261         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3262                 i*sizeof(struct ctdb_public_ip);
3263
3264         outdata->dsize = len;
3265         outdata->dptr  = (uint8_t *)ips;
3266
3267         return 0;
3268 }
3269
3270
3271 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3272                                         struct ctdb_req_control_old *c,
3273                                         TDB_DATA indata,
3274                                         TDB_DATA *outdata)
3275 {
3276         int i, num, len;
3277         ctdb_sock_addr *addr;
3278         struct ctdb_public_ip_info_old *info;
3279         struct ctdb_vnn *vnn;
3280
3281         addr = (ctdb_sock_addr *)indata.dptr;
3282
3283         vnn = find_public_ip_vnn(ctdb, addr);
3284         if (vnn == NULL) {
3285                 /* if it is not a public ip   it could be our 'single ip' */
3286                 if (ctdb->single_ip_vnn) {
3287                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3288                                 vnn = ctdb->single_ip_vnn;
3289                         }
3290                 }
3291         }
3292         if (vnn == NULL) {
3293                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3294                                  "'%s'not a public address\n",
3295                                  ctdb_addr_to_str(addr)));
3296                 return -1;
3297         }
3298
3299         /* count how many public ip structures we have */
3300         num = 0;
3301         for (;vnn->ifaces[num];) {
3302                 num++;
3303         }
3304
3305         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3306                 num*sizeof(struct ctdb_iface);
3307         info = talloc_zero_size(outdata, len);
3308         CTDB_NO_MEMORY(ctdb, info);
3309
3310         info->ip.addr = vnn->public_address;
3311         info->ip.pnn = vnn->pnn;
3312         info->active_idx = 0xFFFFFFFF;
3313
3314         for (i=0; vnn->ifaces[i]; i++) {
3315                 struct ctdb_interface *cur;
3316
3317                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3318                 if (cur == NULL) {
3319                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3320                                            vnn->ifaces[i]));
3321                         return -1;
3322                 }
3323                 if (vnn->iface == cur) {
3324                         info->active_idx = i;
3325                 }
3326                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3327                 info->ifaces[i].link_state = cur->link_up;
3328                 info->ifaces[i].references = cur->references;
3329         }
3330         info->num = i;
3331         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3332                 i*sizeof(struct ctdb_iface);
3333
3334         outdata->dsize = len;
3335         outdata->dptr  = (uint8_t *)info;
3336
3337         return 0;
3338 }
3339
3340 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3341                                 struct ctdb_req_control_old *c,
3342                                 TDB_DATA *outdata)
3343 {
3344         int i, num, len;
3345         struct ctdb_iface_list_old *ifaces;
3346         struct ctdb_interface *cur;
3347
3348         /* count how many public ip structures we have */
3349         num = 0;
3350         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3351                 num++;
3352         }
3353
3354         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3355                 num*sizeof(struct ctdb_iface);
3356         ifaces = talloc_zero_size(outdata, len);
3357         CTDB_NO_MEMORY(ctdb, ifaces);
3358
3359         i = 0;
3360         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3361                 strcpy(ifaces->ifaces[i].name, cur->name);
3362                 ifaces->ifaces[i].link_state = cur->link_up;
3363                 ifaces->ifaces[i].references = cur->references;
3364                 i++;
3365         }
3366         ifaces->num = i;
3367         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3368                 i*sizeof(struct ctdb_iface);
3369
3370         outdata->dsize = len;
3371         outdata->dptr  = (uint8_t *)ifaces;
3372
3373         return 0;
3374 }
3375
3376 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3377                                     struct ctdb_req_control_old *c,
3378                                     TDB_DATA indata)
3379 {
3380         struct ctdb_iface *info;
3381         struct ctdb_interface *iface;
3382         bool link_up = false;
3383
3384         info = (struct ctdb_iface *)indata.dptr;
3385
3386         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3387                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3388                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3389                                   len, len, info->name));
3390                 return -1;
3391         }
3392
3393         switch (info->link_state) {
3394         case 0:
3395                 link_up = false;
3396                 break;
3397         case 1:
3398                 link_up = true;
3399                 break;
3400         default:
3401                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3402                                   (unsigned int)info->link_state));
3403                 return -1;
3404         }
3405
3406         if (info->references != 0) {
3407                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3408                                   (unsigned int)info->references));
3409                 return -1;
3410         }
3411
3412         iface = ctdb_find_iface(ctdb, info->name);
3413         if (iface == NULL) {
3414                 return -1;
3415         }
3416
3417         if (link_up == iface->link_up) {
3418                 return 0;
3419         }
3420
3421         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3422               ("iface[%s] has changed it's link status %s => %s\n",
3423                iface->name,
3424                iface->link_up?"up":"down",
3425                link_up?"up":"down"));
3426
3427         iface->link_up = link_up;
3428         return 0;
3429 }
3430
3431
3432 /* 
3433    structure containing the listening socket and the list of tcp connections
3434    that the ctdb daemon is to kill
3435 */
3436 struct ctdb_kill_tcp {
3437         struct ctdb_vnn *vnn;
3438         struct ctdb_context *ctdb;
3439         int capture_fd;
3440         struct tevent_fd *fde;
3441         trbt_tree_t *connections;
3442         void *private_data;
3443 };
3444
3445 /*
3446   a tcp connection that is to be killed
3447  */
3448 struct ctdb_killtcp_con {
3449         ctdb_sock_addr src_addr;
3450         ctdb_sock_addr dst_addr;
3451         int count;
3452         struct ctdb_kill_tcp *killtcp;
3453 };
3454
3455 /* this function is used to create a key to represent this socketpair
3456    in the killtcp tree.
3457    this key is used to insert and lookup matching socketpairs that are
3458    to be tickled and RST
3459 */
3460 #define KILLTCP_KEYLEN  10
3461 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3462 {
3463         static uint32_t key[KILLTCP_KEYLEN];
3464
3465         bzero(key, sizeof(key));
3466
3467         if (src->sa.sa_family != dst->sa.sa_family) {
3468                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3469                 return key;
3470         }
3471         
3472         switch (src->sa.sa_family) {
3473         case AF_INET:
3474                 key[0]  = dst->ip.sin_addr.s_addr;
3475                 key[1]  = src->ip.sin_addr.s_addr;
3476                 key[2]  = dst->ip.sin_port;
3477