ctdb-ipalloc: Move IP allocation state into its own struct
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
46
47 #define CTDB_ARP_INTERVAL 1
48 #define CTDB_ARP_REPEAT   3
49
50 /* Flags used in IP allocation algorithms. */
51 struct ctdb_ipflags {
52         bool noiptakeover;
53         bool noiphost;
54 };
55
56 struct ipalloc_state {
57         uint32_t num;
58
59         /* Arrays with data for each node */
60         struct ctdb_public_ip_list_old **known_public_ips;
61         struct ctdb_public_ip_list_old **available_public_ips;
62 };
63
64 struct ctdb_interface {
65         struct ctdb_interface *prev, *next;
66         const char *name;
67         bool link_up;
68         uint32_t references;
69 };
70
71 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
72 {
73         if (vnn->iface) {
74                 return vnn->iface->name;
75         }
76
77         return "__none__";
78 }
79
80 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
81 {
82         struct ctdb_interface *i;
83
84         /* Verify that we don't have an entry for this ip yet */
85         for (i=ctdb->ifaces;i;i=i->next) {
86                 if (strcmp(i->name, iface) == 0) {
87                         return 0;
88                 }
89         }
90
91         /* create a new structure for this interface */
92         i = talloc_zero(ctdb, struct ctdb_interface);
93         CTDB_NO_MEMORY_FATAL(ctdb, i);
94         i->name = talloc_strdup(i, iface);
95         CTDB_NO_MEMORY(ctdb, i->name);
96
97         i->link_up = true;
98
99         DLIST_ADD(ctdb->ifaces, i);
100
101         return 0;
102 }
103
104 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
105                                         const char *name)
106 {
107         int n;
108
109         for (n = 0; vnn->ifaces[n] != NULL; n++) {
110                 if (strcmp(name, vnn->ifaces[n]) == 0) {
111                         return true;
112                 }
113         }
114
115         return false;
116 }
117
118 /* If any interfaces now have no possible IPs then delete them.  This
119  * implementation is naive (i.e. simple) rather than clever
120  * (i.e. complex).  Given that this is run on delip and that operation
121  * is rare, this doesn't need to be efficient - it needs to be
122  * foolproof.  One alternative is reference counting, where the logic
123  * is distributed and can, therefore, be broken in multiple places.
124  * Another alternative is to build a red-black tree of interfaces that
125  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
126  * once) and then walking ctdb->ifaces once and deleting those not in
127  * the tree.  Let's go to one of those if the naive implementation
128  * causes problems...  :-)
129  */
130 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
131                                         struct ctdb_vnn *vnn)
132 {
133         struct ctdb_interface *i, *next;
134
135         /* For each interface, check if there's an IP using it. */
136         for (i = ctdb->ifaces; i != NULL; i = next) {
137                 struct ctdb_vnn *tv;
138                 bool found;
139                 next = i->next;
140
141                 /* Only consider interfaces named in the given VNN. */
142                 if (!vnn_has_interface_with_name(vnn, i->name)) {
143                         continue;
144                 }
145
146                 /* Is the "single IP" on this interface? */
147                 if ((ctdb->single_ip_vnn != NULL) &&
148                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
149                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
150                         /* Found, next interface please... */
151                         continue;
152                 }
153                 /* Search for a vnn with this interface. */
154                 found = false;
155                 for (tv=ctdb->vnn; tv; tv=tv->next) {
156                         if (vnn_has_interface_with_name(tv, i->name)) {
157                                 found = true;
158                                 break;
159                         }
160                 }
161
162                 if (!found) {
163                         /* None of the VNNs are using this interface. */
164                         DLIST_REMOVE(ctdb->ifaces, i);
165                         talloc_free(i);
166                 }
167         }
168 }
169
170
171 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
172                                               const char *iface)
173 {
174         struct ctdb_interface *i;
175
176         for (i=ctdb->ifaces;i;i=i->next) {
177                 if (strcmp(i->name, iface) == 0) {
178                         return i;
179                 }
180         }
181
182         return NULL;
183 }
184
185 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
186                                                   struct ctdb_vnn *vnn)
187 {
188         int i;
189         struct ctdb_interface *cur = NULL;
190         struct ctdb_interface *best = NULL;
191
192         for (i=0; vnn->ifaces[i]; i++) {
193
194                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
195                 if (cur == NULL) {
196                         continue;
197                 }
198
199                 if (!cur->link_up) {
200                         continue;
201                 }
202
203                 if (best == NULL) {
204                         best = cur;
205                         continue;
206                 }
207
208                 if (cur->references < best->references) {
209                         best = cur;
210                         continue;
211                 }
212         }
213
214         return best;
215 }
216
217 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
218                                      struct ctdb_vnn *vnn)
219 {
220         struct ctdb_interface *best = NULL;
221
222         if (vnn->iface) {
223                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
224                                    "still assigned to iface '%s'\n",
225                                    ctdb_addr_to_str(&vnn->public_address),
226                                    ctdb_vnn_iface_string(vnn)));
227                 return 0;
228         }
229
230         best = ctdb_vnn_best_iface(ctdb, vnn);
231         if (best == NULL) {
232                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
233                                   "cannot assign to iface any iface\n",
234                                   ctdb_addr_to_str(&vnn->public_address)));
235                 return -1;
236         }
237
238         vnn->iface = best;
239         best->references++;
240         vnn->pnn = ctdb->pnn;
241
242         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
243                            "now assigned to iface '%s' refs[%d]\n",
244                            ctdb_addr_to_str(&vnn->public_address),
245                            ctdb_vnn_iface_string(vnn),
246                            best->references));
247         return 0;
248 }
249
250 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
251                                     struct ctdb_vnn *vnn)
252 {
253         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
254                            "now unassigned (old iface '%s' refs[%d])\n",
255                            ctdb_addr_to_str(&vnn->public_address),
256                            ctdb_vnn_iface_string(vnn),
257                            vnn->iface?vnn->iface->references:0));
258         if (vnn->iface) {
259                 vnn->iface->references--;
260         }
261         vnn->iface = NULL;
262         if (vnn->pnn == ctdb->pnn) {
263                 vnn->pnn = -1;
264         }
265 }
266
267 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
268                                struct ctdb_vnn *vnn)
269 {
270         int i;
271
272         /* Nodes that are not RUNNING can not host IPs */
273         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
274                 return false;
275         }
276
277         if (vnn->delete_pending) {
278                 return false;
279         }
280
281         if (vnn->iface && vnn->iface->link_up) {
282                 return true;
283         }
284
285         for (i=0; vnn->ifaces[i]; i++) {
286                 struct ctdb_interface *cur;
287
288                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
289                 if (cur == NULL) {
290                         continue;
291                 }
292
293                 if (cur->link_up) {
294                         return true;
295                 }
296         }
297
298         return false;
299 }
300
301 struct ctdb_takeover_arp {
302         struct ctdb_context *ctdb;
303         uint32_t count;
304         ctdb_sock_addr addr;
305         struct ctdb_tcp_array *tcparray;
306         struct ctdb_vnn *vnn;
307 };
308
309
310 /*
311   lists of tcp endpoints
312  */
313 struct ctdb_tcp_list {
314         struct ctdb_tcp_list *prev, *next;
315         struct ctdb_connection connection;
316 };
317
318 /*
319   list of clients to kill on IP release
320  */
321 struct ctdb_client_ip {
322         struct ctdb_client_ip *prev, *next;
323         struct ctdb_context *ctdb;
324         ctdb_sock_addr addr;
325         uint32_t client_id;
326 };
327
328
329 /*
330   send a gratuitous arp
331  */
332 static void ctdb_control_send_arp(struct tevent_context *ev,
333                                   struct tevent_timer *te,
334                                   struct timeval t, void *private_data)
335 {
336         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
337                                                         struct ctdb_takeover_arp);
338         int i, ret;
339         struct ctdb_tcp_array *tcparray;
340         const char *iface = ctdb_vnn_iface_string(arp->vnn);
341
342         ret = ctdb_sys_send_arp(&arp->addr, iface);
343         if (ret != 0) {
344                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
345                                   iface, strerror(errno)));
346         }
347
348         tcparray = arp->tcparray;
349         if (tcparray) {
350                 for (i=0;i<tcparray->num;i++) {
351                         struct ctdb_connection *tcon;
352
353                         tcon = &tcparray->connections[i];
354                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
355                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
356                                 ctdb_addr_to_str(&tcon->src),
357                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
358                         ret = ctdb_sys_send_tcp(
359                                 &tcon->src,
360                                 &tcon->dst,
361                                 0, 0, 0);
362                         if (ret != 0) {
363                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
364                                         ctdb_addr_to_str(&tcon->src)));
365                         }
366                 }
367         }
368
369         arp->count++;
370
371         if (arp->count == CTDB_ARP_REPEAT) {
372                 talloc_free(arp);
373                 return;
374         }
375
376         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
377                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
378                          ctdb_control_send_arp, arp);
379 }
380
381 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
382                                        struct ctdb_vnn *vnn)
383 {
384         struct ctdb_takeover_arp *arp;
385         struct ctdb_tcp_array *tcparray;
386
387         if (!vnn->takeover_ctx) {
388                 vnn->takeover_ctx = talloc_new(vnn);
389                 if (!vnn->takeover_ctx) {
390                         return -1;
391                 }
392         }
393
394         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
395         if (!arp) {
396                 return -1;
397         }
398
399         arp->ctdb = ctdb;
400         arp->addr = vnn->public_address;
401         arp->vnn  = vnn;
402
403         tcparray = vnn->tcp_array;
404         if (tcparray) {
405                 /* add all of the known tcp connections for this IP to the
406                    list of tcp connections to send tickle acks for */
407                 arp->tcparray = talloc_steal(arp, tcparray);
408
409                 vnn->tcp_array = NULL;
410                 vnn->tcp_update_needed = true;
411         }
412
413         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
414                          timeval_zero(), ctdb_control_send_arp, arp);
415
416         return 0;
417 }
418
419 struct takeover_callback_state {
420         struct ctdb_req_control_old *c;
421         ctdb_sock_addr *addr;
422         struct ctdb_vnn *vnn;
423 };
424
425 struct ctdb_do_takeip_state {
426         struct ctdb_req_control_old *c;
427         struct ctdb_vnn *vnn;
428 };
429
430 /*
431   called when takeip event finishes
432  */
433 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
434                                     void *private_data)
435 {
436         struct ctdb_do_takeip_state *state =
437                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
438         int32_t ret;
439         TDB_DATA data;
440
441         if (status != 0) {
442                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
443         
444                 if (status == -ETIME) {
445                         ctdb_ban_self(ctdb);
446                 }
447                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
448                                  ctdb_addr_to_str(&state->vnn->public_address),
449                                  ctdb_vnn_iface_string(state->vnn)));
450                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
451
452                 node->flags |= NODE_FLAGS_UNHEALTHY;
453                 talloc_free(state);
454                 return;
455         }
456
457         if (ctdb->do_checkpublicip) {
458
459         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
460         if (ret != 0) {
461                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
462                 talloc_free(state);
463                 return;
464         }
465
466         }
467
468         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
469         data.dsize = strlen((char *)data.dptr) + 1;
470         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
471
472         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
473
474
475         /* the control succeeded */
476         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
477         talloc_free(state);
478         return;
479 }
480
481 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
482 {
483         state->vnn->update_in_flight = false;
484         return 0;
485 }
486
487 /*
488   take over an ip address
489  */
490 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
491                               struct ctdb_req_control_old *c,
492                               struct ctdb_vnn *vnn)
493 {
494         int ret;
495         struct ctdb_do_takeip_state *state;
496
497         if (vnn->update_in_flight) {
498                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
499                                     "update for this IP already in flight\n",
500                                     ctdb_addr_to_str(&vnn->public_address),
501                                     vnn->public_netmask_bits));
502                 return -1;
503         }
504
505         ret = ctdb_vnn_assign_iface(ctdb, vnn);
506         if (ret != 0) {
507                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
508                                  "assign a usable interface\n",
509                                  ctdb_addr_to_str(&vnn->public_address),
510                                  vnn->public_netmask_bits));
511                 return -1;
512         }
513
514         state = talloc(vnn, struct ctdb_do_takeip_state);
515         CTDB_NO_MEMORY(ctdb, state);
516
517         state->c = talloc_steal(ctdb, c);
518         state->vnn   = vnn;
519
520         vnn->update_in_flight = true;
521         talloc_set_destructor(state, ctdb_takeip_destructor);
522
523         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
524                             ctdb_addr_to_str(&vnn->public_address),
525                             vnn->public_netmask_bits,
526                             ctdb_vnn_iface_string(vnn)));
527
528         ret = ctdb_event_script_callback(ctdb,
529                                          state,
530                                          ctdb_do_takeip_callback,
531                                          state,
532                                          CTDB_EVENT_TAKE_IP,
533                                          "%s %s %u",
534                                          ctdb_vnn_iface_string(vnn),
535                                          ctdb_addr_to_str(&vnn->public_address),
536                                          vnn->public_netmask_bits);
537
538         if (ret != 0) {
539                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
540                         ctdb_addr_to_str(&vnn->public_address),
541                         ctdb_vnn_iface_string(vnn)));
542                 talloc_free(state);
543                 return -1;
544         }
545
546         return 0;
547 }
548
549 struct ctdb_do_updateip_state {
550         struct ctdb_req_control_old *c;
551         struct ctdb_interface *old;
552         struct ctdb_vnn *vnn;
553 };
554
555 /*
556   called when updateip event finishes
557  */
558 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
559                                       void *private_data)
560 {
561         struct ctdb_do_updateip_state *state =
562                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
563         int32_t ret;
564
565         if (status != 0) {
566                 if (status == -ETIME) {
567                         ctdb_ban_self(ctdb);
568                 }
569                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
570                         ctdb_addr_to_str(&state->vnn->public_address),
571                         state->old->name,
572                         ctdb_vnn_iface_string(state->vnn)));
573
574                 /*
575                  * All we can do is reset the old interface
576                  * and let the next run fix it
577                  */
578                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
579                 state->vnn->iface = state->old;
580                 state->vnn->iface->references++;
581
582                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
583                 talloc_free(state);
584                 return;
585         }
586
587         if (ctdb->do_checkpublicip) {
588
589         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
590         if (ret != 0) {
591                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
592                 talloc_free(state);
593                 return;
594         }
595
596         }
597
598         /* the control succeeded */
599         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
600         talloc_free(state);
601         return;
602 }
603
604 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
605 {
606         state->vnn->update_in_flight = false;
607         return 0;
608 }
609
610 /*
611   update (move) an ip address
612  */
613 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
614                                 struct ctdb_req_control_old *c,
615                                 struct ctdb_vnn *vnn)
616 {
617         int ret;
618         struct ctdb_do_updateip_state *state;
619         struct ctdb_interface *old = vnn->iface;
620         const char *new_name;
621
622         if (vnn->update_in_flight) {
623                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
624                                     "update for this IP already in flight\n",
625                                     ctdb_addr_to_str(&vnn->public_address),
626                                     vnn->public_netmask_bits));
627                 return -1;
628         }
629
630         ctdb_vnn_unassign_iface(ctdb, vnn);
631         ret = ctdb_vnn_assign_iface(ctdb, vnn);
632         if (ret != 0) {
633                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
634                                  "assin a usable interface (old iface '%s')\n",
635                                  ctdb_addr_to_str(&vnn->public_address),
636                                  vnn->public_netmask_bits,
637                                  old->name));
638                 return -1;
639         }
640
641         new_name = ctdb_vnn_iface_string(vnn);
642         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
643                 /* A benign update from one interface onto itself.
644                  * no need to run the eventscripts in this case, just return
645                  * success.
646                  */
647                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
648                 return 0;
649         }
650
651         state = talloc(vnn, struct ctdb_do_updateip_state);
652         CTDB_NO_MEMORY(ctdb, state);
653
654         state->c = talloc_steal(ctdb, c);
655         state->old = old;
656         state->vnn = vnn;
657
658         vnn->update_in_flight = true;
659         talloc_set_destructor(state, ctdb_updateip_destructor);
660
661         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
662                             "interface %s to %s\n",
663                             ctdb_addr_to_str(&vnn->public_address),
664                             vnn->public_netmask_bits,
665                             old->name,
666                             new_name));
667
668         ret = ctdb_event_script_callback(ctdb,
669                                          state,
670                                          ctdb_do_updateip_callback,
671                                          state,
672                                          CTDB_EVENT_UPDATE_IP,
673                                          "%s %s %s %u",
674                                          state->old->name,
675                                          new_name,
676                                          ctdb_addr_to_str(&vnn->public_address),
677                                          vnn->public_netmask_bits);
678         if (ret != 0) {
679                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
680                                  ctdb_addr_to_str(&vnn->public_address),
681                                  old->name, new_name));
682                 talloc_free(state);
683                 return -1;
684         }
685
686         return 0;
687 }
688
689 /*
690   Find the vnn of the node that has a public ip address
691   returns -1 if the address is not known as a public address
692  */
693 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
694 {
695         struct ctdb_vnn *vnn;
696
697         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
698                 if (ctdb_same_ip(&vnn->public_address, addr)) {
699                         return vnn;
700                 }
701         }
702
703         return NULL;
704 }
705
706 /*
707   take over an ip address
708  */
709 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
710                                  struct ctdb_req_control_old *c,
711                                  TDB_DATA indata,
712                                  bool *async_reply)
713 {
714         int ret;
715         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
716         struct ctdb_vnn *vnn;
717         bool have_ip = false;
718         bool do_updateip = false;
719         bool do_takeip = false;
720         struct ctdb_interface *best_iface = NULL;
721
722         if (pip->pnn != ctdb->pnn) {
723                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
724                                  "with pnn %d, but we're node %d\n",
725                                  ctdb_addr_to_str(&pip->addr),
726                                  pip->pnn, ctdb->pnn));
727                 return -1;
728         }
729
730         /* update out vnn list */
731         vnn = find_public_ip_vnn(ctdb, &pip->addr);
732         if (vnn == NULL) {
733                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
734                         ctdb_addr_to_str(&pip->addr)));
735                 return 0;
736         }
737
738         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
739                 have_ip = ctdb_sys_have_ip(&pip->addr);
740         }
741         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
742         if (best_iface == NULL) {
743                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
744                                  "a usable interface (old %s, have_ip %d)\n",
745                                  ctdb_addr_to_str(&vnn->public_address),
746                                  vnn->public_netmask_bits,
747                                  ctdb_vnn_iface_string(vnn),
748                                  have_ip));
749                 return -1;
750         }
751
752         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
753                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
754                 have_ip = false;
755         }
756
757
758         if (vnn->iface == NULL && have_ip) {
759                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
760                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
761                                  ctdb_addr_to_str(&vnn->public_address)));
762                 return 0;
763         }
764
765         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
766                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
767                                   "and we have it on iface[%s], but it was assigned to node %d"
768                                   "and we are node %d, banning ourself\n",
769                                  ctdb_addr_to_str(&vnn->public_address),
770                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
771                 ctdb_ban_self(ctdb);
772                 return -1;
773         }
774
775         if (vnn->pnn == -1 && have_ip) {
776                 vnn->pnn = ctdb->pnn;
777                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
778                                   "and we already have it on iface[%s], update local daemon\n",
779                                  ctdb_addr_to_str(&vnn->public_address),
780                                   ctdb_vnn_iface_string(vnn)));
781                 return 0;
782         }
783
784         if (vnn->iface) {
785                 if (vnn->iface != best_iface) {
786                         if (!vnn->iface->link_up) {
787                                 do_updateip = true;
788                         } else if (vnn->iface->references > (best_iface->references + 1)) {
789                                 /* only move when the rebalance gains something */
790                                         do_updateip = true;
791                         }
792                 }
793         }
794
795         if (!have_ip) {
796                 if (do_updateip) {
797                         ctdb_vnn_unassign_iface(ctdb, vnn);
798                         do_updateip = false;
799                 }
800                 do_takeip = true;
801         }
802
803         if (do_takeip) {
804                 ret = ctdb_do_takeip(ctdb, c, vnn);
805                 if (ret != 0) {
806                         return -1;
807                 }
808         } else if (do_updateip) {
809                 ret = ctdb_do_updateip(ctdb, c, vnn);
810                 if (ret != 0) {
811                         return -1;
812                 }
813         } else {
814                 /*
815                  * The interface is up and the kernel known the ip
816                  * => do nothing
817                  */
818                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
819                         ctdb_addr_to_str(&pip->addr),
820                         vnn->public_netmask_bits,
821                         ctdb_vnn_iface_string(vnn)));
822                 return 0;
823         }
824
825         /* tell ctdb_control.c that we will be replying asynchronously */
826         *async_reply = true;
827
828         return 0;
829 }
830
831 /*
832   kill any clients that are registered with a IP that is being released
833  */
834 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
835 {
836         struct ctdb_client_ip *ip;
837
838         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
839                 ctdb_addr_to_str(addr)));
840
841         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
842                 ctdb_sock_addr tmp_addr;
843
844                 tmp_addr = ip->addr;
845                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
846                         ip->client_id,
847                         ctdb_addr_to_str(&ip->addr)));
848
849                 if (ctdb_same_ip(&tmp_addr, addr)) {
850                         struct ctdb_client *client = reqid_find(ctdb->idr,
851                                                                 ip->client_id,
852                                                                 struct ctdb_client);
853                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
854                                 ip->client_id,
855                                 ctdb_addr_to_str(&ip->addr),
856                                 client->pid));
857
858                         if (client->pid != 0) {
859                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
860                                         (unsigned)client->pid,
861                                         ctdb_addr_to_str(addr),
862                                         ip->client_id));
863                                 kill(client->pid, SIGKILL);
864                         }
865                 }
866         }
867 }
868
869 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
870 {
871         DLIST_REMOVE(ctdb->vnn, vnn);
872         ctdb_vnn_unassign_iface(ctdb, vnn);
873         ctdb_remove_orphaned_ifaces(ctdb, vnn);
874         talloc_free(vnn);
875 }
876
877 /*
878   called when releaseip event finishes
879  */
880 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
881                                 void *private_data)
882 {
883         struct takeover_callback_state *state = 
884                 talloc_get_type(private_data, struct takeover_callback_state);
885         TDB_DATA data;
886
887         if (status == -ETIME) {
888                 ctdb_ban_self(ctdb);
889         }
890
891         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
892                 if  (ctdb_sys_have_ip(state->addr)) {
893                         DEBUG(DEBUG_ERR,
894                               ("IP %s still hosted during release IP callback, failing\n",
895                                ctdb_addr_to_str(state->addr)));
896                         ctdb_request_control_reply(ctdb, state->c,
897                                                    NULL, -1, NULL);
898                         talloc_free(state);
899                         return;
900                 }
901         }
902
903         /* send a message to all clients of this node telling them
904            that the cluster has been reconfigured and they should
905            release any sockets on this IP */
906         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
907         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
908         data.dsize = strlen((char *)data.dptr)+1;
909
910         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
911
912         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
913
914         /* kill clients that have registered with this IP */
915         release_kill_clients(ctdb, state->addr);
916
917         ctdb_vnn_unassign_iface(ctdb, state->vnn);
918
919         /* Process the IP if it has been marked for deletion */
920         if (state->vnn->delete_pending) {
921                 do_delete_ip(ctdb, state->vnn);
922                 state->vnn = NULL;
923         }
924
925         /* the control succeeded */
926         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
927         talloc_free(state);
928 }
929
930 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
931 {
932         if (state->vnn != NULL) {
933                 state->vnn->update_in_flight = false;
934         }
935         return 0;
936 }
937
938 /*
939   release an ip address
940  */
941 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
942                                 struct ctdb_req_control_old *c,
943                                 TDB_DATA indata, 
944                                 bool *async_reply)
945 {
946         int ret;
947         struct takeover_callback_state *state;
948         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
949         struct ctdb_vnn *vnn;
950         char *iface;
951
952         /* update our vnn list */
953         vnn = find_public_ip_vnn(ctdb, &pip->addr);
954         if (vnn == NULL) {
955                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
956                         ctdb_addr_to_str(&pip->addr)));
957                 return 0;
958         }
959         vnn->pnn = pip->pnn;
960
961         /* stop any previous arps */
962         talloc_free(vnn->takeover_ctx);
963         vnn->takeover_ctx = NULL;
964
965         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
966          * lazy multicast to drop an IP from any node that isn't the
967          * intended new node.  The following causes makes ctdbd ignore
968          * a release for any address it doesn't host.
969          */
970         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
971                 if (!ctdb_sys_have_ip(&pip->addr)) {
972                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
973                                 ctdb_addr_to_str(&pip->addr),
974                                 vnn->public_netmask_bits,
975                                 ctdb_vnn_iface_string(vnn)));
976                         ctdb_vnn_unassign_iface(ctdb, vnn);
977                         return 0;
978                 }
979         } else {
980                 if (vnn->iface == NULL) {
981                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
982                                            ctdb_addr_to_str(&pip->addr),
983                                            vnn->public_netmask_bits));
984                         return 0;
985                 }
986         }
987
988         /* There is a potential race between take_ip and us because we
989          * update the VNN via a callback that run when the
990          * eventscripts have been run.  Avoid the race by allowing one
991          * update to be in flight at a time.
992          */
993         if (vnn->update_in_flight) {
994                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
995                                     "update for this IP already in flight\n",
996                                     ctdb_addr_to_str(&vnn->public_address),
997                                     vnn->public_netmask_bits));
998                 return -1;
999         }
1000
1001         iface = strdup(ctdb_vnn_iface_string(vnn));
1002
1003         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1004                 ctdb_addr_to_str(&pip->addr),
1005                 vnn->public_netmask_bits,
1006                 iface,
1007                 pip->pnn));
1008
1009         state = talloc(ctdb, struct takeover_callback_state);
1010         if (state == NULL) {
1011                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1012                                __FILE__, __LINE__);
1013                 free(iface);
1014                 return -1;
1015         }
1016
1017         state->c = talloc_steal(state, c);
1018         state->addr = talloc(state, ctdb_sock_addr);       
1019         if (state->addr == NULL) {
1020                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021                                __FILE__, __LINE__);
1022                 free(iface);
1023                 talloc_free(state);
1024                 return -1;
1025         }
1026         *state->addr = pip->addr;
1027         state->vnn   = vnn;
1028
1029         vnn->update_in_flight = true;
1030         talloc_set_destructor(state, ctdb_releaseip_destructor);
1031
1032         ret = ctdb_event_script_callback(ctdb, 
1033                                          state, release_ip_callback, state,
1034                                          CTDB_EVENT_RELEASE_IP,
1035                                          "%s %s %u",
1036                                          iface,
1037                                          ctdb_addr_to_str(&pip->addr),
1038                                          vnn->public_netmask_bits);
1039         free(iface);
1040         if (ret != 0) {
1041                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1042                         ctdb_addr_to_str(&pip->addr),
1043                         ctdb_vnn_iface_string(vnn)));
1044                 talloc_free(state);
1045                 return -1;
1046         }
1047
1048         /* tell the control that we will be reply asynchronously */
1049         *async_reply = true;
1050         return 0;
1051 }
1052
1053 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1054                                    ctdb_sock_addr *addr,
1055                                    unsigned mask, const char *ifaces,
1056                                    bool check_address)
1057 {
1058         struct ctdb_vnn      *vnn;
1059         uint32_t num = 0;
1060         char *tmp;
1061         const char *iface;
1062         int i;
1063         int ret;
1064
1065         tmp = strdup(ifaces);
1066         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1067                 if (!ctdb_sys_check_iface_exists(iface)) {
1068                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1069                         free(tmp);
1070                         return -1;
1071                 }
1072         }
1073         free(tmp);
1074
1075         /* Verify that we don't have an entry for this ip yet */
1076         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1077                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1078                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1079                                 ctdb_addr_to_str(addr)));
1080                         return -1;
1081                 }               
1082         }
1083
1084         /* create a new vnn structure for this ip address */
1085         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1086         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1087         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1088         tmp = talloc_strdup(vnn, ifaces);
1089         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1090         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1091                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1092                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1093                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1094                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1095                 num++;
1096         }
1097         talloc_free(tmp);
1098         vnn->ifaces[num] = NULL;
1099         vnn->public_address      = *addr;
1100         vnn->public_netmask_bits = mask;
1101         vnn->pnn                 = -1;
1102         if (check_address) {
1103                 if (ctdb_sys_have_ip(addr)) {
1104                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1105                         vnn->pnn = ctdb->pnn;
1106                 }
1107         }
1108
1109         for (i=0; vnn->ifaces[i]; i++) {
1110                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1111                 if (ret != 0) {
1112                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1113                                            "for public_address[%s]\n",
1114                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1115                         talloc_free(vnn);
1116                         return -1;
1117                 }
1118         }
1119
1120         DLIST_ADD(ctdb->vnn, vnn);
1121
1122         return 0;
1123 }
1124
1125 /*
1126   setup the public address lists from a file
1127 */
1128 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1129 {
1130         char **lines;
1131         int nlines;
1132         int i;
1133
1134         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1135         if (lines == NULL) {
1136                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1137                 return -1;
1138         }
1139         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1140                 nlines--;
1141         }
1142
1143         for (i=0;i<nlines;i++) {
1144                 unsigned mask;
1145                 ctdb_sock_addr addr;
1146                 const char *addrstr;
1147                 const char *ifaces;
1148                 char *tok, *line;
1149
1150                 line = lines[i];
1151                 while ((*line == ' ') || (*line == '\t')) {
1152                         line++;
1153                 }
1154                 if (*line == '#') {
1155                         continue;
1156                 }
1157                 if (strcmp(line, "") == 0) {
1158                         continue;
1159                 }
1160                 tok = strtok(line, " \t");
1161                 addrstr = tok;
1162                 tok = strtok(NULL, " \t");
1163                 if (tok == NULL) {
1164                         if (NULL == ctdb->default_public_interface) {
1165                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1166                                          i+1));
1167                                 talloc_free(lines);
1168                                 return -1;
1169                         }
1170                         ifaces = ctdb->default_public_interface;
1171                 } else {
1172                         ifaces = tok;
1173                 }
1174
1175                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1176                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1177                         talloc_free(lines);
1178                         return -1;
1179                 }
1180                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1181                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1182                         talloc_free(lines);
1183                         return -1;
1184                 }
1185         }
1186
1187
1188         talloc_free(lines);
1189         return 0;
1190 }
1191
1192 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1193                               const char *iface,
1194                               const char *ip)
1195 {
1196         struct ctdb_vnn *svnn;
1197         struct ctdb_interface *cur = NULL;
1198         bool ok;
1199         int ret;
1200
1201         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1202         CTDB_NO_MEMORY(ctdb, svnn);
1203
1204         svnn->ifaces = talloc_array(svnn, const char *, 2);
1205         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1206         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1207         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1208         svnn->ifaces[1] = NULL;
1209
1210         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1211         if (!ok) {
1212                 talloc_free(svnn);
1213                 return -1;
1214         }
1215
1216         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1217         if (ret != 0) {
1218                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1219                                    "for single_ip[%s]\n",
1220                                    svnn->ifaces[0],
1221                                    ctdb_addr_to_str(&svnn->public_address)));
1222                 talloc_free(svnn);
1223                 return -1;
1224         }
1225
1226         /* assume the single public ip interface is initially "good" */
1227         cur = ctdb_find_iface(ctdb, iface);
1228         if (cur == NULL) {
1229                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1230                 return -1;
1231         }
1232         cur->link_up = true;
1233
1234         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1235         if (ret != 0) {
1236                 talloc_free(svnn);
1237                 return -1;
1238         }
1239
1240         ctdb->single_ip_vnn = svnn;
1241         return 0;
1242 }
1243
1244 struct public_ip_list {
1245         struct public_ip_list *next;
1246         uint32_t pnn;
1247         ctdb_sock_addr addr;
1248 };
1249
1250 /* Given a physical node, return the number of
1251    public addresses that is currently assigned to this node.
1252 */
1253 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1254 {
1255         int num=0;
1256
1257         for (;ips;ips=ips->next) {
1258                 if (ips->pnn == pnn) {
1259                         num++;
1260                 }
1261         }
1262         return num;
1263 }
1264
1265
1266 /* Can the given node host the given IP: is the public IP known to the
1267  * node and is NOIPHOST unset?
1268 */
1269 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1270                              struct ctdb_ipflags ipflags,
1271                              struct public_ip_list *ip)
1272 {
1273         struct ctdb_public_ip_list_old *public_ips;
1274         int i;
1275
1276         if (ipflags.noiphost) {
1277                 return false;
1278         }
1279
1280         public_ips = ctdb->ipalloc_state->available_public_ips[pnn];
1281
1282         if (public_ips == NULL) {
1283                 return false;
1284         }
1285
1286         for (i=0; i<public_ips->num; i++) {
1287                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1288                         /* yes, this node can serve this public ip */
1289                         return true;
1290                 }
1291         }
1292
1293         return false;
1294 }
1295
1296 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1297                                  struct ctdb_ipflags ipflags,
1298                                  struct public_ip_list *ip)
1299 {
1300         if (ipflags.noiptakeover) {
1301                 return false;
1302         }
1303
1304         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1305 }
1306
1307 /* search the node lists list for a node to takeover this ip.
1308    pick the node that currently are serving the least number of ips
1309    so that the ips get spread out evenly.
1310 */
1311 static int find_takeover_node(struct ctdb_context *ctdb,
1312                               struct ctdb_ipflags *ipflags,
1313                               struct public_ip_list *ip,
1314                               struct public_ip_list *all_ips)
1315 {
1316         int pnn, min=0, num;
1317         int i, numnodes;
1318
1319         numnodes = talloc_array_length(ipflags);
1320         pnn    = -1;
1321         for (i=0; i<numnodes; i++) {
1322                 /* verify that this node can serve this ip */
1323                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1324                         /* no it couldnt   so skip to the next node */
1325                         continue;
1326                 }
1327
1328                 num = node_ip_coverage(i, all_ips);
1329                 /* was this the first node we checked ? */
1330                 if (pnn == -1) {
1331                         pnn = i;
1332                         min  = num;
1333                 } else {
1334                         if (num < min) {
1335                                 pnn = i;
1336                                 min  = num;
1337                         }
1338                 }
1339         }       
1340         if (pnn == -1) {
1341                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1342                         ctdb_addr_to_str(&ip->addr)));
1343
1344                 return -1;
1345         }
1346
1347         ip->pnn = pnn;
1348         return 0;
1349 }
1350
1351 #define IP_KEYLEN       4
1352 static uint32_t *ip_key(ctdb_sock_addr *ip)
1353 {
1354         static uint32_t key[IP_KEYLEN];
1355
1356         bzero(key, sizeof(key));
1357
1358         switch (ip->sa.sa_family) {
1359         case AF_INET:
1360                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1361                 break;
1362         case AF_INET6: {
1363                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1364                 key[0]  = htonl(s6_a32[0]);
1365                 key[1]  = htonl(s6_a32[1]);
1366                 key[2]  = htonl(s6_a32[2]);
1367                 key[3]  = htonl(s6_a32[3]);
1368                 break;
1369         }
1370         default:
1371                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1372                 return key;
1373         }
1374
1375         return key;
1376 }
1377
1378 static void *add_ip_callback(void *parm, void *data)
1379 {
1380         struct public_ip_list *this_ip = parm;
1381         struct public_ip_list *prev_ip = data;
1382
1383         if (prev_ip == NULL) {
1384                 return parm;
1385         }
1386         if (this_ip->pnn == -1) {
1387                 this_ip->pnn = prev_ip->pnn;
1388         }
1389
1390         return parm;
1391 }
1392
1393 static int getips_count_callback(void *param, void *data)
1394 {
1395         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1396         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1397
1398         new_ip->next = *ip_list;
1399         *ip_list     = new_ip;
1400         return 0;
1401 }
1402
1403 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1404                                        struct ctdb_public_ip_list_old *ips,
1405                                        uint32_t pnn);
1406
1407 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1408                                          struct ipalloc_state *ipalloc_state,
1409                                          struct ctdb_node_map_old *nodemap)
1410 {
1411         int j;
1412         int ret;
1413
1414         if (ipalloc_state->num != nodemap->num) {
1415                 DEBUG(DEBUG_ERR,
1416                       (__location__
1417                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1418                        ipalloc_state->num, nodemap->num));
1419                 return -1;
1420         }
1421
1422         for (j=0; j<nodemap->num; j++) {
1423                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1424                         continue;
1425                 }
1426
1427                 /* Retrieve the list of known public IPs from the node */
1428                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1429                                         TAKEOVER_TIMEOUT(),
1430                                         j,
1431                                         ctdb->nodes,
1432                                         0,
1433                                         &ipalloc_state->known_public_ips[j]);
1434                 if (ret != 0) {
1435                         DEBUG(DEBUG_ERR,
1436                               ("Failed to read known public IPs from node: %u\n",
1437                                j));
1438                         return -1;
1439                 }
1440
1441                 if (ctdb->do_checkpublicip) {
1442                         verify_remote_ip_allocation(ctdb,
1443                                                     ipalloc_state->known_public_ips[j],
1444                                                     j);
1445                 }
1446
1447                 /* Retrieve the list of available public IPs from the node */
1448                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1449                                         TAKEOVER_TIMEOUT(),
1450                                         j,
1451                                         ctdb->nodes,
1452                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1453                                         &ipalloc_state->available_public_ips[j]);
1454                 if (ret != 0) {
1455                         DEBUG(DEBUG_ERR,
1456                               ("Failed to read available public IPs from node: %u\n",
1457                                j));
1458                         return -1;
1459                 }
1460         }
1461
1462         return 0;
1463 }
1464
1465 static struct public_ip_list *
1466 create_merged_ip_list(struct ctdb_context *ctdb)
1467 {
1468         int i, j;
1469         struct public_ip_list *ip_list;
1470         struct ctdb_public_ip_list_old *public_ips;
1471
1472         if (ctdb->ip_tree != NULL) {
1473                 talloc_free(ctdb->ip_tree);
1474                 ctdb->ip_tree = NULL;
1475         }
1476         ctdb->ip_tree = trbt_create(ctdb, 0);
1477
1478         for (i=0;i<ctdb->num_nodes;i++) {
1479                 public_ips = ctdb->ipalloc_state->known_public_ips[i];
1480
1481                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1482                         continue;
1483                 }
1484
1485                 /* there were no public ips for this node */
1486                 if (public_ips == NULL) {
1487                         continue;
1488                 }               
1489
1490                 for (j=0;j<public_ips->num;j++) {
1491                         struct public_ip_list *tmp_ip;
1492
1493                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1494                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1495                         /* Do not use information about IP addresses hosted
1496                          * on other nodes, it may not be accurate */
1497                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1498                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1499                         } else {
1500                                 tmp_ip->pnn = -1;
1501                         }
1502                         tmp_ip->addr = public_ips->ips[j].addr;
1503                         tmp_ip->next = NULL;
1504
1505                         trbt_insertarray32_callback(ctdb->ip_tree,
1506                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1507                                 add_ip_callback,
1508                                 tmp_ip);
1509                 }
1510         }
1511
1512         ip_list = NULL;
1513         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1514
1515         return ip_list;
1516 }
1517
1518 /* 
1519  * This is the length of the longtest common prefix between the IPs.
1520  * It is calculated by XOR-ing the 2 IPs together and counting the
1521  * number of leading zeroes.  The implementation means that all
1522  * addresses end up being 128 bits long.
1523  *
1524  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1525  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1526  * lots of nodes and IP addresses?
1527  */
1528 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1529 {
1530         uint32_t ip1_k[IP_KEYLEN];
1531         uint32_t *t;
1532         int i;
1533         uint32_t x;
1534
1535         uint32_t distance = 0;
1536
1537         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1538         t = ip_key(ip2);
1539         for (i=0; i<IP_KEYLEN; i++) {
1540                 x = ip1_k[i] ^ t[i];
1541                 if (x == 0) {
1542                         distance += 32;
1543                 } else {
1544                         /* Count number of leading zeroes. 
1545                          * FIXME? This could be optimised...
1546                          */
1547                         while ((x & (1 << 31)) == 0) {
1548                                 x <<= 1;
1549                                 distance += 1;
1550                         }
1551                 }
1552         }
1553
1554         return distance;
1555 }
1556
1557 /* Calculate the IP distance for the given IP relative to IPs on the
1558    given node.  The ips argument is generally the all_ips variable
1559    used in the main part of the algorithm.
1560  */
1561 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1562                                   struct public_ip_list *ips,
1563                                   int pnn)
1564 {
1565         struct public_ip_list *t;
1566         uint32_t d;
1567
1568         uint32_t sum = 0;
1569
1570         for (t=ips; t != NULL; t=t->next) {
1571                 if (t->pnn != pnn) {
1572                         continue;
1573                 }
1574
1575                 /* Optimisation: We never calculate the distance
1576                  * between an address and itself.  This allows us to
1577                  * calculate the effect of removing an address from a
1578                  * node by simply calculating the distance between
1579                  * that address and all of the exitsing addresses.
1580                  * Moreover, we assume that we're only ever dealing
1581                  * with addresses from all_ips so we can identify an
1582                  * address via a pointer rather than doing a more
1583                  * expensive address comparison. */
1584                 if (&(t->addr) == ip) {
1585                         continue;
1586                 }
1587
1588                 d = ip_distance(ip, &(t->addr));
1589                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1590         }
1591
1592         return sum;
1593 }
1594
1595 /* Return the LCP2 imbalance metric for addresses currently assigned
1596    to the given node.
1597  */
1598 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1599 {
1600         struct public_ip_list *t;
1601
1602         uint32_t imbalance = 0;
1603
1604         for (t=all_ips; t!=NULL; t=t->next) {
1605                 if (t->pnn != pnn) {
1606                         continue;
1607                 }
1608                 /* Pass the rest of the IPs rather than the whole
1609                    all_ips input list.
1610                 */
1611                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1612         }
1613
1614         return imbalance;
1615 }
1616
1617 /* Allocate any unassigned IPs just by looping through the IPs and
1618  * finding the best node for each.
1619  */
1620 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1621                                       struct ctdb_ipflags *ipflags,
1622                                       struct public_ip_list *all_ips)
1623 {
1624         struct public_ip_list *tmp_ip;
1625
1626         /* loop over all ip's and find a physical node to cover for 
1627            each unassigned ip.
1628         */
1629         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1630                 if (tmp_ip->pnn == -1) {
1631                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1632                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1633                                         ctdb_addr_to_str(&tmp_ip->addr)));
1634                         }
1635                 }
1636         }
1637 }
1638
1639 /* Basic non-deterministic rebalancing algorithm.
1640  */
1641 static void basic_failback(struct ctdb_context *ctdb,
1642                            struct ctdb_ipflags *ipflags,
1643                            struct public_ip_list *all_ips,
1644                            int num_ips)
1645 {
1646         int i, numnodes;
1647         int maxnode, maxnum, minnode, minnum, num, retries;
1648         struct public_ip_list *tmp_ip;
1649
1650         numnodes = talloc_array_length(ipflags);
1651         retries = 0;
1652
1653 try_again:
1654         maxnum=0;
1655         minnum=0;
1656
1657         /* for each ip address, loop over all nodes that can serve
1658            this ip and make sure that the difference between the node
1659            serving the most and the node serving the least ip's are
1660            not greater than 1.
1661         */
1662         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1663                 if (tmp_ip->pnn == -1) {
1664                         continue;
1665                 }
1666
1667                 /* Get the highest and lowest number of ips's served by any 
1668                    valid node which can serve this ip.
1669                 */
1670                 maxnode = -1;
1671                 minnode = -1;
1672                 for (i=0; i<numnodes; i++) {
1673                         /* only check nodes that can actually serve this ip */
1674                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1675                                 /* no it couldnt   so skip to the next node */
1676                                 continue;
1677                         }
1678
1679                         num = node_ip_coverage(i, all_ips);
1680                         if (maxnode == -1) {
1681                                 maxnode = i;
1682                                 maxnum  = num;
1683                         } else {
1684                                 if (num > maxnum) {
1685                                         maxnode = i;
1686                                         maxnum  = num;
1687                                 }
1688                         }
1689                         if (minnode == -1) {
1690                                 minnode = i;
1691                                 minnum  = num;
1692                         } else {
1693                                 if (num < minnum) {
1694                                         minnode = i;
1695                                         minnum  = num;
1696                                 }
1697                         }
1698                 }
1699                 if (maxnode == -1) {
1700                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1701                                 ctdb_addr_to_str(&tmp_ip->addr)));
1702
1703                         continue;
1704                 }
1705
1706                 /* if the spread between the smallest and largest coverage by
1707                    a node is >=2 we steal one of the ips from the node with
1708                    most coverage to even things out a bit.
1709                    try to do this a limited number of times since we dont
1710                    want to spend too much time balancing the ip coverage.
1711                 */
1712                 if ( (maxnum > minnum+1)
1713                      && (retries < (num_ips + 5)) ){
1714                         struct public_ip_list *tmp;
1715
1716                         /* Reassign one of maxnode's VNNs */
1717                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1718                                 if (tmp->pnn == maxnode) {
1719                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1720                                         retries++;
1721                                         goto try_again;;
1722                                 }
1723                         }
1724                 }
1725         }
1726 }
1727
1728 static void lcp2_init(struct ctdb_context *tmp_ctx,
1729                       struct ctdb_ipflags *ipflags,
1730                       struct public_ip_list *all_ips,
1731                       uint32_t *force_rebalance_nodes,
1732                       uint32_t **lcp2_imbalances,
1733                       bool **rebalance_candidates)
1734 {
1735         int i, numnodes;
1736         struct public_ip_list *tmp_ip;
1737
1738         numnodes = talloc_array_length(ipflags);
1739
1740         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1741         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1742         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1743         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1744
1745         for (i=0; i<numnodes; i++) {
1746                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1747                 /* First step: assume all nodes are candidates */
1748                 (*rebalance_candidates)[i] = true;
1749         }
1750
1751         /* 2nd step: if a node has IPs assigned then it must have been
1752          * healthy before, so we remove it from consideration.  This
1753          * is overkill but is all we have because we don't maintain
1754          * state between takeover runs.  An alternative would be to
1755          * keep state and invalidate it every time the recovery master
1756          * changes.
1757          */
1758         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1759                 if (tmp_ip->pnn != -1) {
1760                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1761                 }
1762         }
1763
1764         /* 3rd step: if a node is forced to re-balance then
1765            we allow failback onto the node */
1766         if (force_rebalance_nodes == NULL) {
1767                 return;
1768         }
1769         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1770                 uint32_t pnn = force_rebalance_nodes[i];
1771                 if (pnn >= numnodes) {
1772                         DEBUG(DEBUG_ERR,
1773                               (__location__ "unknown node %u\n", pnn));
1774                         continue;
1775                 }
1776
1777                 DEBUG(DEBUG_NOTICE,
1778                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1779                 (*rebalance_candidates)[pnn] = true;
1780         }
1781 }
1782
1783 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1784  * the IP/node combination that will cost the least.
1785  */
1786 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1787                                      struct ctdb_ipflags *ipflags,
1788                                      struct public_ip_list *all_ips,
1789                                      uint32_t *lcp2_imbalances)
1790 {
1791         struct public_ip_list *tmp_ip;
1792         int dstnode, numnodes;
1793
1794         int minnode;
1795         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1796         struct public_ip_list *minip;
1797
1798         bool should_loop = true;
1799         bool have_unassigned = true;
1800
1801         numnodes = talloc_array_length(ipflags);
1802
1803         while (have_unassigned && should_loop) {
1804                 should_loop = false;
1805
1806                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1807                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1808
1809                 minnode = -1;
1810                 mindsum = 0;
1811                 minip = NULL;
1812
1813                 /* loop over each unassigned ip. */
1814                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1815                         if (tmp_ip->pnn != -1) {
1816                                 continue;
1817                         }
1818
1819                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1820                                 /* only check nodes that can actually takeover this ip */
1821                                 if (!can_node_takeover_ip(ctdb, dstnode,
1822                                                           ipflags[dstnode],
1823                                                           tmp_ip)) {
1824                                         /* no it couldnt   so skip to the next node */
1825                                         continue;
1826                                 }
1827
1828                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1829                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1830                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1831                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1832                                                    dstnode,
1833                                                    dstimbl - lcp2_imbalances[dstnode]));
1834
1835
1836                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1837                                         minnode = dstnode;
1838                                         minimbl = dstimbl;
1839                                         mindsum = dstdsum;
1840                                         minip = tmp_ip;
1841                                         should_loop = true;
1842                                 }
1843                         }
1844                 }
1845
1846                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1847
1848                 /* If we found one then assign it to the given node. */
1849                 if (minnode != -1) {
1850                         minip->pnn = minnode;
1851                         lcp2_imbalances[minnode] = minimbl;
1852                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1853                                           ctdb_addr_to_str(&(minip->addr)),
1854                                           minnode,
1855                                           mindsum));
1856                 }
1857
1858                 /* There might be a better way but at least this is clear. */
1859                 have_unassigned = false;
1860                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1861                         if (tmp_ip->pnn == -1) {
1862                                 have_unassigned = true;
1863                         }
1864                 }
1865         }
1866
1867         /* We know if we have an unassigned addresses so we might as
1868          * well optimise.
1869          */
1870         if (have_unassigned) {
1871                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1872                         if (tmp_ip->pnn == -1) {
1873                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1874                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1875                         }
1876                 }
1877         }
1878 }
1879
1880 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1881  * to move IPs from, determines the best IP/destination node
1882  * combination to move from the source node.
1883  */
1884 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1885                                     struct ctdb_ipflags *ipflags,
1886                                     struct public_ip_list *all_ips,
1887                                     int srcnode,
1888                                     uint32_t *lcp2_imbalances,
1889                                     bool *rebalance_candidates)
1890 {
1891         int dstnode, mindstnode, numnodes;
1892         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1893         uint32_t minsrcimbl, mindstimbl;
1894         struct public_ip_list *minip;
1895         struct public_ip_list *tmp_ip;
1896
1897         /* Find an IP and destination node that best reduces imbalance. */
1898         srcimbl = 0;
1899         minip = NULL;
1900         minsrcimbl = 0;
1901         mindstnode = -1;
1902         mindstimbl = 0;
1903
1904         numnodes = talloc_array_length(ipflags);
1905
1906         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1907         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1908                            srcnode, lcp2_imbalances[srcnode]));
1909
1910         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1911                 /* Only consider addresses on srcnode. */
1912                 if (tmp_ip->pnn != srcnode) {
1913                         continue;
1914                 }
1915
1916                 /* What is this IP address costing the source node? */
1917                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1918                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1919
1920                 /* Consider this IP address would cost each potential
1921                  * destination node.  Destination nodes are limited to
1922                  * those that are newly healthy, since we don't want
1923                  * to do gratuitous failover of IPs just to make minor
1924                  * balance improvements.
1925                  */
1926                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1927                         if (!rebalance_candidates[dstnode]) {
1928                                 continue;
1929                         }
1930
1931                         /* only check nodes that can actually takeover this ip */
1932                         if (!can_node_takeover_ip(ctdb, dstnode,
1933                                                   ipflags[dstnode], tmp_ip)) {
1934                                 /* no it couldnt   so skip to the next node */
1935                                 continue;
1936                         }
1937
1938                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1939                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1940                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1941                                            srcnode, -srcdsum,
1942                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1943                                            dstnode, dstdsum));
1944
1945                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1946                             (dstdsum < srcdsum) &&                      \
1947                             ((mindstnode == -1) ||                              \
1948                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1949
1950                                 minip = tmp_ip;
1951                                 minsrcimbl = srcimbl;
1952                                 mindstnode = dstnode;
1953                                 mindstimbl = dstimbl;
1954                         }
1955                 }
1956         }
1957         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1958
1959         if (mindstnode != -1) {
1960                 /* We found a move that makes things better... */
1961                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1962                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1963                                   ctdb_addr_to_str(&(minip->addr)),
1964                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1965
1966
1967                 lcp2_imbalances[srcnode] = minsrcimbl;
1968                 lcp2_imbalances[mindstnode] = mindstimbl;
1969                 minip->pnn = mindstnode;
1970
1971                 return true;
1972         }
1973
1974         return false;
1975         
1976 }
1977
1978 struct lcp2_imbalance_pnn {
1979         uint32_t imbalance;
1980         int pnn;
1981 };
1982
1983 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1984 {
1985         const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1986         const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1987
1988         if (lipa->imbalance > lipb->imbalance) {
1989                 return -1;
1990         } else if (lipa->imbalance == lipb->imbalance) {
1991                 return 0;
1992         } else {
1993                 return 1;
1994         }
1995 }
1996
1997 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1998  * node with the highest LCP2 imbalance, and then determines the best
1999  * IP/destination node combination to move from the source node.
2000  */
2001 static void lcp2_failback(struct ctdb_context *ctdb,
2002                           struct ctdb_ipflags *ipflags,
2003                           struct public_ip_list *all_ips,
2004                           uint32_t *lcp2_imbalances,
2005                           bool *rebalance_candidates)
2006 {
2007         int i, numnodes;
2008         struct lcp2_imbalance_pnn * lips;
2009         bool again;
2010
2011         numnodes = talloc_array_length(ipflags);
2012
2013 try_again:
2014         /* Put the imbalances and nodes into an array, sort them and
2015          * iterate through candidates.  Usually the 1st one will be
2016          * used, so this doesn't cost much...
2017          */
2018         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2019         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2020         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2021         for (i=0; i<numnodes; i++) {
2022                 lips[i].imbalance = lcp2_imbalances[i];
2023                 lips[i].pnn = i;
2024                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2025         }
2026         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2027               lcp2_cmp_imbalance_pnn);
2028
2029         again = false;
2030         for (i=0; i<numnodes; i++) {
2031                 /* This means that all nodes had 0 or 1 addresses, so
2032                  * can't be imbalanced.
2033                  */
2034                 if (lips[i].imbalance == 0) {
2035                         break;
2036                 }
2037
2038                 if (lcp2_failback_candidate(ctdb,
2039                                             ipflags,
2040                                             all_ips,
2041                                             lips[i].pnn,
2042                                             lcp2_imbalances,
2043                                             rebalance_candidates)) {
2044                         again = true;
2045                         break;
2046                 }
2047         }
2048
2049         talloc_free(lips);
2050         if (again) {
2051                 goto try_again;
2052         }
2053 }
2054
2055 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2056                                     struct ctdb_ipflags *ipflags,
2057                                     struct public_ip_list *all_ips)
2058 {
2059         struct public_ip_list *tmp_ip;
2060
2061         /* verify that the assigned nodes can serve that public ip
2062            and set it to -1 if not
2063         */
2064         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2065                 if (tmp_ip->pnn == -1) {
2066                         continue;
2067                 }
2068                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2069                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2070                         /* this node can not serve this ip. */
2071                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2072                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2073                                            tmp_ip->pnn));
2074                         tmp_ip->pnn = -1;
2075                 }
2076         }
2077 }
2078
2079 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2080                                        struct ctdb_ipflags *ipflags,
2081                                        struct public_ip_list *all_ips)
2082 {
2083         struct public_ip_list *tmp_ip;
2084         int i, numnodes;
2085
2086         numnodes = talloc_array_length(ipflags);
2087
2088         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2089        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2090         *  always be allocated the same way for a specific set of
2091         *  available/unavailable nodes.
2092         */
2093
2094         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2095                 tmp_ip->pnn = i % numnodes;
2096         }
2097
2098         /* IP failback doesn't make sense with deterministic
2099          * IPs, since the modulo step above implicitly fails
2100          * back IPs to their "home" node.
2101          */
2102         if (1 == ctdb->tunable.no_ip_failback) {
2103                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2104         }
2105
2106         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2107
2108         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2109
2110         /* No failback here! */
2111 }
2112
2113 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2114                                           struct ctdb_ipflags *ipflags,
2115                                           struct public_ip_list *all_ips)
2116 {
2117         /* This should be pushed down into basic_failback. */
2118         struct public_ip_list *tmp_ip;
2119         int num_ips = 0;
2120         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2121                 num_ips++;
2122         }
2123
2124         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2125
2126         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2127
2128         /* If we don't want IPs to fail back then don't rebalance IPs. */
2129         if (1 == ctdb->tunable.no_ip_failback) {
2130                 return;
2131         }
2132
2133         /* Now, try to make sure the ip adresses are evenly distributed
2134            across the nodes.
2135         */
2136         basic_failback(ctdb, ipflags, all_ips, num_ips);
2137 }
2138
2139 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2140                           struct ctdb_ipflags *ipflags,
2141                           struct public_ip_list *all_ips,
2142                           uint32_t *force_rebalance_nodes)
2143 {
2144         uint32_t *lcp2_imbalances;
2145         bool *rebalance_candidates;
2146         int numnodes, num_rebalance_candidates, i;
2147
2148         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2149
2150         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2151
2152         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2153                   &lcp2_imbalances, &rebalance_candidates);
2154
2155         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2156
2157         /* If we don't want IPs to fail back then don't rebalance IPs. */
2158         if (1 == ctdb->tunable.no_ip_failback) {
2159                 goto finished;
2160         }
2161
2162         /* It is only worth continuing if we have suitable target
2163          * nodes to transfer IPs to.  This check is much cheaper than
2164          * continuing on...
2165          */
2166         numnodes = talloc_array_length(ipflags);
2167         num_rebalance_candidates = 0;
2168         for (i=0; i<numnodes; i++) {
2169                 if (rebalance_candidates[i]) {
2170                         num_rebalance_candidates++;
2171                 }
2172         }
2173         if (num_rebalance_candidates == 0) {
2174                 goto finished;
2175         }
2176
2177         /* Now, try to make sure the ip adresses are evenly distributed
2178            across the nodes.
2179         */
2180         lcp2_failback(ctdb, ipflags, all_ips,
2181                       lcp2_imbalances, rebalance_candidates);
2182
2183 finished:
2184         talloc_free(tmp_ctx);
2185 }
2186
2187 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2188 {
2189         int i;
2190
2191         for (i=0;i<nodemap->num;i++) {
2192                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2193                         /* Found one completely healthy node */
2194                         return false;
2195                 }
2196         }
2197
2198         return true;
2199 }
2200
2201 /* The calculation part of the IP allocation algorithm. */
2202 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2203                                    struct ctdb_ipflags *ipflags,
2204                                    struct public_ip_list **all_ips_p,
2205                                    uint32_t *force_rebalance_nodes)
2206 {
2207         /* since nodes only know about those public addresses that
2208            can be served by that particular node, no single node has
2209            a full list of all public addresses that exist in the cluster.
2210            Walk over all node structures and create a merged list of
2211            all public addresses that exist in the cluster.
2212
2213            keep the tree of ips around as ctdb->ip_tree
2214         */
2215         *all_ips_p = create_merged_ip_list(ctdb);
2216
2217         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2218                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2219         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2220                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2221         } else {
2222                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2223         }
2224
2225         /* at this point ->pnn is the node which will own each IP
2226            or -1 if there is no node that can cover this ip
2227         */
2228
2229         return;
2230 }
2231
2232 struct get_tunable_callback_data {
2233         const char *tunable;
2234         uint32_t *out;
2235         bool fatal;
2236 };
2237
2238 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2239                                  int32_t res, TDB_DATA outdata,
2240                                  void *callback)
2241 {
2242         struct get_tunable_callback_data *cd =
2243                 (struct get_tunable_callback_data *)callback;
2244         int size;
2245
2246         if (res != 0) {
2247                 /* Already handled in fail callback */
2248                 return;
2249         }
2250
2251         if (outdata.dsize != sizeof(uint32_t)) {
2252                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2253                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2254                                  (int)outdata.dsize));
2255                 cd->fatal = true;
2256                 return;
2257         }
2258
2259         size = talloc_array_length(cd->out);
2260         if (pnn >= size) {
2261                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2262                                  cd->tunable, pnn, size));
2263                 return;
2264         }
2265
2266                 
2267         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2268 }
2269
2270 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2271                                        int32_t res, TDB_DATA outdata,
2272                                        void *callback)
2273 {
2274         struct get_tunable_callback_data *cd =
2275                 (struct get_tunable_callback_data *)callback;
2276
2277         switch (res) {
2278         case -ETIME:
2279                 DEBUG(DEBUG_ERR,
2280                       ("Timed out getting tunable \"%s\" from node %d\n",
2281                        cd->tunable, pnn));
2282                 cd->fatal = true;
2283                 break;
2284         case -EINVAL:
2285         case -1:
2286                 DEBUG(DEBUG_WARNING,
2287                       ("Tunable \"%s\" not implemented on node %d\n",
2288                        cd->tunable, pnn));
2289                 break;
2290         default:
2291                 DEBUG(DEBUG_ERR,
2292                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2293                        cd->tunable, pnn));
2294                 cd->fatal = true;
2295         }
2296 }
2297
2298 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2299                                         TALLOC_CTX *tmp_ctx,
2300                                         struct ctdb_node_map_old *nodemap,
2301                                         const char *tunable,
2302                                         uint32_t default_value)
2303 {
2304         TDB_DATA data;
2305         struct ctdb_control_get_tunable *t;
2306         uint32_t *nodes;
2307         uint32_t *tvals;
2308         struct get_tunable_callback_data callback_data;
2309         int i;
2310
2311         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2312         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2313         for (i=0; i<nodemap->num; i++) {
2314                 tvals[i] = default_value;
2315         }
2316                 
2317         callback_data.out = tvals;
2318         callback_data.tunable = tunable;
2319         callback_data.fatal = false;
2320
2321         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2322         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2323         t = (struct ctdb_control_get_tunable *)data.dptr;
2324         t->length = strlen(tunable)+1;
2325         memcpy(t->name, tunable, t->length);
2326         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2327         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2328                                       nodes, 0, TAKEOVER_TIMEOUT(),
2329                                       false, data,
2330                                       get_tunable_callback,
2331                                       get_tunable_fail_callback,
2332                                       &callback_data) != 0) {
2333                 if (callback_data.fatal) {
2334                         talloc_free(tvals);
2335                         tvals = NULL;
2336                 }
2337         }
2338         talloc_free(nodes);
2339         talloc_free(data.dptr);
2340
2341         return tvals;
2342 }
2343
2344 /* Set internal flags for IP allocation:
2345  *   Clear ip flags
2346  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2347  *   Set NOIPHOST ip flag for each INACTIVE node
2348  *   if all nodes are disabled:
2349  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2350  *   else
2351  *     Set NOIPHOST ip flags for disabled nodes
2352  */
2353 static struct ctdb_ipflags *
2354 set_ipflags_internal(struct ctdb_context *ctdb,
2355                      TALLOC_CTX *tmp_ctx,
2356                      struct ctdb_node_map_old *nodemap,
2357                      uint32_t *tval_noiptakeover,
2358                      uint32_t *tval_noiphostonalldisabled)
2359 {
2360         int i;
2361         struct ctdb_ipflags *ipflags;
2362
2363         /* Clear IP flags - implicit due to talloc_zero */
2364         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2365         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2366
2367         for (i=0;i<nodemap->num;i++) {
2368                 /* Can not take IPs on node with NoIPTakeover set */
2369                 if (tval_noiptakeover[i] != 0) {
2370                         ipflags[i].noiptakeover = true;
2371                 }
2372
2373                 /* Can not host IPs on INACTIVE node */
2374                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2375                         ipflags[i].noiphost = true;
2376                 }
2377         }
2378
2379         if (all_nodes_are_disabled(nodemap)) {
2380                 /* If all nodes are disabled, can not host IPs on node
2381                  * with NoIPHostOnAllDisabled set
2382                  */
2383                 for (i=0;i<nodemap->num;i++) {
2384                         if (tval_noiphostonalldisabled[i] != 0) {
2385                                 ipflags[i].noiphost = true;
2386                         }
2387                 }
2388         } else {
2389                 /* If some nodes are not disabled, then can not host
2390                  * IPs on DISABLED node
2391                  */
2392                 for (i=0;i<nodemap->num;i++) {
2393                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2394                                 ipflags[i].noiphost = true;
2395                         }
2396                 }
2397         }
2398
2399         return ipflags;
2400 }
2401
2402 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2403                                         TALLOC_CTX *tmp_ctx,
2404                                         struct ctdb_node_map_old *nodemap)
2405 {
2406         uint32_t *tval_noiptakeover;
2407         uint32_t *tval_noiphostonalldisabled;
2408         struct ctdb_ipflags *ipflags;
2409
2410
2411         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2412                                                    "NoIPTakeover", 0);
2413         if (tval_noiptakeover == NULL) {
2414                 return NULL;
2415         }
2416
2417         tval_noiphostonalldisabled =
2418                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2419                                        "NoIPHostOnAllDisabled", 0);
2420         if (tval_noiphostonalldisabled == NULL) {
2421                 /* Caller frees tmp_ctx */
2422                 return NULL;
2423         }
2424
2425         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2426                                        tval_noiptakeover,
2427                                        tval_noiphostonalldisabled);
2428
2429         talloc_free(tval_noiptakeover);
2430         talloc_free(tval_noiphostonalldisabled);
2431
2432         return ipflags;
2433 }
2434
2435 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2436                                                  TALLOC_CTX *mem_ctx)
2437 {
2438         struct ipalloc_state *ipalloc_state =
2439                 talloc_zero(mem_ctx, struct ipalloc_state);
2440         if (ipalloc_state == NULL) {
2441                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2442                 return NULL;
2443         }
2444
2445         ipalloc_state->num = ctdb->num_nodes;
2446         ipalloc_state->known_public_ips =
2447                 talloc_zero_array(ipalloc_state,
2448                                   struct ctdb_public_ip_list_old *,
2449                                   ipalloc_state->num);
2450         if (ipalloc_state->known_public_ips == NULL) {
2451                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2452                 talloc_free(ipalloc_state);
2453                 return NULL;
2454         }
2455         ipalloc_state->available_public_ips =
2456                 talloc_zero_array(ipalloc_state,
2457                                   struct ctdb_public_ip_list_old *,
2458                                   ipalloc_state->num);
2459         if (ipalloc_state->available_public_ips == NULL) {
2460                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2461                 talloc_free(ipalloc_state);
2462                 return NULL;
2463         }
2464
2465         return ipalloc_state;
2466 }
2467
2468 struct iprealloc_callback_data {
2469         bool *retry_nodes;
2470         int retry_count;
2471         client_async_callback fail_callback;
2472         void *fail_callback_data;
2473         struct ctdb_node_map_old *nodemap;
2474 };
2475
2476 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2477                                         int32_t res, TDB_DATA outdata,
2478                                         void *callback)
2479 {
2480         int numnodes;
2481         struct iprealloc_callback_data *cd =
2482                 (struct iprealloc_callback_data *)callback;
2483
2484         numnodes = talloc_array_length(cd->retry_nodes);
2485         if (pnn > numnodes) {
2486                 DEBUG(DEBUG_ERR,
2487                       ("ipreallocated failure from node %d, "
2488                        "but only %d nodes in nodemap\n",
2489                        pnn, numnodes));
2490                 return;
2491         }
2492
2493         /* Can't run the "ipreallocated" event on a INACTIVE node */
2494         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2495                 DEBUG(DEBUG_WARNING,
2496                       ("ipreallocated failed on inactive node %d, ignoring\n",
2497                        pnn));
2498                 return;
2499         }
2500
2501         switch (res) {
2502         case -ETIME:
2503                 /* If the control timed out then that's a real error,
2504                  * so call the real fail callback
2505                  */
2506                 if (cd->fail_callback) {
2507                         cd->fail_callback(ctdb, pnn, res, outdata,
2508                                           cd->fail_callback_data);
2509                 } else {
2510                         DEBUG(DEBUG_WARNING,
2511                               ("iprealloc timed out but no callback registered\n"));
2512                 }
2513                 break;
2514         default:
2515                 /* If not a timeout then either the ipreallocated
2516                  * eventscript (or some setup) failed.  This might
2517                  * have failed because the IPREALLOCATED control isn't
2518                  * implemented - right now there is no way of knowing
2519                  * because the error codes are all folded down to -1.
2520                  * Consider retrying using EVENTSCRIPT control...
2521                  */
2522                 DEBUG(DEBUG_WARNING,
2523                       ("ipreallocated failure from node %d, flagging retry\n",
2524                        pnn));
2525                 cd->retry_nodes[pnn] = true;
2526                 cd->retry_count++;
2527         }
2528 }
2529
2530 struct takeover_callback_data {
2531         bool *node_failed;
2532         client_async_callback fail_callback;
2533         void *fail_callback_data;
2534         struct ctdb_node_map_old *nodemap;
2535 };
2536
2537 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2538                                        uint32_t node_pnn, int32_t res,
2539                                        TDB_DATA outdata, void *callback_data)
2540 {
2541         struct takeover_callback_data *cd =
2542                 talloc_get_type_abort(callback_data,
2543                                       struct takeover_callback_data);
2544         int i;
2545
2546         for (i = 0; i < cd->nodemap->num; i++) {
2547                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2548                         break;
2549                 }
2550         }
2551
2552         if (i == cd->nodemap->num) {
2553                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2554                 return;
2555         }
2556
2557         if (!cd->node_failed[i]) {
2558                 cd->node_failed[i] = true;
2559                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2560                                   cd->fail_callback_data);
2561         }
2562 }
2563
2564 /*
2565   make any IP alias changes for public addresses that are necessary 
2566  */
2567 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2568                       uint32_t *force_rebalance_nodes,
2569                       client_async_callback fail_callback, void *callback_data)
2570 {
2571         int i, j, ret;
2572         struct ctdb_public_ip ip;
2573         uint32_t *nodes;
2574         struct public_ip_list *all_ips, *tmp_ip;
2575         TDB_DATA data;
2576         struct timeval timeout;
2577         struct client_async_data *async_data;
2578         struct ctdb_client_control_state *state;
2579         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2580         struct ctdb_ipflags *ipflags;
2581         struct ipalloc_state *ipalloc_state;
2582         struct takeover_callback_data *takeover_data;
2583         struct iprealloc_callback_data iprealloc_data;
2584         bool *retry_data;
2585         bool can_host_ips;
2586
2587         /*
2588          * ip failover is completely disabled, just send out the 
2589          * ipreallocated event.
2590          */
2591         if (ctdb->tunable.disable_ip_failover != 0) {
2592                 goto ipreallocated;
2593         }
2594
2595         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2596         if (ipalloc_state == NULL) {
2597                 talloc_free(tmp_ctx);
2598                 return -1;
2599         }
2600         ctdb->ipalloc_state = ipalloc_state;
2601
2602         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2603         if (ipflags == NULL) {
2604                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2605                 talloc_free(tmp_ctx);
2606                 return -1;
2607         }
2608
2609         /* Fetch known/available public IPs from each active node */
2610         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2611         if (ret != 0) {
2612                 talloc_free(tmp_ctx);
2613                 return -1;
2614         }
2615
2616         /* Short-circuit IP allocation if no node has available IPs */
2617         can_host_ips = false;
2618         for (i=0; i < ipalloc_state->num; i++) {
2619                 if (ipalloc_state->available_public_ips[i] != NULL) {
2620                         can_host_ips = true;
2621                 }
2622         }
2623         if (!can_host_ips) {
2624                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2625                 return 0;
2626         }
2627
2628         /* Do the IP reassignment calculations */
2629         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2630
2631         /* Now tell all nodes to release any public IPs should not
2632          * host.  This will be a NOOP on nodes that don't currently
2633          * hold the given IP.
2634          */
2635         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2636         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2637
2638         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2639                                                        bool, nodemap->num);
2640         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2641         takeover_data->fail_callback = fail_callback;
2642         takeover_data->fail_callback_data = callback_data;
2643         takeover_data->nodemap = nodemap;
2644
2645         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2646         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2647
2648         async_data->fail_callback = takeover_run_fail_callback;
2649         async_data->callback_data = takeover_data;
2650
2651         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2652
2653         /* Send a RELEASE_IP to all nodes that should not be hosting
2654          * each IP.  For each IP, all but one of these will be
2655          * redundant.  However, the redundant ones are used to tell
2656          * nodes which node should be hosting the IP so that commands
2657          * like "ctdb ip" can display a particular nodes idea of who
2658          * is hosting what. */
2659         for (i=0;i<nodemap->num;i++) {
2660                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2661                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2662                         continue;
2663                 }
2664
2665                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2666                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2667                                 /* This node should be serving this
2668                                    vnn so don't tell it to release the ip
2669                                 */
2670                                 continue;
2671                         }
2672                         ip.pnn  = tmp_ip->pnn;
2673                         ip.addr = tmp_ip->addr;
2674
2675                         timeout = TAKEOVER_TIMEOUT();
2676                         data.dsize = sizeof(ip);
2677                         data.dptr  = (uint8_t *)&ip;
2678                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2679                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2680                                                   data, async_data,
2681                                                   &timeout, NULL);
2682                         if (state == NULL) {
2683                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2684                                 talloc_free(tmp_ctx);
2685                                 return -1;
2686                         }
2687
2688                         ctdb_client_async_add(async_data, state);
2689                 }
2690         }
2691         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2692                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2693                 talloc_free(tmp_ctx);
2694                 return -1;
2695         }
2696         talloc_free(async_data);
2697
2698
2699         /* For each IP, send a TAKOVER_IP to the node that should be
2700          * hosting it.  Many of these will often be redundant (since
2701          * the allocation won't have changed) but they can be useful
2702          * to recover from inconsistencies. */
2703         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2704         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2705
2706         async_data->fail_callback = fail_callback;
2707         async_data->callback_data = callback_data;
2708
2709         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2710                 if (tmp_ip->pnn == -1) {
2711                         /* this IP won't be taken over */
2712                         continue;
2713                 }
2714
2715                 ip.pnn  = tmp_ip->pnn;
2716                 ip.addr = tmp_ip->addr;
2717
2718                 timeout = TAKEOVER_TIMEOUT();
2719                 data.dsize = sizeof(ip);
2720                 data.dptr  = (uint8_t *)&ip;
2721                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2722                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2723                                           data, async_data, &timeout, NULL);
2724                 if (state == NULL) {
2725                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2726                         talloc_free(tmp_ctx);
2727                         return -1;
2728                 }
2729
2730                 ctdb_client_async_add(async_data, state);
2731         }
2732         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2733                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2734                 talloc_free(tmp_ctx);
2735                 return -1;
2736         }
2737
2738 ipreallocated:
2739         /*
2740          * Tell all nodes to run eventscripts to process the
2741          * "ipreallocated" event.  This can do a lot of things,
2742          * including restarting services to reconfigure them if public
2743          * IPs have moved.  Once upon a time this event only used to
2744          * update natgw.
2745          */
2746         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2747         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2748         iprealloc_data.retry_nodes = retry_data;
2749         iprealloc_data.retry_count = 0;
2750         iprealloc_data.fail_callback = fail_callback;
2751         iprealloc_data.fail_callback_data = callback_data;
2752         iprealloc_data.nodemap = nodemap;
2753
2754         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2755         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2756                                         nodes, 0, TAKEOVER_TIMEOUT(),
2757                                         false, tdb_null,
2758                                         NULL, iprealloc_fail_callback,
2759                                         &iprealloc_data);
2760         if (ret != 0) {
2761                 /* If the control failed then we should retry to any
2762                  * nodes flagged by iprealloc_fail_callback using the
2763                  * EVENTSCRIPT control.  This is a best-effort at
2764                  * backward compatiblity when running a mixed cluster
2765                  * where some nodes have not yet been upgraded to
2766                  * support the IPREALLOCATED control.
2767                  */
2768                 DEBUG(DEBUG_WARNING,
2769                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2770
2771                 nodes = talloc_array(tmp_ctx, uint32_t,
2772                                      iprealloc_data.retry_count);
2773                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2774
2775                 j = 0;
2776                 for (i=0; i<nodemap->num; i++) {
2777                         if (iprealloc_data.retry_nodes[i]) {
2778                                 nodes[j] = i;
2779                                 j++;
2780                         }
2781                 }
2782
2783                 data.dptr  = discard_const("ipreallocated");
2784                 data.dsize = strlen((char *)data.dptr) + 1; 
2785                 ret = ctdb_client_async_control(ctdb,
2786                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2787                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2788                                                 false, data,
2789                                                 NULL, fail_callback,
2790                                                 callback_data);
2791                 if (ret != 0) {
2792                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2793                 }
2794         }
2795
2796         talloc_free(tmp_ctx);
2797         return ret;
2798 }
2799
2800
2801 /*
2802   destroy a ctdb_client_ip structure
2803  */
2804 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2805 {
2806         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2807                 ctdb_addr_to_str(&ip->addr),
2808                 ntohs(ip->addr.ip.sin_port),
2809                 ip->client_id));
2810
2811         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2812         return 0;
2813 }
2814
2815 /*
2816   called by a client to inform us of a TCP connection that it is managing
2817   that should tickled with an ACK when IP takeover is done
2818  */
2819 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2820                                 TDB_DATA indata)
2821 {
2822         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2823         struct ctdb_connection *tcp_sock = NULL;
2824         struct ctdb_tcp_list *tcp;
2825         struct ctdb_connection t;
2826         int ret;
2827         TDB_DATA data;
2828         struct ctdb_client_ip *ip;
2829         struct ctdb_vnn *vnn;
2830         ctdb_sock_addr addr;
2831
2832         /* If we don't have public IPs, tickles are useless */
2833         if (ctdb->vnn == NULL) {
2834                 return 0;
2835         }
2836
2837         tcp_sock = (struct ctdb_connection *)indata.dptr;
2838
2839         addr = tcp_sock->src;
2840         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2841         addr = tcp_sock->dst;
2842         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2843
2844         ZERO_STRUCT(addr);
2845         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2846         vnn = find_public_ip_vnn(ctdb, &addr);
2847         if (vnn == NULL) {
2848                 switch (addr.sa.sa_family) {
2849                 case AF_INET:
2850                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2851                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2852                                         ctdb_addr_to_str(&addr)));
2853                         }
2854                         break;
2855                 case AF_INET6:
2856                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2857                                 ctdb_addr_to_str(&addr)));
2858                         break;
2859                 default:
2860                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2861                 }
2862
2863                 return 0;
2864         }
2865
2866         if (vnn->pnn != ctdb->pnn) {
2867                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2868                         ctdb_addr_to_str(&addr),
2869                         client_id, client->pid));
2870                 /* failing this call will tell smbd to die */
2871                 return -1;
2872         }
2873
2874         ip = talloc(client, struct ctdb_client_ip);
2875         CTDB_NO_MEMORY(ctdb, ip);
2876
2877         ip->ctdb      = ctdb;
2878         ip->addr      = addr;
2879         ip->client_id = client_id;
2880         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2881         DLIST_ADD(ctdb->client_ip_list, ip);
2882
2883         tcp = talloc(client, struct ctdb_tcp_list);
2884         CTDB_NO_MEMORY(ctdb, tcp);
2885
2886         tcp->connection.src = tcp_sock->src;
2887         tcp->connection.dst = tcp_sock->dst;
2888
2889         DLIST_ADD(client->tcp_list, tcp);
2890
2891         t.src = tcp_sock->src;
2892         t.dst = tcp_sock->dst;
2893
2894         data.dptr = (uint8_t *)&t;
2895         data.dsize = sizeof(t);
2896
2897         switch (addr.sa.sa_family) {
2898         case AF_INET:
2899                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2900                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2901                         ctdb_addr_to_str(&tcp_sock->src),
2902                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2903                 break;
2904         case AF_INET6:
2905                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2906                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2907                         ctdb_addr_to_str(&tcp_sock->src),
2908                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2909                 break;
2910         default:
2911                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2912         }
2913
2914
2915         /* tell all nodes about this tcp connection */
2916         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2917                                        CTDB_CONTROL_TCP_ADD,
2918                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2919         if (ret != 0) {
2920                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2921                 return -1;
2922         }
2923
2924         return 0;
2925 }
2926
2927 /*
2928   find a tcp address on a list
2929  */
2930 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2931                                            struct ctdb_connection *tcp)
2932 {
2933         int i;
2934
2935         if (array == NULL) {
2936                 return NULL;
2937         }
2938
2939         for (i=0;i<array->num;i++) {
2940                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2941                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2942                         return &array->connections[i];
2943                 }
2944         }
2945         return NULL;
2946 }
2947
2948
2949
2950 /*
2951   called by a daemon to inform us of a TCP connection that one of its
2952   clients managing that should tickled with an ACK when IP takeover is
2953   done
2954  */
2955 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2956 {
2957         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2958         struct ctdb_tcp_array *tcparray;
2959         struct ctdb_connection tcp;
2960         struct ctdb_vnn *vnn;
2961
2962         /* If we don't have public IPs, tickles are useless */
2963         if (ctdb->vnn == NULL) {
2964                 return 0;
2965         }
2966
2967         vnn = find_public_ip_vnn(ctdb, &p->dst);
2968         if (vnn == NULL) {
2969                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2970                         ctdb_addr_to_str(&p->dst)));
2971
2972                 return -1;
2973         }
2974
2975
2976         tcparray = vnn->tcp_array;
2977
2978         /* If this is the first tickle */
2979         if (tcparray == NULL) {
2980                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2981                 CTDB_NO_MEMORY(ctdb, tcparray);
2982                 vnn->tcp_array = tcparray;
2983
2984                 tcparray->num = 0;
2985                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2986                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2987
2988                 tcparray->connections[tcparray->num].src = p->src;
2989                 tcparray->connections[tcparray->num].dst = p->dst;
2990                 tcparray->num++;
2991
2992                 if (tcp_update_needed) {
2993                         vnn->tcp_update_needed = true;
2994                 }
2995                 return 0;
2996         }
2997
2998
2999         /* Do we already have this tickle ?*/
3000         tcp.src = p->src;
3001         tcp.dst = p->dst;
3002         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3003                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3004                         ctdb_addr_to_str(&tcp.dst),
3005                         ntohs(tcp.dst.ip.sin_port),
3006                         vnn->pnn));
3007                 return 0;