c84e3cacee0b884b4e35bc900e313112080cdcd1
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/* Timeout value built by timeval_current_ofs() from the takeover_timeout
 * tunable.  The expansion reads a variable named "ctdb" from the scope
 * in which the macro is used. */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* Gratuitous ARP schedule used by ctdb_control_send_arp():
 * CTDB_ARP_INTERVAL is the first argument given to timeval_current_ofs()
 * when the next round is scheduled, CTDB_ARP_REPEAT is the number of
 * rounds after which the ARP state is freed. */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
49
/* Boolean flags consulted by the IP allocation algorithms. */
struct ctdb_ipflags {
        /* NOTE(review): presumably "this node must not take over IPs" -
         * confirm against the allocation code */
        bool noiptakeover;
        /* NOTE(review): presumably "this node must not host IPs" -
         * confirm against the allocation code */
        bool noiphost;
};
55
/* A local network interface that public addresses can be placed on.
 * Entries are kept on the doubly linked list ctdb->ifaces. */
struct ctdb_interface {
        /* list linkage (see DLIST_ADD/DLIST_REMOVE users) */
        struct ctdb_interface *prev;
        struct ctdb_interface *next;
        /* interface name, compared with strcmp() when searching */
        const char *name;
        /* interfaces with link_up == false are skipped when picking
         * an interface for an address */
        bool link_up;
        /* number of VNNs currently assigned to this interface */
        uint32_t references;
};
62
63 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
64 {
65         if (vnn->iface) {
66                 return vnn->iface->name;
67         }
68
69         return "__none__";
70 }
71
72 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
73 {
74         struct ctdb_interface *i;
75
76         /* Verify that we don't have an entry for this ip yet */
77         for (i=ctdb->ifaces;i;i=i->next) {
78                 if (strcmp(i->name, iface) == 0) {
79                         return 0;
80                 }
81         }
82
83         /* create a new structure for this interface */
84         i = talloc_zero(ctdb, struct ctdb_interface);
85         CTDB_NO_MEMORY_FATAL(ctdb, i);
86         i->name = talloc_strdup(i, iface);
87         CTDB_NO_MEMORY(ctdb, i->name);
88
89         i->link_up = true;
90
91         DLIST_ADD(ctdb->ifaces, i);
92
93         return 0;
94 }
95
96 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
97                                         const char *name)
98 {
99         int n;
100
101         for (n = 0; vnn->ifaces[n] != NULL; n++) {
102                 if (strcmp(name, vnn->ifaces[n]) == 0) {
103                         return true;
104                 }
105         }
106
107         return false;
108 }
109
110 /* If any interfaces now have no possible IPs then delete them.  This
111  * implementation is naive (i.e. simple) rather than clever
112  * (i.e. complex).  Given that this is run on delip and that operation
113  * is rare, this doesn't need to be efficient - it needs to be
114  * foolproof.  One alternative is reference counting, where the logic
115  * is distributed and can, therefore, be broken in multiple places.
116  * Another alternative is to build a red-black tree of interfaces that
117  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
118  * once) and then walking ctdb->ifaces once and deleting those not in
119  * the tree.  Let's go to one of those if the naive implementation
120  * causes problems...  :-)
121  */
122 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
123                                         struct ctdb_vnn *vnn)
124 {
125         struct ctdb_interface *i, *next;
126
127         /* For each interface, check if there's an IP using it. */
128         for (i = ctdb->ifaces; i != NULL; i = next) {
129                 struct ctdb_vnn *tv;
130                 bool found;
131                 next = i->next;
132
133                 /* Only consider interfaces named in the given VNN. */
134                 if (!vnn_has_interface_with_name(vnn, i->name)) {
135                         continue;
136                 }
137
138                 /* Is the "single IP" on this interface? */
139                 if ((ctdb->single_ip_vnn != NULL) &&
140                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
141                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
142                         /* Found, next interface please... */
143                         continue;
144                 }
145                 /* Search for a vnn with this interface. */
146                 found = false;
147                 for (tv=ctdb->vnn; tv; tv=tv->next) {
148                         if (vnn_has_interface_with_name(tv, i->name)) {
149                                 found = true;
150                                 break;
151                         }
152                 }
153
154                 if (!found) {
155                         /* None of the VNNs are using this interface. */
156                         DLIST_REMOVE(ctdb->ifaces, i);
157                         talloc_free(i);
158                 }
159         }
160 }
161
162
163 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
164                                               const char *iface)
165 {
166         struct ctdb_interface *i;
167
168         for (i=ctdb->ifaces;i;i=i->next) {
169                 if (strcmp(i->name, iface) == 0) {
170                         return i;
171                 }
172         }
173
174         return NULL;
175 }
176
177 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178                                                   struct ctdb_vnn *vnn)
179 {
180         int i;
181         struct ctdb_interface *cur = NULL;
182         struct ctdb_interface *best = NULL;
183
184         for (i=0; vnn->ifaces[i]; i++) {
185
186                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
187                 if (cur == NULL) {
188                         continue;
189                 }
190
191                 if (!cur->link_up) {
192                         continue;
193                 }
194
195                 if (best == NULL) {
196                         best = cur;
197                         continue;
198                 }
199
200                 if (cur->references < best->references) {
201                         best = cur;
202                         continue;
203                 }
204         }
205
206         return best;
207 }
208
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210                                      struct ctdb_vnn *vnn)
211 {
212         struct ctdb_interface *best = NULL;
213
214         if (vnn->iface) {
215                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216                                    "still assigned to iface '%s'\n",
217                                    ctdb_addr_to_str(&vnn->public_address),
218                                    ctdb_vnn_iface_string(vnn)));
219                 return 0;
220         }
221
222         best = ctdb_vnn_best_iface(ctdb, vnn);
223         if (best == NULL) {
224                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225                                   "cannot assign to iface any iface\n",
226                                   ctdb_addr_to_str(&vnn->public_address)));
227                 return -1;
228         }
229
230         vnn->iface = best;
231         best->references++;
232         vnn->pnn = ctdb->pnn;
233
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now assigned to iface '%s' refs[%d]\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            best->references));
239         return 0;
240 }
241
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243                                     struct ctdb_vnn *vnn)
244 {
245         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246                            "now unassigned (old iface '%s' refs[%d])\n",
247                            ctdb_addr_to_str(&vnn->public_address),
248                            ctdb_vnn_iface_string(vnn),
249                            vnn->iface?vnn->iface->references:0));
250         if (vnn->iface) {
251                 vnn->iface->references--;
252         }
253         vnn->iface = NULL;
254         if (vnn->pnn == ctdb->pnn) {
255                 vnn->pnn = -1;
256         }
257 }
258
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260                                struct ctdb_vnn *vnn)
261 {
262         int i;
263
264         /* Nodes that are not RUNNING can not host IPs */
265         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
266                 return false;
267         }
268
269         if (vnn->delete_pending) {
270                 return false;
271         }
272
273         if (vnn->iface && vnn->iface->link_up) {
274                 return true;
275         }
276
277         for (i=0; vnn->ifaces[i]; i++) {
278                 struct ctdb_interface *cur;
279
280                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
281                 if (cur == NULL) {
282                         continue;
283                 }
284
285                 if (cur->link_up) {
286                         return true;
287                 }
288         }
289
290         return false;
291 }
292
293 struct ctdb_takeover_arp {
294         struct ctdb_context *ctdb;
295         uint32_t count;
296         ctdb_sock_addr addr;
297         struct ctdb_tcp_array *tcparray;
298         struct ctdb_vnn *vnn;
299 };
300
301
302 /*
303   lists of tcp endpoints
304  */
305 struct ctdb_tcp_list {
306         struct ctdb_tcp_list *prev, *next;
307         struct ctdb_connection connection;
308 };
309
310 /*
311   list of clients to kill on IP release
312  */
313 struct ctdb_client_ip {
314         struct ctdb_client_ip *prev, *next;
315         struct ctdb_context *ctdb;
316         ctdb_sock_addr addr;
317         uint32_t client_id;
318 };
319
320
321 /*
322   send a gratuitous arp
323  */
324 static void ctdb_control_send_arp(struct tevent_context *ev,
325                                   struct tevent_timer *te,
326                                   struct timeval t, void *private_data)
327 {
328         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
329                                                         struct ctdb_takeover_arp);
330         int i, ret;
331         struct ctdb_tcp_array *tcparray;
332         const char *iface = ctdb_vnn_iface_string(arp->vnn);
333
334         ret = ctdb_sys_send_arp(&arp->addr, iface);
335         if (ret != 0) {
336                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
337                                   iface, strerror(errno)));
338         }
339
340         tcparray = arp->tcparray;
341         if (tcparray) {
342                 for (i=0;i<tcparray->num;i++) {
343                         struct ctdb_connection *tcon;
344
345                         tcon = &tcparray->connections[i];
346                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
347                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
348                                 ctdb_addr_to_str(&tcon->src),
349                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
350                         ret = ctdb_sys_send_tcp(
351                                 &tcon->src,
352                                 &tcon->dst,
353                                 0, 0, 0);
354                         if (ret != 0) {
355                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
356                                         ctdb_addr_to_str(&tcon->src)));
357                         }
358                 }
359         }
360
361         arp->count++;
362
363         if (arp->count == CTDB_ARP_REPEAT) {
364                 talloc_free(arp);
365                 return;
366         }
367
368         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
369                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
370                          ctdb_control_send_arp, arp);
371 }
372
373 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
374                                        struct ctdb_vnn *vnn)
375 {
376         struct ctdb_takeover_arp *arp;
377         struct ctdb_tcp_array *tcparray;
378
379         if (!vnn->takeover_ctx) {
380                 vnn->takeover_ctx = talloc_new(vnn);
381                 if (!vnn->takeover_ctx) {
382                         return -1;
383                 }
384         }
385
386         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
387         if (!arp) {
388                 return -1;
389         }
390
391         arp->ctdb = ctdb;
392         arp->addr = vnn->public_address;
393         arp->vnn  = vnn;
394
395         tcparray = vnn->tcp_array;
396         if (tcparray) {
397                 /* add all of the known tcp connections for this IP to the
398                    list of tcp connections to send tickle acks for */
399                 arp->tcparray = talloc_steal(arp, tcparray);
400
401                 vnn->tcp_array = NULL;
402                 vnn->tcp_update_needed = true;
403         }
404
405         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
406                          timeval_zero(), ctdb_control_send_arp, arp);
407
408         return 0;
409 }
410
411 struct takeover_callback_state {
412         struct ctdb_req_control_old *c;
413         ctdb_sock_addr *addr;
414         struct ctdb_vnn *vnn;
415 };
416
/* State of one in-flight "takeip" event, see ctdb_do_takeip(). */
struct ctdb_do_takeip_state {
        /* control request that is answered when the event finishes */
        struct ctdb_req_control_old *c;
        /* address being taken; update_in_flight is cleared on it when
         * this state is destroyed */
        struct ctdb_vnn *vnn;
};
421
422 /*
423   called when takeip event finishes
424  */
425 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
426                                     void *private_data)
427 {
428         struct ctdb_do_takeip_state *state =
429                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
430         int32_t ret;
431         TDB_DATA data;
432
433         if (status != 0) {
434                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
435         
436                 if (status == -ETIME) {
437                         ctdb_ban_self(ctdb);
438                 }
439                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
440                                  ctdb_addr_to_str(&state->vnn->public_address),
441                                  ctdb_vnn_iface_string(state->vnn)));
442                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
443
444                 node->flags |= NODE_FLAGS_UNHEALTHY;
445                 talloc_free(state);
446                 return;
447         }
448
449         if (ctdb->do_checkpublicip) {
450
451         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
452         if (ret != 0) {
453                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
454                 talloc_free(state);
455                 return;
456         }
457
458         }
459
460         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
461         data.dsize = strlen((char *)data.dptr) + 1;
462         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
463
464         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
465
466
467         /* the control succeeded */
468         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
469         talloc_free(state);
470         return;
471 }
472
473 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
474 {
475         state->vnn->update_in_flight = false;
476         return 0;
477 }
478
479 /*
480   take over an ip address
481  */
482 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
483                               struct ctdb_req_control_old *c,
484                               struct ctdb_vnn *vnn)
485 {
486         int ret;
487         struct ctdb_do_takeip_state *state;
488
489         if (vnn->update_in_flight) {
490                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
491                                     "update for this IP already in flight\n",
492                                     ctdb_addr_to_str(&vnn->public_address),
493                                     vnn->public_netmask_bits));
494                 return -1;
495         }
496
497         ret = ctdb_vnn_assign_iface(ctdb, vnn);
498         if (ret != 0) {
499                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
500                                  "assign a usable interface\n",
501                                  ctdb_addr_to_str(&vnn->public_address),
502                                  vnn->public_netmask_bits));
503                 return -1;
504         }
505
506         state = talloc(vnn, struct ctdb_do_takeip_state);
507         CTDB_NO_MEMORY(ctdb, state);
508
509         state->c = talloc_steal(ctdb, c);
510         state->vnn   = vnn;
511
512         vnn->update_in_flight = true;
513         talloc_set_destructor(state, ctdb_takeip_destructor);
514
515         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
516                             ctdb_addr_to_str(&vnn->public_address),
517                             vnn->public_netmask_bits,
518                             ctdb_vnn_iface_string(vnn)));
519
520         ret = ctdb_event_script_callback(ctdb,
521                                          state,
522                                          ctdb_do_takeip_callback,
523                                          state,
524                                          CTDB_EVENT_TAKE_IP,
525                                          "%s %s %u",
526                                          ctdb_vnn_iface_string(vnn),
527                                          ctdb_addr_to_str(&vnn->public_address),
528                                          vnn->public_netmask_bits);
529
530         if (ret != 0) {
531                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
532                         ctdb_addr_to_str(&vnn->public_address),
533                         ctdb_vnn_iface_string(vnn)));
534                 talloc_free(state);
535                 return -1;
536         }
537
538         return 0;
539 }
540
/* State of one in-flight "updateip" event, see ctdb_do_updateip(). */
struct ctdb_do_updateip_state {
        /* control request that is answered when the event finishes */
        struct ctdb_req_control_old *c;
        /* interface the address was on before the move; restored if
         * the event fails */
        struct ctdb_interface *old;
        /* address being moved */
        struct ctdb_vnn *vnn;
};
546
547 /*
548   called when updateip event finishes
549  */
550 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
551                                       void *private_data)
552 {
553         struct ctdb_do_updateip_state *state =
554                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
555         int32_t ret;
556
557         if (status != 0) {
558                 if (status == -ETIME) {
559                         ctdb_ban_self(ctdb);
560                 }
561                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
562                         ctdb_addr_to_str(&state->vnn->public_address),
563                         state->old->name,
564                         ctdb_vnn_iface_string(state->vnn)));
565
566                 /*
567                  * All we can do is reset the old interface
568                  * and let the next run fix it
569                  */
570                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
571                 state->vnn->iface = state->old;
572                 state->vnn->iface->references++;
573
574                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
575                 talloc_free(state);
576                 return;
577         }
578
579         if (ctdb->do_checkpublicip) {
580
581         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
582         if (ret != 0) {
583                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
584                 talloc_free(state);
585                 return;
586         }
587
588         }
589
590         /* the control succeeded */
591         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
592         talloc_free(state);
593         return;
594 }
595
596 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
597 {
598         state->vnn->update_in_flight = false;
599         return 0;
600 }
601
602 /*
603   update (move) an ip address
604  */
605 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
606                                 struct ctdb_req_control_old *c,
607                                 struct ctdb_vnn *vnn)
608 {
609         int ret;
610         struct ctdb_do_updateip_state *state;
611         struct ctdb_interface *old = vnn->iface;
612         const char *new_name;
613
614         if (vnn->update_in_flight) {
615                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
616                                     "update for this IP already in flight\n",
617                                     ctdb_addr_to_str(&vnn->public_address),
618                                     vnn->public_netmask_bits));
619                 return -1;
620         }
621
622         ctdb_vnn_unassign_iface(ctdb, vnn);
623         ret = ctdb_vnn_assign_iface(ctdb, vnn);
624         if (ret != 0) {
625                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
626                                  "assin a usable interface (old iface '%s')\n",
627                                  ctdb_addr_to_str(&vnn->public_address),
628                                  vnn->public_netmask_bits,
629                                  old->name));
630                 return -1;
631         }
632
633         new_name = ctdb_vnn_iface_string(vnn);
634         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
635                 /* A benign update from one interface onto itself.
636                  * no need to run the eventscripts in this case, just return
637                  * success.
638                  */
639                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
640                 return 0;
641         }
642
643         state = talloc(vnn, struct ctdb_do_updateip_state);
644         CTDB_NO_MEMORY(ctdb, state);
645
646         state->c = talloc_steal(ctdb, c);
647         state->old = old;
648         state->vnn = vnn;
649
650         vnn->update_in_flight = true;
651         talloc_set_destructor(state, ctdb_updateip_destructor);
652
653         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
654                             "interface %s to %s\n",
655                             ctdb_addr_to_str(&vnn->public_address),
656                             vnn->public_netmask_bits,
657                             old->name,
658                             new_name));
659
660         ret = ctdb_event_script_callback(ctdb,
661                                          state,
662                                          ctdb_do_updateip_callback,
663                                          state,
664                                          CTDB_EVENT_UPDATE_IP,
665                                          "%s %s %s %u",
666                                          state->old->name,
667                                          new_name,
668                                          ctdb_addr_to_str(&vnn->public_address),
669                                          vnn->public_netmask_bits);
670         if (ret != 0) {
671                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
672                                  ctdb_addr_to_str(&vnn->public_address),
673                                  old->name, new_name));
674                 talloc_free(state);
675                 return -1;
676         }
677
678         return 0;
679 }
680
681 /*
682   Find the vnn of the node that has a public ip address
683   returns -1 if the address is not known as a public address
684  */
685 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
686 {
687         struct ctdb_vnn *vnn;
688
689         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
690                 if (ctdb_same_ip(&vnn->public_address, addr)) {
691                         return vnn;
692                 }
693         }
694
695         return NULL;
696 }
697
698 /*
699   take over an ip address
700  */
701 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
702                                  struct ctdb_req_control_old *c,
703                                  TDB_DATA indata,
704                                  bool *async_reply)
705 {
706         int ret;
707         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
708         struct ctdb_vnn *vnn;
709         bool have_ip = false;
710         bool do_updateip = false;
711         bool do_takeip = false;
712         struct ctdb_interface *best_iface = NULL;
713
714         if (pip->pnn != ctdb->pnn) {
715                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
716                                  "with pnn %d, but we're node %d\n",
717                                  ctdb_addr_to_str(&pip->addr),
718                                  pip->pnn, ctdb->pnn));
719                 return -1;
720         }
721
722         /* update out vnn list */
723         vnn = find_public_ip_vnn(ctdb, &pip->addr);
724         if (vnn == NULL) {
725                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
726                         ctdb_addr_to_str(&pip->addr)));
727                 return 0;
728         }
729
730         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
731                 have_ip = ctdb_sys_have_ip(&pip->addr);
732         }
733         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
734         if (best_iface == NULL) {
735                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
736                                  "a usable interface (old %s, have_ip %d)\n",
737                                  ctdb_addr_to_str(&vnn->public_address),
738                                  vnn->public_netmask_bits,
739                                  ctdb_vnn_iface_string(vnn),
740                                  have_ip));
741                 return -1;
742         }
743
744         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
745                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
746                 have_ip = false;
747         }
748
749
750         if (vnn->iface == NULL && have_ip) {
751                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
753                                  ctdb_addr_to_str(&vnn->public_address)));
754                 return 0;
755         }
756
757         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
758                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
759                                   "and we have it on iface[%s], but it was assigned to node %d"
760                                   "and we are node %d, banning ourself\n",
761                                  ctdb_addr_to_str(&vnn->public_address),
762                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
763                 ctdb_ban_self(ctdb);
764                 return -1;
765         }
766
767         if (vnn->pnn == -1 && have_ip) {
768                 vnn->pnn = ctdb->pnn;
769                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
770                                   "and we already have it on iface[%s], update local daemon\n",
771                                  ctdb_addr_to_str(&vnn->public_address),
772                                   ctdb_vnn_iface_string(vnn)));
773                 return 0;
774         }
775
776         if (vnn->iface) {
777                 if (vnn->iface != best_iface) {
778                         if (!vnn->iface->link_up) {
779                                 do_updateip = true;
780                         } else if (vnn->iface->references > (best_iface->references + 1)) {
781                                 /* only move when the rebalance gains something */
782                                         do_updateip = true;
783                         }
784                 }
785         }
786
787         if (!have_ip) {
788                 if (do_updateip) {
789                         ctdb_vnn_unassign_iface(ctdb, vnn);
790                         do_updateip = false;
791                 }
792                 do_takeip = true;
793         }
794
795         if (do_takeip) {
796                 ret = ctdb_do_takeip(ctdb, c, vnn);
797                 if (ret != 0) {
798                         return -1;
799                 }
800         } else if (do_updateip) {
801                 ret = ctdb_do_updateip(ctdb, c, vnn);
802                 if (ret != 0) {
803                         return -1;
804                 }
805         } else {
806                 /*
807                  * The interface is up and the kernel known the ip
808                  * => do nothing
809                  */
810                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
811                         ctdb_addr_to_str(&pip->addr),
812                         vnn->public_netmask_bits,
813                         ctdb_vnn_iface_string(vnn)));
814                 return 0;
815         }
816
817         /* tell ctdb_control.c that we will be replying asynchronously */
818         *async_reply = true;
819
820         return 0;
821 }
822
823 /*
824   kill any clients that are registered with a IP that is being released
825  */
826 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
827 {
828         struct ctdb_client_ip *ip;
829
830         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
831                 ctdb_addr_to_str(addr)));
832
833         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
834                 ctdb_sock_addr tmp_addr;
835
836                 tmp_addr = ip->addr;
837                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
838                         ip->client_id,
839                         ctdb_addr_to_str(&ip->addr)));
840
841                 if (ctdb_same_ip(&tmp_addr, addr)) {
842                         struct ctdb_client *client = reqid_find(ctdb->idr,
843                                                                 ip->client_id,
844                                                                 struct ctdb_client);
845                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
846                                 ip->client_id,
847                                 ctdb_addr_to_str(&ip->addr),
848                                 client->pid));
849
850                         if (client->pid != 0) {
851                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
852                                         (unsigned)client->pid,
853                                         ctdb_addr_to_str(addr),
854                                         ip->client_id));
855                                 kill(client->pid, SIGKILL);
856                         }
857                 }
858         }
859 }
860
861 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
862 {
863         DLIST_REMOVE(ctdb->vnn, vnn);
864         ctdb_vnn_unassign_iface(ctdb, vnn);
865         ctdb_remove_orphaned_ifaces(ctdb, vnn);
866         talloc_free(vnn);
867 }
868
869 /*
870   called when releaseip event finishes
871  */
872 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
873                                 void *private_data)
874 {
875         struct takeover_callback_state *state = 
876                 talloc_get_type(private_data, struct takeover_callback_state);
877         TDB_DATA data;
878
879         if (status == -ETIME) {
880                 ctdb_ban_self(ctdb);
881         }
882
883         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
884                 if  (ctdb_sys_have_ip(state->addr)) {
885                         DEBUG(DEBUG_ERR,
886                               ("IP %s still hosted during release IP callback, failing\n",
887                                ctdb_addr_to_str(state->addr)));
888                         ctdb_request_control_reply(ctdb, state->c,
889                                                    NULL, -1, NULL);
890                         talloc_free(state);
891                         return;
892                 }
893         }
894
895         /* send a message to all clients of this node telling them
896            that the cluster has been reconfigured and they should
897            release any sockets on this IP */
898         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
899         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
900         data.dsize = strlen((char *)data.dptr)+1;
901
902         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
903
904         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
905
906         /* kill clients that have registered with this IP */
907         release_kill_clients(ctdb, state->addr);
908
909         ctdb_vnn_unassign_iface(ctdb, state->vnn);
910
911         /* Process the IP if it has been marked for deletion */
912         if (state->vnn->delete_pending) {
913                 do_delete_ip(ctdb, state->vnn);
914                 state->vnn = NULL;
915         }
916
917         /* the control succeeded */
918         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
919         talloc_free(state);
920 }
921
922 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
923 {
924         if (state->vnn != NULL) {
925                 state->vnn->update_in_flight = false;
926         }
927         return 0;
928 }
929
930 /*
931   release an ip address
932  */
933 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
934                                 struct ctdb_req_control_old *c,
935                                 TDB_DATA indata, 
936                                 bool *async_reply)
937 {
938         int ret;
939         struct takeover_callback_state *state;
940         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
941         struct ctdb_vnn *vnn;
942         char *iface;
943
944         /* update our vnn list */
945         vnn = find_public_ip_vnn(ctdb, &pip->addr);
946         if (vnn == NULL) {
947                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
948                         ctdb_addr_to_str(&pip->addr)));
949                 return 0;
950         }
951         vnn->pnn = pip->pnn;
952
953         /* stop any previous arps */
954         talloc_free(vnn->takeover_ctx);
955         vnn->takeover_ctx = NULL;
956
957         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
958          * lazy multicast to drop an IP from any node that isn't the
959          * intended new node.  The following causes makes ctdbd ignore
960          * a release for any address it doesn't host.
961          */
962         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
963                 if (!ctdb_sys_have_ip(&pip->addr)) {
964                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
965                                 ctdb_addr_to_str(&pip->addr),
966                                 vnn->public_netmask_bits,
967                                 ctdb_vnn_iface_string(vnn)));
968                         ctdb_vnn_unassign_iface(ctdb, vnn);
969                         return 0;
970                 }
971         } else {
972                 if (vnn->iface == NULL) {
973                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
974                                            ctdb_addr_to_str(&pip->addr),
975                                            vnn->public_netmask_bits));
976                         return 0;
977                 }
978         }
979
980         /* There is a potential race between take_ip and us because we
981          * update the VNN via a callback that run when the
982          * eventscripts have been run.  Avoid the race by allowing one
983          * update to be in flight at a time.
984          */
985         if (vnn->update_in_flight) {
986                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
987                                     "update for this IP already in flight\n",
988                                     ctdb_addr_to_str(&vnn->public_address),
989                                     vnn->public_netmask_bits));
990                 return -1;
991         }
992
993         iface = strdup(ctdb_vnn_iface_string(vnn));
994
995         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
996                 ctdb_addr_to_str(&pip->addr),
997                 vnn->public_netmask_bits,
998                 iface,
999                 pip->pnn));
1000
1001         state = talloc(ctdb, struct takeover_callback_state);
1002         if (state == NULL) {
1003                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1004                                __FILE__, __LINE__);
1005                 free(iface);
1006                 return -1;
1007         }
1008
1009         state->c = talloc_steal(state, c);
1010         state->addr = talloc(state, ctdb_sock_addr);       
1011         if (state->addr == NULL) {
1012                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1013                                __FILE__, __LINE__);
1014                 free(iface);
1015                 talloc_free(state);
1016                 return -1;
1017         }
1018         *state->addr = pip->addr;
1019         state->vnn   = vnn;
1020
1021         vnn->update_in_flight = true;
1022         talloc_set_destructor(state, ctdb_releaseip_destructor);
1023
1024         ret = ctdb_event_script_callback(ctdb, 
1025                                          state, release_ip_callback, state,
1026                                          CTDB_EVENT_RELEASE_IP,
1027                                          "%s %s %u",
1028                                          iface,
1029                                          ctdb_addr_to_str(&pip->addr),
1030                                          vnn->public_netmask_bits);
1031         free(iface);
1032         if (ret != 0) {
1033                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1034                         ctdb_addr_to_str(&pip->addr),
1035                         ctdb_vnn_iface_string(vnn)));
1036                 talloc_free(state);
1037                 return -1;
1038         }
1039
1040         /* tell the control that we will be reply asynchronously */
1041         *async_reply = true;
1042         return 0;
1043 }
1044
1045 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1046                                    ctdb_sock_addr *addr,
1047                                    unsigned mask, const char *ifaces,
1048                                    bool check_address)
1049 {
1050         struct ctdb_vnn      *vnn;
1051         uint32_t num = 0;
1052         char *tmp;
1053         const char *iface;
1054         int i;
1055         int ret;
1056
1057         tmp = strdup(ifaces);
1058         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1059                 if (!ctdb_sys_check_iface_exists(iface)) {
1060                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1061                         free(tmp);
1062                         return -1;
1063                 }
1064         }
1065         free(tmp);
1066
1067         /* Verify that we don't have an entry for this ip yet */
1068         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1069                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1070                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1071                                 ctdb_addr_to_str(addr)));
1072                         return -1;
1073                 }               
1074         }
1075
1076         /* create a new vnn structure for this ip address */
1077         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1078         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1079         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1080         tmp = talloc_strdup(vnn, ifaces);
1081         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1082         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1083                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1084                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1085                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1086                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1087                 num++;
1088         }
1089         talloc_free(tmp);
1090         vnn->ifaces[num] = NULL;
1091         vnn->public_address      = *addr;
1092         vnn->public_netmask_bits = mask;
1093         vnn->pnn                 = -1;
1094         if (check_address) {
1095                 if (ctdb_sys_have_ip(addr)) {
1096                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1097                         vnn->pnn = ctdb->pnn;
1098                 }
1099         }
1100
1101         for (i=0; vnn->ifaces[i]; i++) {
1102                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1103                 if (ret != 0) {
1104                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1105                                            "for public_address[%s]\n",
1106                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1107                         talloc_free(vnn);
1108                         return -1;
1109                 }
1110         }
1111
1112         DLIST_ADD(ctdb->vnn, vnn);
1113
1114         return 0;
1115 }
1116
1117 /*
1118   setup the public address lists from a file
1119 */
1120 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1121 {
1122         char **lines;
1123         int nlines;
1124         int i;
1125
1126         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1127         if (lines == NULL) {
1128                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1129                 return -1;
1130         }
1131         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1132                 nlines--;
1133         }
1134
1135         for (i=0;i<nlines;i++) {
1136                 unsigned mask;
1137                 ctdb_sock_addr addr;
1138                 const char *addrstr;
1139                 const char *ifaces;
1140                 char *tok, *line;
1141
1142                 line = lines[i];
1143                 while ((*line == ' ') || (*line == '\t')) {
1144                         line++;
1145                 }
1146                 if (*line == '#') {
1147                         continue;
1148                 }
1149                 if (strcmp(line, "") == 0) {
1150                         continue;
1151                 }
1152                 tok = strtok(line, " \t");
1153                 addrstr = tok;
1154                 tok = strtok(NULL, " \t");
1155                 if (tok == NULL) {
1156                         if (NULL == ctdb->default_public_interface) {
1157                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1158                                          i+1));
1159                                 talloc_free(lines);
1160                                 return -1;
1161                         }
1162                         ifaces = ctdb->default_public_interface;
1163                 } else {
1164                         ifaces = tok;
1165                 }
1166
1167                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1168                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1169                         talloc_free(lines);
1170                         return -1;
1171                 }
1172                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1173                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1174                         talloc_free(lines);
1175                         return -1;
1176                 }
1177         }
1178
1179
1180         talloc_free(lines);
1181         return 0;
1182 }
1183
1184 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1185                               const char *iface,
1186                               const char *ip)
1187 {
1188         struct ctdb_vnn *svnn;
1189         struct ctdb_interface *cur = NULL;
1190         bool ok;
1191         int ret;
1192
1193         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1194         CTDB_NO_MEMORY(ctdb, svnn);
1195
1196         svnn->ifaces = talloc_array(svnn, const char *, 2);
1197         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1198         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1199         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1200         svnn->ifaces[1] = NULL;
1201
1202         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1203         if (!ok) {
1204                 talloc_free(svnn);
1205                 return -1;
1206         }
1207
1208         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1209         if (ret != 0) {
1210                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1211                                    "for single_ip[%s]\n",
1212                                    svnn->ifaces[0],
1213                                    ctdb_addr_to_str(&svnn->public_address)));
1214                 talloc_free(svnn);
1215                 return -1;
1216         }
1217
1218         /* assume the single public ip interface is initially "good" */
1219         cur = ctdb_find_iface(ctdb, iface);
1220         if (cur == NULL) {
1221                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1222                 return -1;
1223         }
1224         cur->link_up = true;
1225
1226         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1227         if (ret != 0) {
1228                 talloc_free(svnn);
1229                 return -1;
1230         }
1231
1232         ctdb->single_ip_vnn = svnn;
1233         return 0;
1234 }
1235
1236 struct public_ip_list {
1237         struct public_ip_list *next;
1238         uint32_t pnn;
1239         ctdb_sock_addr addr;
1240 };
1241
1242 /* Given a physical node, return the number of
1243    public addresses that is currently assigned to this node.
1244 */
1245 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1246 {
1247         int num=0;
1248
1249         for (;ips;ips=ips->next) {
1250                 if (ips->pnn == pnn) {
1251                         num++;
1252                 }
1253         }
1254         return num;
1255 }
1256
1257
1258 /* Can the given node host the given IP: is the public IP known to the
1259  * node and is NOIPHOST unset?
1260 */
1261 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1262                              struct ctdb_ipflags ipflags,
1263                              struct public_ip_list *ip)
1264 {
1265         struct ctdb_public_ip_list_old *public_ips;
1266         int i;
1267
1268         if (ipflags.noiphost) {
1269                 return false;
1270         }
1271
1272         public_ips = ctdb->nodes[pnn]->available_public_ips;
1273
1274         if (public_ips == NULL) {
1275                 return false;
1276         }
1277
1278         for (i=0; i<public_ips->num; i++) {
1279                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1280                         /* yes, this node can serve this public ip */
1281                         return true;
1282                 }
1283         }
1284
1285         return false;
1286 }
1287
1288 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1289                                  struct ctdb_ipflags ipflags,
1290                                  struct public_ip_list *ip)
1291 {
1292         if (ipflags.noiptakeover) {
1293                 return false;
1294         }
1295
1296         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1297 }
1298
1299 /* search the node lists list for a node to takeover this ip.
1300    pick the node that currently are serving the least number of ips
1301    so that the ips get spread out evenly.
1302 */
1303 static int find_takeover_node(struct ctdb_context *ctdb,
1304                               struct ctdb_ipflags *ipflags,
1305                               struct public_ip_list *ip,
1306                               struct public_ip_list *all_ips)
1307 {
1308         int pnn, min=0, num;
1309         int i, numnodes;
1310
1311         numnodes = talloc_array_length(ipflags);
1312         pnn    = -1;
1313         for (i=0; i<numnodes; i++) {
1314                 /* verify that this node can serve this ip */
1315                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1316                         /* no it couldnt   so skip to the next node */
1317                         continue;
1318                 }
1319
1320                 num = node_ip_coverage(i, all_ips);
1321                 /* was this the first node we checked ? */
1322                 if (pnn == -1) {
1323                         pnn = i;
1324                         min  = num;
1325                 } else {
1326                         if (num < min) {
1327                                 pnn = i;
1328                                 min  = num;
1329                         }
1330                 }
1331         }       
1332         if (pnn == -1) {
1333                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1334                         ctdb_addr_to_str(&ip->addr)));
1335
1336                 return -1;
1337         }
1338
1339         ip->pnn = pnn;
1340         return 0;
1341 }
1342
1343 #define IP_KEYLEN       4
1344 static uint32_t *ip_key(ctdb_sock_addr *ip)
1345 {
1346         static uint32_t key[IP_KEYLEN];
1347
1348         bzero(key, sizeof(key));
1349
1350         switch (ip->sa.sa_family) {
1351         case AF_INET:
1352                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1353                 break;
1354         case AF_INET6: {
1355                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1356                 key[0]  = htonl(s6_a32[0]);
1357                 key[1]  = htonl(s6_a32[1]);
1358                 key[2]  = htonl(s6_a32[2]);
1359                 key[3]  = htonl(s6_a32[3]);
1360                 break;
1361         }
1362         default:
1363                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1364                 return key;
1365         }
1366
1367         return key;
1368 }
1369
1370 static void *add_ip_callback(void *parm, void *data)
1371 {
1372         struct public_ip_list *this_ip = parm;
1373         struct public_ip_list *prev_ip = data;
1374
1375         if (prev_ip == NULL) {
1376                 return parm;
1377         }
1378         if (this_ip->pnn == -1) {
1379                 this_ip->pnn = prev_ip->pnn;
1380         }
1381
1382         return parm;
1383 }
1384
1385 static int getips_count_callback(void *param, void *data)
1386 {
1387         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1388         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1389
1390         new_ip->next = *ip_list;
1391         *ip_list     = new_ip;
1392         return 0;
1393 }
1394
1395 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1396                                        struct ctdb_public_ip_list_old *ips,
1397                                        uint32_t pnn);
1398
1399 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1400                                          struct ctdb_node_map_old *nodemap)
1401 {
1402         int j;
1403         int ret;
1404
1405         if (ctdb->num_nodes != nodemap->num) {
1406                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1407                                   ctdb->num_nodes, nodemap->num));
1408                 return -1;
1409         }
1410
1411         for (j=0; j<nodemap->num; j++) {
1412                 /* For readability */
1413                 struct ctdb_node *node = ctdb->nodes[j];
1414
1415                 /* release any existing data */
1416                 TALLOC_FREE(node->known_public_ips);
1417                 TALLOC_FREE(node->available_public_ips);
1418
1419                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1420                         continue;
1421                 }
1422
1423                 /* Retrieve the list of known public IPs from the node */
1424                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1425                                         TAKEOVER_TIMEOUT(),
1426                                         node->pnn,
1427                                         ctdb->nodes,
1428                                         0,
1429                                         &node->known_public_ips);
1430                 if (ret != 0) {
1431                         DEBUG(DEBUG_ERR,
1432                               ("Failed to read known public IPs from node: %u\n",
1433                                node->pnn));
1434                         return -1;
1435                 }
1436
1437                 if (ctdb->do_checkpublicip) {
1438                         verify_remote_ip_allocation(ctdb,
1439                                                     node->known_public_ips,
1440                                                     node->pnn);
1441                 }
1442
1443                 /* Retrieve the list of available public IPs from the node */
1444                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1445                                         TAKEOVER_TIMEOUT(),
1446                                         node->pnn,
1447                                         ctdb->nodes,
1448                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1449                                         &node->available_public_ips);
1450                 if (ret != 0) {
1451                         DEBUG(DEBUG_ERR,
1452                               ("Failed to read available public IPs from node: %u\n",
1453                                node->pnn));
1454                         return -1;
1455                 }
1456         }
1457
1458         return 0;
1459 }
1460
1461 static struct public_ip_list *
1462 create_merged_ip_list(struct ctdb_context *ctdb)
1463 {
1464         int i, j;
1465         struct public_ip_list *ip_list;
1466         struct ctdb_public_ip_list_old *public_ips;
1467
1468         if (ctdb->ip_tree != NULL) {
1469                 talloc_free(ctdb->ip_tree);
1470                 ctdb->ip_tree = NULL;
1471         }
1472         ctdb->ip_tree = trbt_create(ctdb, 0);
1473
1474         for (i=0;i<ctdb->num_nodes;i++) {
1475                 public_ips = ctdb->nodes[i]->known_public_ips;
1476
1477                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1478                         continue;
1479                 }
1480
1481                 /* there were no public ips for this node */
1482                 if (public_ips == NULL) {
1483                         continue;
1484                 }               
1485
1486                 for (j=0;j<public_ips->num;j++) {
1487                         struct public_ip_list *tmp_ip;
1488
1489                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1490                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1491                         /* Do not use information about IP addresses hosted
1492                          * on other nodes, it may not be accurate */
1493                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1494                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1495                         } else {
1496                                 tmp_ip->pnn = -1;
1497                         }
1498                         tmp_ip->addr = public_ips->ips[j].addr;
1499                         tmp_ip->next = NULL;
1500
1501                         trbt_insertarray32_callback(ctdb->ip_tree,
1502                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1503                                 add_ip_callback,
1504                                 tmp_ip);
1505                 }
1506         }
1507
1508         ip_list = NULL;
1509         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1510
1511         return ip_list;
1512 }
1513
1514 /* 
1515  * This is the length of the longtest common prefix between the IPs.
1516  * It is calculated by XOR-ing the 2 IPs together and counting the
1517  * number of leading zeroes.  The implementation means that all
1518  * addresses end up being 128 bits long.
1519  *
1520  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1521  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1522  * lots of nodes and IP addresses?
1523  */
1524 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1525 {
1526         uint32_t ip1_k[IP_KEYLEN];
1527         uint32_t *t;
1528         int i;
1529         uint32_t x;
1530
1531         uint32_t distance = 0;
1532
1533         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1534         t = ip_key(ip2);
1535         for (i=0; i<IP_KEYLEN; i++) {
1536                 x = ip1_k[i] ^ t[i];
1537                 if (x == 0) {
1538                         distance += 32;
1539                 } else {
1540                         /* Count number of leading zeroes. 
1541                          * FIXME? This could be optimised...
1542                          */
1543                         while ((x & (1 << 31)) == 0) {
1544                                 x <<= 1;
1545                                 distance += 1;
1546                         }
1547                 }
1548         }
1549
1550         return distance;
1551 }
1552
1553 /* Calculate the IP distance for the given IP relative to IPs on the
1554    given node.  The ips argument is generally the all_ips variable
1555    used in the main part of the algorithm.
1556  */
1557 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1558                                   struct public_ip_list *ips,
1559                                   int pnn)
1560 {
1561         struct public_ip_list *t;
1562         uint32_t d;
1563
1564         uint32_t sum = 0;
1565
1566         for (t=ips; t != NULL; t=t->next) {
1567                 if (t->pnn != pnn) {
1568                         continue;
1569                 }
1570
1571                 /* Optimisation: We never calculate the distance
1572                  * between an address and itself.  This allows us to
1573                  * calculate the effect of removing an address from a
1574                  * node by simply calculating the distance between
1575                  * that address and all of the exitsing addresses.
1576                  * Moreover, we assume that we're only ever dealing
1577                  * with addresses from all_ips so we can identify an
1578                  * address via a pointer rather than doing a more
1579                  * expensive address comparison. */
1580                 if (&(t->addr) == ip) {
1581                         continue;
1582                 }
1583
1584                 d = ip_distance(ip, &(t->addr));
1585                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1586         }
1587
1588         return sum;
1589 }
1590
1591 /* Return the LCP2 imbalance metric for addresses currently assigned
1592    to the given node.
1593  */
1594 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1595 {
1596         struct public_ip_list *t;
1597
1598         uint32_t imbalance = 0;
1599
1600         for (t=all_ips; t!=NULL; t=t->next) {
1601                 if (t->pnn != pnn) {
1602                         continue;
1603                 }
1604                 /* Pass the rest of the IPs rather than the whole
1605                    all_ips input list.
1606                 */
1607                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1608         }
1609
1610         return imbalance;
1611 }
1612
1613 /* Allocate any unassigned IPs just by looping through the IPs and
1614  * finding the best node for each.
1615  */
1616 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1617                                       struct ctdb_ipflags *ipflags,
1618                                       struct public_ip_list *all_ips)
1619 {
1620         struct public_ip_list *tmp_ip;
1621
1622         /* loop over all ip's and find a physical node to cover for 
1623            each unassigned ip.
1624         */
1625         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1626                 if (tmp_ip->pnn == -1) {
1627                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1628                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1629                                         ctdb_addr_to_str(&tmp_ip->addr)));
1630                         }
1631                 }
1632         }
1633 }
1634
1635 /* Basic non-deterministic rebalancing algorithm.
1636  */
1637 static void basic_failback(struct ctdb_context *ctdb,
1638                            struct ctdb_ipflags *ipflags,
1639                            struct public_ip_list *all_ips,
1640                            int num_ips)
1641 {
1642         int i, numnodes;
1643         int maxnode, maxnum, minnode, minnum, num, retries;
1644         struct public_ip_list *tmp_ip;
1645
1646         numnodes = talloc_array_length(ipflags);
1647         retries = 0;
1648
1649 try_again:
1650         maxnum=0;
1651         minnum=0;
1652
1653         /* for each ip address, loop over all nodes that can serve
1654            this ip and make sure that the difference between the node
1655            serving the most and the node serving the least ip's are
1656            not greater than 1.
1657         */
1658         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1659                 if (tmp_ip->pnn == -1) {
1660                         continue;
1661                 }
1662
1663                 /* Get the highest and lowest number of ips's served by any 
1664                    valid node which can serve this ip.
1665                 */
1666                 maxnode = -1;
1667                 minnode = -1;
1668                 for (i=0; i<numnodes; i++) {
1669                         /* only check nodes that can actually serve this ip */
1670                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1671                                 /* no it couldnt   so skip to the next node */
1672                                 continue;
1673                         }
1674
1675                         num = node_ip_coverage(i, all_ips);
1676                         if (maxnode == -1) {
1677                                 maxnode = i;
1678                                 maxnum  = num;
1679                         } else {
1680                                 if (num > maxnum) {
1681                                         maxnode = i;
1682                                         maxnum  = num;
1683                                 }
1684                         }
1685                         if (minnode == -1) {
1686                                 minnode = i;
1687                                 minnum  = num;
1688                         } else {
1689                                 if (num < minnum) {
1690                                         minnode = i;
1691                                         minnum  = num;
1692                                 }
1693                         }
1694                 }
1695                 if (maxnode == -1) {
1696                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1697                                 ctdb_addr_to_str(&tmp_ip->addr)));
1698
1699                         continue;
1700                 }
1701
1702                 /* if the spread between the smallest and largest coverage by
1703                    a node is >=2 we steal one of the ips from the node with
1704                    most coverage to even things out a bit.
1705                    try to do this a limited number of times since we dont
1706                    want to spend too much time balancing the ip coverage.
1707                 */
1708                 if ( (maxnum > minnum+1)
1709                      && (retries < (num_ips + 5)) ){
1710                         struct public_ip_list *tmp;
1711
1712                         /* Reassign one of maxnode's VNNs */
1713                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1714                                 if (tmp->pnn == maxnode) {
1715                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1716                                         retries++;
1717                                         goto try_again;;
1718                                 }
1719                         }
1720                 }
1721         }
1722 }
1723
1724 static void lcp2_init(struct ctdb_context *tmp_ctx,
1725                       struct ctdb_ipflags *ipflags,
1726                       struct public_ip_list *all_ips,
1727                       uint32_t *force_rebalance_nodes,
1728                       uint32_t **lcp2_imbalances,
1729                       bool **rebalance_candidates)
1730 {
1731         int i, numnodes;
1732         struct public_ip_list *tmp_ip;
1733
1734         numnodes = talloc_array_length(ipflags);
1735
1736         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1737         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1738         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1739         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1740
1741         for (i=0; i<numnodes; i++) {
1742                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1743                 /* First step: assume all nodes are candidates */
1744                 (*rebalance_candidates)[i] = true;
1745         }
1746
1747         /* 2nd step: if a node has IPs assigned then it must have been
1748          * healthy before, so we remove it from consideration.  This
1749          * is overkill but is all we have because we don't maintain
1750          * state between takeover runs.  An alternative would be to
1751          * keep state and invalidate it every time the recovery master
1752          * changes.
1753          */
1754         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1755                 if (tmp_ip->pnn != -1) {
1756                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1757                 }
1758         }
1759
1760         /* 3rd step: if a node is forced to re-balance then
1761            we allow failback onto the node */
1762         if (force_rebalance_nodes == NULL) {
1763                 return;
1764         }
1765         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1766                 uint32_t pnn = force_rebalance_nodes[i];
1767                 if (pnn >= numnodes) {
1768                         DEBUG(DEBUG_ERR,
1769                               (__location__ "unknown node %u\n", pnn));
1770                         continue;
1771                 }
1772
1773                 DEBUG(DEBUG_NOTICE,
1774                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1775                 (*rebalance_candidates)[pnn] = true;
1776         }
1777 }
1778
1779 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1780  * the IP/node combination that will cost the least.
1781  */
1782 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1783                                      struct ctdb_ipflags *ipflags,
1784                                      struct public_ip_list *all_ips,
1785                                      uint32_t *lcp2_imbalances)
1786 {
1787         struct public_ip_list *tmp_ip;
1788         int dstnode, numnodes;
1789
1790         int minnode;
1791         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1792         struct public_ip_list *minip;
1793
1794         bool should_loop = true;
1795         bool have_unassigned = true;
1796
1797         numnodes = talloc_array_length(ipflags);
1798
1799         while (have_unassigned && should_loop) {
1800                 should_loop = false;
1801
1802                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1803                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1804
1805                 minnode = -1;
1806                 mindsum = 0;
1807                 minip = NULL;
1808
1809                 /* loop over each unassigned ip. */
1810                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1811                         if (tmp_ip->pnn != -1) {
1812                                 continue;
1813                         }
1814
1815                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1816                                 /* only check nodes that can actually takeover this ip */
1817                                 if (!can_node_takeover_ip(ctdb, dstnode,
1818                                                           ipflags[dstnode],
1819                                                           tmp_ip)) {
1820                                         /* no it couldnt   so skip to the next node */
1821                                         continue;
1822                                 }
1823
1824                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1825                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1826                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1827                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1828                                                    dstnode,
1829                                                    dstimbl - lcp2_imbalances[dstnode]));
1830
1831
1832                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1833                                         minnode = dstnode;
1834                                         minimbl = dstimbl;
1835                                         mindsum = dstdsum;
1836                                         minip = tmp_ip;
1837                                         should_loop = true;
1838                                 }
1839                         }
1840                 }
1841
1842                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1843
1844                 /* If we found one then assign it to the given node. */
1845                 if (minnode != -1) {
1846                         minip->pnn = minnode;
1847                         lcp2_imbalances[minnode] = minimbl;
1848                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1849                                           ctdb_addr_to_str(&(minip->addr)),
1850                                           minnode,
1851                                           mindsum));
1852                 }
1853
1854                 /* There might be a better way but at least this is clear. */
1855                 have_unassigned = false;
1856                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1857                         if (tmp_ip->pnn == -1) {
1858                                 have_unassigned = true;
1859                         }
1860                 }
1861         }
1862
1863         /* We know if we have an unassigned addresses so we might as
1864          * well optimise.
1865          */
1866         if (have_unassigned) {
1867                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1868                         if (tmp_ip->pnn == -1) {
1869                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1870                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1871                         }
1872                 }
1873         }
1874 }
1875
1876 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1877  * to move IPs from, determines the best IP/destination node
1878  * combination to move from the source node.
1879  */
1880 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1881                                     struct ctdb_ipflags *ipflags,
1882                                     struct public_ip_list *all_ips,
1883                                     int srcnode,
1884                                     uint32_t *lcp2_imbalances,
1885                                     bool *rebalance_candidates)
1886 {
1887         int dstnode, mindstnode, numnodes;
1888         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1889         uint32_t minsrcimbl, mindstimbl;
1890         struct public_ip_list *minip;
1891         struct public_ip_list *tmp_ip;
1892
1893         /* Find an IP and destination node that best reduces imbalance. */
1894         srcimbl = 0;
1895         minip = NULL;
1896         minsrcimbl = 0;
1897         mindstnode = -1;
1898         mindstimbl = 0;
1899
1900         numnodes = talloc_array_length(ipflags);
1901
1902         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1903         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1904                            srcnode, lcp2_imbalances[srcnode]));
1905
1906         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1907                 /* Only consider addresses on srcnode. */
1908                 if (tmp_ip->pnn != srcnode) {
1909                         continue;
1910                 }
1911
1912                 /* What is this IP address costing the source node? */
1913                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1914                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1915
1916                 /* Consider this IP address would cost each potential
1917                  * destination node.  Destination nodes are limited to
1918                  * those that are newly healthy, since we don't want
1919                  * to do gratuitous failover of IPs just to make minor
1920                  * balance improvements.
1921                  */
1922                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1923                         if (!rebalance_candidates[dstnode]) {
1924                                 continue;
1925                         }
1926
1927                         /* only check nodes that can actually takeover this ip */
1928                         if (!can_node_takeover_ip(ctdb, dstnode,
1929                                                   ipflags[dstnode], tmp_ip)) {
1930                                 /* no it couldnt   so skip to the next node */
1931                                 continue;
1932                         }
1933
1934                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1935                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1936                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1937                                            srcnode, -srcdsum,
1938                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1939                                            dstnode, dstdsum));
1940
1941                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1942                             (dstdsum < srcdsum) &&                      \
1943                             ((mindstnode == -1) ||                              \
1944                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1945
1946                                 minip = tmp_ip;
1947                                 minsrcimbl = srcimbl;
1948                                 mindstnode = dstnode;
1949                                 mindstimbl = dstimbl;
1950                         }
1951                 }
1952         }
1953         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1954
1955         if (mindstnode != -1) {
1956                 /* We found a move that makes things better... */
1957                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1958                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1959                                   ctdb_addr_to_str(&(minip->addr)),
1960                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1961
1962
1963                 lcp2_imbalances[srcnode] = minsrcimbl;
1964                 lcp2_imbalances[mindstnode] = mindstimbl;
1965                 minip->pnn = mindstnode;
1966
1967                 return true;
1968         }
1969
1970         return false;
1971         
1972 }
1973
/* Pairing of a node number with its LCP2 imbalance, used for sorting. */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders
 * entries by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1992
1993 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1994  * node with the highest LCP2 imbalance, and then determines the best
1995  * IP/destination node combination to move from the source node.
1996  */
1997 static void lcp2_failback(struct ctdb_context *ctdb,
1998                           struct ctdb_ipflags *ipflags,
1999                           struct public_ip_list *all_ips,
2000                           uint32_t *lcp2_imbalances,
2001                           bool *rebalance_candidates)
2002 {
2003         int i, numnodes;
2004         struct lcp2_imbalance_pnn * lips;
2005         bool again;
2006
2007         numnodes = talloc_array_length(ipflags);
2008
2009 try_again:
2010         /* Put the imbalances and nodes into an array, sort them and
2011          * iterate through candidates.  Usually the 1st one will be
2012          * used, so this doesn't cost much...
2013          */
2014         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2015         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2016         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2017         for (i=0; i<numnodes; i++) {
2018                 lips[i].imbalance = lcp2_imbalances[i];
2019                 lips[i].pnn = i;
2020                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2021         }
2022         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2023               lcp2_cmp_imbalance_pnn);
2024
2025         again = false;
2026         for (i=0; i<numnodes; i++) {
2027                 /* This means that all nodes had 0 or 1 addresses, so
2028                  * can't be imbalanced.
2029                  */
2030                 if (lips[i].imbalance == 0) {
2031                         break;
2032                 }
2033
2034                 if (lcp2_failback_candidate(ctdb,
2035                                             ipflags,
2036                                             all_ips,
2037                                             lips[i].pnn,
2038                                             lcp2_imbalances,
2039                                             rebalance_candidates)) {
2040                         again = true;
2041                         break;
2042                 }
2043         }
2044
2045         talloc_free(lips);
2046         if (again) {
2047                 goto try_again;
2048         }
2049 }
2050
2051 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2052                                     struct ctdb_ipflags *ipflags,
2053                                     struct public_ip_list *all_ips)
2054 {
2055         struct public_ip_list *tmp_ip;
2056
2057         /* verify that the assigned nodes can serve that public ip
2058            and set it to -1 if not
2059         */
2060         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2061                 if (tmp_ip->pnn == -1) {
2062                         continue;
2063                 }
2064                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2065                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2066                         /* this node can not serve this ip. */
2067                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2068                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2069                                            tmp_ip->pnn));
2070                         tmp_ip->pnn = -1;
2071                 }
2072         }
2073 }
2074
2075 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2076                                        struct ctdb_ipflags *ipflags,
2077                                        struct public_ip_list *all_ips)
2078 {
2079         struct public_ip_list *tmp_ip;
2080         int i, numnodes;
2081
2082         numnodes = talloc_array_length(ipflags);
2083
2084         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2085        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2086         *  always be allocated the same way for a specific set of
2087         *  available/unavailable nodes.
2088         */
2089
2090         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2091                 tmp_ip->pnn = i % numnodes;
2092         }
2093
2094         /* IP failback doesn't make sense with deterministic
2095          * IPs, since the modulo step above implicitly fails
2096          * back IPs to their "home" node.
2097          */
2098         if (1 == ctdb->tunable.no_ip_failback) {
2099                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2100         }
2101
2102         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2103
2104         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2105
2106         /* No failback here! */
2107 }
2108
2109 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2110                                           struct ctdb_ipflags *ipflags,
2111                                           struct public_ip_list *all_ips)
2112 {
2113         /* This should be pushed down into basic_failback. */
2114         struct public_ip_list *tmp_ip;
2115         int num_ips = 0;
2116         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2117                 num_ips++;
2118         }
2119
2120         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2121
2122         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2123
2124         /* If we don't want IPs to fail back then don't rebalance IPs. */
2125         if (1 == ctdb->tunable.no_ip_failback) {
2126                 return;
2127         }
2128
2129         /* Now, try to make sure the ip adresses are evenly distributed
2130            across the nodes.
2131         */
2132         basic_failback(ctdb, ipflags, all_ips, num_ips);
2133 }
2134
2135 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2136                           struct ctdb_ipflags *ipflags,
2137                           struct public_ip_list *all_ips,
2138                           uint32_t *force_rebalance_nodes)
2139 {
2140         uint32_t *lcp2_imbalances;
2141         bool *rebalance_candidates;
2142         int numnodes, num_rebalance_candidates, i;
2143
2144         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2145
2146         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2147
2148         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2149                   &lcp2_imbalances, &rebalance_candidates);
2150
2151         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2152
2153         /* If we don't want IPs to fail back then don't rebalance IPs. */
2154         if (1 == ctdb->tunable.no_ip_failback) {
2155                 goto finished;
2156         }
2157
2158         /* It is only worth continuing if we have suitable target
2159          * nodes to transfer IPs to.  This check is much cheaper than
2160          * continuing on...
2161          */
2162         numnodes = talloc_array_length(ipflags);
2163         num_rebalance_candidates = 0;
2164         for (i=0; i<numnodes; i++) {
2165                 if (rebalance_candidates[i]) {
2166                         num_rebalance_candidates++;
2167                 }
2168         }
2169         if (num_rebalance_candidates == 0) {
2170                 goto finished;
2171         }
2172
2173         /* Now, try to make sure the ip adresses are evenly distributed
2174            across the nodes.
2175         */
2176         lcp2_failback(ctdb, ipflags, all_ips,
2177                       lcp2_imbalances, rebalance_candidates);
2178
2179 finished:
2180         talloc_free(tmp_ctx);
2181 }
2182
2183 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2184 {
2185         int i;
2186
2187         for (i=0;i<nodemap->num;i++) {
2188                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2189                         /* Found one completely healthy node */
2190                         return false;
2191                 }
2192         }
2193
2194         return true;
2195 }
2196
2197 /* The calculation part of the IP allocation algorithm. */
2198 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2199                                    struct ctdb_ipflags *ipflags,
2200                                    struct public_ip_list **all_ips_p,
2201                                    uint32_t *force_rebalance_nodes)
2202 {
2203         /* since nodes only know about those public addresses that
2204            can be served by that particular node, no single node has
2205            a full list of all public addresses that exist in the cluster.
2206            Walk over all node structures and create a merged list of
2207            all public addresses that exist in the cluster.
2208
2209            keep the tree of ips around as ctdb->ip_tree
2210         */
2211         *all_ips_p = create_merged_ip_list(ctdb);
2212
2213         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2214                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2215         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2216                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2217         } else {
2218                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2219         }
2220
2221         /* at this point ->pnn is the node which will own each IP
2222            or -1 if there is no node that can cover this ip
2223         */
2224
2225         return;
2226 }
2227
2228 struct get_tunable_callback_data {
2229         const char *tunable;
2230         uint32_t *out;
2231         bool fatal;
2232 };
2233
2234 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2235                                  int32_t res, TDB_DATA outdata,
2236                                  void *callback)
2237 {
2238         struct get_tunable_callback_data *cd =
2239                 (struct get_tunable_callback_data *)callback;
2240         int size;
2241
2242         if (res != 0) {
2243                 /* Already handled in fail callback */
2244                 return;
2245         }
2246
2247         if (outdata.dsize != sizeof(uint32_t)) {
2248                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2249                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2250                                  (int)outdata.dsize));
2251                 cd->fatal = true;
2252                 return;
2253         }
2254
2255         size = talloc_array_length(cd->out);
2256         if (pnn >= size) {
2257                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2258                                  cd->tunable, pnn, size));
2259                 return;
2260         }
2261
2262                 
2263         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2264 }
2265
2266 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2267                                        int32_t res, TDB_DATA outdata,
2268                                        void *callback)
2269 {
2270         struct get_tunable_callback_data *cd =
2271                 (struct get_tunable_callback_data *)callback;
2272
2273         switch (res) {
2274         case -ETIME:
2275                 DEBUG(DEBUG_ERR,
2276                       ("Timed out getting tunable \"%s\" from node %d\n",
2277                        cd->tunable, pnn));
2278                 cd->fatal = true;
2279                 break;
2280         case -EINVAL:
2281         case -1:
2282                 DEBUG(DEBUG_WARNING,
2283                       ("Tunable \"%s\" not implemented on node %d\n",
2284                        cd->tunable, pnn));
2285                 break;
2286         default:
2287                 DEBUG(DEBUG_ERR,
2288                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2289                        cd->tunable, pnn));
2290                 cd->fatal = true;
2291         }
2292 }
2293
2294 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2295                                         TALLOC_CTX *tmp_ctx,
2296                                         struct ctdb_node_map_old *nodemap,
2297                                         const char *tunable,
2298                                         uint32_t default_value)
2299 {
2300         TDB_DATA data;
2301         struct ctdb_control_get_tunable *t;
2302         uint32_t *nodes;
2303         uint32_t *tvals;
2304         struct get_tunable_callback_data callback_data;
2305         int i;
2306
2307         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2308         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2309         for (i=0; i<nodemap->num; i++) {
2310                 tvals[i] = default_value;
2311         }
2312                 
2313         callback_data.out = tvals;
2314         callback_data.tunable = tunable;
2315         callback_data.fatal = false;
2316
2317         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2318         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2319         t = (struct ctdb_control_get_tunable *)data.dptr;
2320         t->length = strlen(tunable)+1;
2321         memcpy(t->name, tunable, t->length);
2322         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2323         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2324                                       nodes, 0, TAKEOVER_TIMEOUT(),
2325                                       false, data,
2326                                       get_tunable_callback,
2327                                       get_tunable_fail_callback,
2328                                       &callback_data) != 0) {
2329                 if (callback_data.fatal) {
2330                         talloc_free(tvals);
2331                         tvals = NULL;
2332                 }
2333         }
2334         talloc_free(nodes);
2335         talloc_free(data.dptr);
2336
2337         return tvals;
2338 }
2339
2340 /* Set internal flags for IP allocation:
2341  *   Clear ip flags
2342  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2343  *   Set NOIPHOST ip flag for each INACTIVE node
2344  *   if all nodes are disabled:
2345  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2346  *   else
2347  *     Set NOIPHOST ip flags for disabled nodes
2348  */
2349 static struct ctdb_ipflags *
2350 set_ipflags_internal(struct ctdb_context *ctdb,
2351                      TALLOC_CTX *tmp_ctx,
2352                      struct ctdb_node_map_old *nodemap,
2353                      uint32_t *tval_noiptakeover,
2354                      uint32_t *tval_noiphostonalldisabled)
2355 {
2356         int i;
2357         struct ctdb_ipflags *ipflags;
2358
2359         /* Clear IP flags - implicit due to talloc_zero */
2360         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2361         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2362
2363         for (i=0;i<nodemap->num;i++) {
2364                 /* Can not take IPs on node with NoIPTakeover set */
2365                 if (tval_noiptakeover[i] != 0) {
2366                         ipflags[i].noiptakeover = true;
2367                 }
2368
2369                 /* Can not host IPs on INACTIVE node */
2370                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2371                         ipflags[i].noiphost = true;
2372                 }
2373         }
2374
2375         if (all_nodes_are_disabled(nodemap)) {
2376                 /* If all nodes are disabled, can not host IPs on node
2377                  * with NoIPHostOnAllDisabled set
2378                  */
2379                 for (i=0;i<nodemap->num;i++) {
2380                         if (tval_noiphostonalldisabled[i] != 0) {
2381                                 ipflags[i].noiphost = true;
2382                         }
2383                 }
2384         } else {
2385                 /* If some nodes are not disabled, then can not host
2386                  * IPs on DISABLED node
2387                  */
2388                 for (i=0;i<nodemap->num;i++) {
2389                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2390                                 ipflags[i].noiphost = true;
2391                         }
2392                 }
2393         }
2394
2395         return ipflags;
2396 }
2397
2398 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2399                                         TALLOC_CTX *tmp_ctx,
2400                                         struct ctdb_node_map_old *nodemap)
2401 {
2402         uint32_t *tval_noiptakeover;
2403         uint32_t *tval_noiphostonalldisabled;
2404         struct ctdb_ipflags *ipflags;
2405
2406
2407         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2408                                                    "NoIPTakeover", 0);
2409         if (tval_noiptakeover == NULL) {
2410                 return NULL;
2411         }
2412
2413         tval_noiphostonalldisabled =
2414                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2415                                        "NoIPHostOnAllDisabled", 0);
2416         if (tval_noiphostonalldisabled == NULL) {
2417                 /* Caller frees tmp_ctx */
2418                 return NULL;
2419         }
2420
2421         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2422                                        tval_noiptakeover,
2423                                        tval_noiphostonalldisabled);
2424
2425         talloc_free(tval_noiptakeover);
2426         talloc_free(tval_noiphostonalldisabled);
2427
2428         return ipflags;
2429 }
2430
2431 struct iprealloc_callback_data {
2432         bool *retry_nodes;
2433         int retry_count;
2434         client_async_callback fail_callback;
2435         void *fail_callback_data;
2436         struct ctdb_node_map_old *nodemap;
2437 };
2438
2439 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2440                                         int32_t res, TDB_DATA outdata,
2441                                         void *callback)
2442 {
2443         int numnodes;
2444         struct iprealloc_callback_data *cd =
2445                 (struct iprealloc_callback_data *)callback;
2446
2447         numnodes = talloc_array_length(cd->retry_nodes);
2448         if (pnn > numnodes) {
2449                 DEBUG(DEBUG_ERR,
2450                       ("ipreallocated failure from node %d, "
2451                        "but only %d nodes in nodemap\n",
2452                        pnn, numnodes));
2453                 return;
2454         }
2455
2456         /* Can't run the "ipreallocated" event on a INACTIVE node */
2457         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2458                 DEBUG(DEBUG_WARNING,
2459                       ("ipreallocated failed on inactive node %d, ignoring\n",
2460                        pnn));
2461                 return;
2462         }
2463
2464         switch (res) {
2465         case -ETIME:
2466                 /* If the control timed out then that's a real error,
2467                  * so call the real fail callback
2468                  */
2469                 if (cd->fail_callback) {
2470                         cd->fail_callback(ctdb, pnn, res, outdata,
2471                                           cd->fail_callback_data);
2472                 } else {
2473                         DEBUG(DEBUG_WARNING,
2474                               ("iprealloc timed out but no callback registered\n"));
2475                 }
2476                 break;
2477         default:
2478                 /* If not a timeout then either the ipreallocated
2479                  * eventscript (or some setup) failed.  This might
2480                  * have failed because the IPREALLOCATED control isn't
2481                  * implemented - right now there is no way of knowing
2482                  * because the error codes are all folded down to -1.
2483                  * Consider retrying using EVENTSCRIPT control...
2484                  */
2485                 DEBUG(DEBUG_WARNING,
2486                       ("ipreallocated failure from node %d, flagging retry\n",
2487                        pnn));
2488                 cd->retry_nodes[pnn] = true;
2489                 cd->retry_count++;
2490         }
2491 }
2492
2493 struct takeover_callback_data {
2494         bool *node_failed;
2495         client_async_callback fail_callback;
2496         void *fail_callback_data;
2497         struct ctdb_node_map_old *nodemap;
2498 };
2499
2500 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2501                                        uint32_t node_pnn, int32_t res,
2502                                        TDB_DATA outdata, void *callback_data)
2503 {
2504         struct takeover_callback_data *cd =
2505                 talloc_get_type_abort(callback_data,
2506                                       struct takeover_callback_data);
2507         int i;
2508
2509         for (i = 0; i < cd->nodemap->num; i++) {
2510                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2511                         break;
2512                 }
2513         }
2514
2515         if (i == cd->nodemap->num) {
2516                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2517                 return;
2518         }
2519
2520         if (!cd->node_failed[i]) {
2521                 cd->node_failed[i] = true;
2522                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2523                                   cd->fail_callback_data);
2524         }
2525 }
2526
2527 /*
2528   make any IP alias changes for public addresses that are necessary 
2529  */
2530 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2531                       uint32_t *force_rebalance_nodes,
2532                       client_async_callback fail_callback, void *callback_data)
2533 {
2534         int i, j, ret;
2535         struct ctdb_public_ip ip;
2536         uint32_t *nodes;
2537         struct public_ip_list *all_ips, *tmp_ip;
2538         TDB_DATA data;
2539         struct timeval timeout;
2540         struct client_async_data *async_data;
2541         struct ctdb_client_control_state *state;
2542         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2543         struct ctdb_ipflags *ipflags;
2544         struct takeover_callback_data *takeover_data;
2545         struct iprealloc_callback_data iprealloc_data;
2546         bool *retry_data;
2547         bool can_host_ips;
2548
2549         /*
2550          * ip failover is completely disabled, just send out the 
2551          * ipreallocated event.
2552          */
2553         if (ctdb->tunable.disable_ip_failover != 0) {
2554                 goto ipreallocated;
2555         }
2556
2557         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2558         if (ipflags == NULL) {
2559                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2560                 talloc_free(tmp_ctx);
2561                 return -1;
2562         }
2563
2564         /* Fetch known/available public IPs from each active node */
2565         ret = ctdb_reload_remote_public_ips(ctdb, nodemap);
2566         if (ret != 0) {
2567                 talloc_free(tmp_ctx);
2568                 return -1;
2569         }
2570
2571         /* Short-circuit IP allocation if no node has available IPs */
2572         can_host_ips = false;
2573         for (i=0; i < ctdb->num_nodes; i++) {
2574                 if (ctdb->nodes[i]->available_public_ips != NULL) {
2575                         can_host_ips = true;
2576                 }
2577         }
2578         if (!can_host_ips) {
2579                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2580                 return 0;
2581         }
2582
2583         /* Do the IP reassignment calculations */
2584         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2585
2586         /* Now tell all nodes to release any public IPs should not
2587          * host.  This will be a NOOP on nodes that don't currently
2588          * hold the given IP.
2589          */
2590         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2591         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2592
2593         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2594                                                        bool, nodemap->num);
2595         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2596         takeover_data->fail_callback = fail_callback;
2597         takeover_data->fail_callback_data = callback_data;
2598         takeover_data->nodemap = nodemap;
2599
2600         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2601         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2602
2603         async_data->fail_callback = takeover_run_fail_callback;
2604         async_data->callback_data = takeover_data;
2605
2606         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2607
2608         /* Send a RELEASE_IP to all nodes that should not be hosting
2609          * each IP.  For each IP, all but one of these will be
2610          * redundant.  However, the redundant ones are used to tell
2611          * nodes which node should be hosting the IP so that commands
2612          * like "ctdb ip" can display a particular nodes idea of who
2613          * is hosting what. */
2614         for (i=0;i<nodemap->num;i++) {
2615                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2616                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2617                         continue;
2618                 }
2619
2620                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2621                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2622                                 /* This node should be serving this
2623                                    vnn so don't tell it to release the ip
2624                                 */
2625                                 continue;
2626                         }
2627                         ip.pnn  = tmp_ip->pnn;
2628                         ip.addr = tmp_ip->addr;
2629
2630                         timeout = TAKEOVER_TIMEOUT();
2631                         data.dsize = sizeof(ip);
2632                         data.dptr  = (uint8_t *)&ip;
2633                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2634                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2635                                                   data, async_data,
2636                                                   &timeout, NULL);
2637                         if (state == NULL) {
2638                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2639                                 talloc_free(tmp_ctx);
2640                                 return -1;
2641                         }
2642
2643                         ctdb_client_async_add(async_data, state);
2644                 }
2645         }
2646         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2647                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2648                 talloc_free(tmp_ctx);
2649                 return -1;
2650         }
2651         talloc_free(async_data);
2652
2653
2654         /* For each IP, send a TAKOVER_IP to the node that should be
2655          * hosting it.  Many of these will often be redundant (since
2656          * the allocation won't have changed) but they can be useful
2657          * to recover from inconsistencies. */
2658         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2659         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2660
2661         async_data->fail_callback = fail_callback;
2662         async_data->callback_data = callback_data;
2663
2664         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2665                 if (tmp_ip->pnn == -1) {
2666                         /* this IP won't be taken over */
2667                         continue;
2668                 }
2669
2670                 ip.pnn  = tmp_ip->pnn;
2671                 ip.addr = tmp_ip->addr;
2672
2673                 timeout = TAKEOVER_TIMEOUT();
2674                 data.dsize = sizeof(ip);
2675                 data.dptr  = (uint8_t *)&ip;
2676                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2677                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2678                                           data, async_data, &timeout, NULL);
2679                 if (state == NULL) {
2680                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2681                         talloc_free(tmp_ctx);
2682                         return -1;
2683                 }
2684
2685                 ctdb_client_async_add(async_data, state);
2686         }
2687         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2688                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2689                 talloc_free(tmp_ctx);
2690                 return -1;
2691         }
2692
2693 ipreallocated:
2694         /*
2695          * Tell all nodes to run eventscripts to process the
2696          * "ipreallocated" event.  This can do a lot of things,
2697          * including restarting services to reconfigure them if public
2698          * IPs have moved.  Once upon a time this event only used to
2699          * update natgw.
2700          */
2701         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2702         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2703         iprealloc_data.retry_nodes = retry_data;
2704         iprealloc_data.retry_count = 0;
2705         iprealloc_data.fail_callback = fail_callback;
2706         iprealloc_data.fail_callback_data = callback_data;
2707         iprealloc_data.nodemap = nodemap;
2708
2709         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2710         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2711                                         nodes, 0, TAKEOVER_TIMEOUT(),
2712                                         false, tdb_null,
2713                                         NULL, iprealloc_fail_callback,
2714                                         &iprealloc_data);
2715         if (ret != 0) {
2716                 /* If the control failed then we should retry to any
2717                  * nodes flagged by iprealloc_fail_callback using the
2718                  * EVENTSCRIPT control.  This is a best-effort at
2719                  * backward compatiblity when running a mixed cluster
2720                  * where some nodes have not yet been upgraded to
2721                  * support the IPREALLOCATED control.
2722                  */
2723                 DEBUG(DEBUG_WARNING,
2724                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2725
2726                 nodes = talloc_array(tmp_ctx, uint32_t,
2727                                      iprealloc_data.retry_count);
2728                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2729
2730                 j = 0;
2731                 for (i=0; i<nodemap->num; i++) {
2732                         if (iprealloc_data.retry_nodes[i]) {
2733                                 nodes[j] = i;
2734                                 j++;
2735                         }
2736                 }
2737
2738                 data.dptr  = discard_const("ipreallocated");
2739                 data.dsize = strlen((char *)data.dptr) + 1; 
2740                 ret = ctdb_client_async_control(ctdb,
2741                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2742                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2743                                                 false, data,
2744                                                 NULL, fail_callback,
2745                                                 callback_data);
2746                 if (ret != 0) {
2747                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2748                 }
2749         }
2750
2751         talloc_free(tmp_ctx);
2752         return ret;
2753 }
2754
2755
2756 /*
2757   destroy a ctdb_client_ip structure
2758  */
2759 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2760 {
2761         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2762                 ctdb_addr_to_str(&ip->addr),
2763                 ntohs(ip->addr.ip.sin_port),
2764                 ip->client_id));
2765
2766         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2767         return 0;
2768 }
2769
2770 /*
2771   called by a client to inform us of a TCP connection that it is managing
2772   that should tickled with an ACK when IP takeover is done
2773  */
2774 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2775                                 TDB_DATA indata)
2776 {
2777         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2778         struct ctdb_connection *tcp_sock = NULL;
2779         struct ctdb_tcp_list *tcp;
2780         struct ctdb_connection t;
2781         int ret;
2782         TDB_DATA data;
2783         struct ctdb_client_ip *ip;
2784         struct ctdb_vnn *vnn;
2785         ctdb_sock_addr addr;
2786
2787         /* If we don't have public IPs, tickles are useless */
2788         if (ctdb->vnn == NULL) {
2789                 return 0;
2790         }
2791
2792         tcp_sock = (struct ctdb_connection *)indata.dptr;
2793
2794         addr = tcp_sock->src;
2795         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2796         addr = tcp_sock->dst;
2797         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2798
2799         ZERO_STRUCT(addr);
2800         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2801         vnn = find_public_ip_vnn(ctdb, &addr);
2802         if (vnn == NULL) {
2803                 switch (addr.sa.sa_family) {
2804                 case AF_INET:
2805                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2806                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2807                                         ctdb_addr_to_str(&addr)));
2808                         }
2809                         break;
2810                 case AF_INET6:
2811                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2812                                 ctdb_addr_to_str(&addr)));
2813                         break;
2814                 default:
2815                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2816                 }
2817
2818                 return 0;
2819         }
2820
2821         if (vnn->pnn != ctdb->pnn) {
2822                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2823                         ctdb_addr_to_str(&addr),
2824                         client_id, client->pid));
2825                 /* failing this call will tell smbd to die */
2826                 return -1;
2827         }
2828
2829         ip = talloc(client, struct ctdb_client_ip);
2830         CTDB_NO_MEMORY(ctdb, ip);
2831
2832         ip->ctdb      = ctdb;
2833         ip->addr      = addr;
2834         ip->client_id = client_id;
2835         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2836         DLIST_ADD(ctdb->client_ip_list, ip);
2837
2838         tcp = talloc(client, struct ctdb_tcp_list);
2839         CTDB_NO_MEMORY(ctdb, tcp);
2840
2841         tcp->connection.src = tcp_sock->src;
2842         tcp->connection.dst = tcp_sock->dst;
2843
2844         DLIST_ADD(client->tcp_list, tcp);
2845
2846         t.src = tcp_sock->src;
2847         t.dst = tcp_sock->dst;
2848
2849         data.dptr = (uint8_t *)&t;
2850         data.dsize = sizeof(t);
2851
2852         switch (addr.sa.sa_family) {
2853         case AF_INET:
2854                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2855                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2856                         ctdb_addr_to_str(&tcp_sock->src),
2857                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2858                 break;
2859         case AF_INET6:
2860                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2861                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2862                         ctdb_addr_to_str(&tcp_sock->src),
2863                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2864                 break;
2865         default:
2866                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2867         }
2868
2869
2870         /* tell all nodes about this tcp connection */
2871         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2872                                        CTDB_CONTROL_TCP_ADD,
2873                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2874         if (ret != 0) {
2875                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2876                 return -1;
2877         }
2878
2879         return 0;
2880 }
2881
2882 /*
2883   find a tcp address on a list
2884  */
2885 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2886                                            struct ctdb_connection *tcp)
2887 {
2888         int i;
2889
2890         if (array == NULL) {
2891                 return NULL;
2892         }
2893
2894         for (i=0;i<array->num;i++) {
2895                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2896                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2897                         return &array->connections[i];
2898                 }
2899         }
2900         return NULL;
2901 }
2902
2903
2904
2905 /*
2906   called by a daemon to inform us of a TCP connection that one of its
2907   clients managing that should tickled with an ACK when IP takeover is
2908   done
2909  */
2910 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2911 {
2912         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2913         struct ctdb_tcp_array *tcparray;
2914         struct ctdb_connection tcp;
2915         struct ctdb_vnn *vnn;
2916
2917         /* If we don't have public IPs, tickles are useless */
2918         if (ctdb->vnn == NULL) {
2919                 return 0;
2920         }
2921
2922         vnn = find_public_ip_vnn(ctdb, &p->dst);
2923         if (vnn == NULL) {
2924                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2925                         ctdb_addr_to_str(&p->dst)));
2926
2927                 return -1;
2928         }
2929
2930
2931         tcparray = vnn->tcp_array;
2932
2933         /* If this is the first tickle */
2934         if (tcparray == NULL) {
2935                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2936                 CTDB_NO_MEMORY(ctdb, tcparray);
2937                 vnn->tcp_array = tcparray;
2938
2939                 tcparray->num = 0;
2940                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2941                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2942
2943                 tcparray->connections[tcparray->num].src = p->src;
2944                 tcparray->connections[tcparray->num].dst = p->dst;
2945                 tcparray->num++;
2946
2947                 if (tcp_update_needed) {
2948                         vnn->tcp_update_needed = true;
2949                 }
2950                 return 0;
2951         }
2952
2953
2954         /* Do we already have this tickle ?*/
2955         tcp.src = p->src;
2956         tcp.dst = p->dst;
2957         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2958                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2959                         ctdb_addr_to_str(&tcp.dst),
2960                         ntohs(tcp.dst.ip.sin_port),
2961                         vnn->pnn));
2962                 return 0;
2963         }
2964
2965         /* A new tickle, we must add it to the array */
2966         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2967                                         struct ctdb_connection,
2968                                         tcparray->num+1);
2969         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2970
2971         tcparray->connections[tcparray->num].src = p->src;
2972         tcparray->connections[tcparray->num].dst = p->dst;
2973         tcparray->num++;
2974
2975         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2976                 ctdb_addr_to_str(&tcp.dst),
2977                 ntohs(tcp.dst.ip.sin_port),
2978                 vnn->pnn));
2979
2980         if (tcp_update_needed) {
2981                 vnn->tcp_update_needed = true;
2982         }
2983
2984         return 0;
2985 }
2986
2987
2988 /*
2989   called by a daemon to inform us of a TCP connection that one of its
2990   clients managing that should tickled with an ACK when IP takeover is
2991   done
2992  */
2993 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
2994 {
2995         struct ctdb_connection *tcpp;
2996         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
2997
2998         if (vnn == NULL) {
2999                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3000                         ctdb_addr_to_str(&conn->dst)));
3001                 return;
3002         }
3003
3004         /* if the array is empty we cant remove it
3005            and we don't need to do anything
3006          */
3007         if (vnn->tcp_array == NULL) {
3008                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3009                         ctdb_addr_to_str(&conn->dst),
3010                         ntohs(conn->dst.ip.sin_port)));
3011                 return;
3012         }
3013
3014
3015         /* See if we know this connection
3016            if we don't know this connection  then we dont need to do anything
3017          */
3018         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3019         if (tcpp == NULL) {
3020                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3021                         ctdb_addr_to_str(&conn->dst),
3022                         ntohs(conn->dst.ip.sin_port)));
3023                 return;
3024         }
3025
3026
3027         /* We need to remove this entry from the array.
3028            Instead of allocating a new array and copying data to it
3029            we cheat and just copy the last entry in the existing array
3030            to the entry that is to be removed and just shring the 
3031            ->num field
3032          */
3033         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3034         vnn->tcp_array->num--;
3035
3036         /* If we deleted the last entry we also need to remove the entire array
3037          */
3038         if (vnn->tcp_array->num == 0) {
3039                 talloc_free(vnn->tcp_array);
3040                 vnn->tcp_array = NULL;
3041         }               
3042
3043         vnn->tcp_update_needed = true;
3044
3045         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3046                 ctdb_addr_to_str(&conn->src),
3047                 ntohs(conn->src.ip.sin_port)));
3048 }
3049
3050
3051 /*
3052   called by a daemon to inform us of a TCP connection that one of its
3053   clients used are no longer needed in the tickle database
3054  */
3055 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3056 {
3057         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3058
3059         /* If we don't have public IPs, tickles are useless */
3060         if (ctdb->vnn == NULL) {
3061                 return 0;
3062         }
3063
3064         ctdb_remove_connection(ctdb, conn);
3065
3066         return 0;
3067 }
3068
3069
3070 /*
3071   Called when another daemon starts - causes all tickles for all
3072   public addresses we are serving to be sent to the new node on the
3073   next check.  This actually causes the next scheduled call to
3074   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3075   doesn't require careful error handling.
3076  */
3077 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3078 {
3079         struct ctdb_vnn *vnn;
3080
3081         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3082                            (unsigned long) pnn));
3083
3084         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3085                 vnn->tcp_update_needed = true;
3086         }
3087
3088         return 0;
3089 }
3090
3091
3092 /*
3093   called when a client structure goes away - hook to remove
3094   elements from the tcp_list in all daemons
3095  */
3096 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3097 {
3098         while (client->tcp_list) {
3099                 struct ctdb_tcp_list *tcp = client->tcp_list;
3100                 DLIST_REMOVE(client->tcp_list, tcp);
3101                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3102         }
3103 }
3104
3105
3106 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3107 {
3108         struct ctdb_vnn *vnn;
3109         int count = 0;
3110
3111         if (ctdb->tunable.disable_ip_failover == 1) {
3112                 return;
3113         }
3114
3115         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3116                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3117                         ctdb_vnn_unassign_iface(ctdb, vnn);
3118                         continue;
3119                 }
3120                 if (!vnn->iface) {
3121                         continue;
3122                 }
3123
3124                 /* Don't allow multiple releases at once.  Some code,
3125                  * particularly ctdb_tickle_sentenced_connections() is
3126                  * not re-entrant */
3127                 if (vnn->update_in_flight) {
3128                         DEBUG(DEBUG_WARNING,
3129                               (__location__
3130                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3131                                     ctdb_addr_to_str(&vnn->public_address),
3132                                     vnn->public_netmask_bits,
3133                                     ctdb_vnn_iface_string(vnn)));
3134                         continue;
3135                 }
3136                 vnn->update_in_flight = true;
3137
3138                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3139                                     ctdb_addr_to_str(&vnn->public_address),
3140                                     vnn->public_netmask_bits,
3141                                     ctdb_vnn_iface_string(vnn)));
3142
3143                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3144                                   ctdb_vnn_iface_string(vnn),
3145                                   ctdb_addr_to_str(&vnn->public_address),
3146                                   vnn->public_netmask_bits);
3147                 release_kill_clients(ctdb, &vnn->public_address);
3148                 ctdb_vnn_unassign_iface(ctdb, vnn);
3149                 vnn->update_in_flight = false;
3150                 count++;
3151         }
3152
3153         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3154 }
3155
3156
3157 /*
3158   get list of public IPs
3159  */
3160 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3161                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3162 {
3163         int i, num, len;
3164         struct ctdb_public_ip_list_old *ips;
3165         struct ctdb_vnn *vnn;
3166         bool only_available = false;
3167
3168         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3169                 only_available = true;
3170         }
3171
3172         /* count how many public ip structures we have */
3173         num = 0;
3174         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3175                 num++;
3176         }
3177
3178         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3179                 num*sizeof(struct ctdb_public_ip);
3180         ips = talloc_zero_size(outdata, len);
3181         CTDB_NO_MEMORY(ctdb, ips);
3182
3183         i = 0;
3184         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3185                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3186                         continue;
3187                 }
3188                 ips->ips[i].pnn  = vnn->pnn;
3189                 ips->ips[i].addr = vnn->public_address;
3190                 i++;
3191         }
3192         ips->num = i;
3193         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3194                 i*sizeof(struct ctdb_public_ip);
3195
3196         outdata->dsize = len;
3197         outdata->dptr  = (uint8_t *)ips;
3198
3199         return 0;
3200 }
3201
3202
3203 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3204                                         struct ctdb_req_control_old *c,
3205                                         TDB_DATA indata,
3206                                         TDB_DATA *outdata)
3207 {
3208         int i, num, len;
3209         ctdb_sock_addr *addr;
3210         struct ctdb_public_ip_info_old *info;
3211         struct ctdb_vnn *vnn;
3212
3213         addr = (ctdb_sock_addr *)indata.dptr;
3214
3215         vnn = find_public_ip_vnn(ctdb, addr);
3216         if (vnn == NULL) {
3217                 /* if it is not a public ip   it could be our 'single ip' */
3218                 if (ctdb->single_ip_vnn) {
3219                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3220                                 vnn = ctdb->single_ip_vnn;
3221                         }
3222                 }
3223         }
3224         if (vnn == NULL) {
3225                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3226                                  "'%s'not a public address\n",
3227                                  ctdb_addr_to_str(addr)));
3228                 return -1;
3229         }
3230
3231         /* count how many public ip structures we have */
3232         num = 0;
3233         for (;vnn->ifaces[num];) {
3234                 num++;
3235         }
3236
3237         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3238                 num*sizeof(struct ctdb_iface);
3239         info = talloc_zero_size(outdata, len);
3240         CTDB_NO_MEMORY(ctdb, info);
3241
3242         info->ip.addr = vnn->public_address;
3243         info->ip.pnn = vnn->pnn;
3244         info->active_idx = 0xFFFFFFFF;
3245
3246         for (i=0; vnn->ifaces[i]; i++) {
3247                 struct ctdb_interface *cur;
3248
3249                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3250                 if (cur == NULL) {
3251                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3252                                            vnn->ifaces[i]));
3253                         return -1;
3254                 }
3255                 if (vnn->iface == cur) {
3256                         info->active_idx = i;
3257                 }
3258                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3259                 info->ifaces[i].link_state = cur->link_up;
3260                 info->ifaces[i].references = cur->references;
3261         }
3262         info->num = i;
3263         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3264                 i*sizeof(struct ctdb_iface);
3265
3266         outdata->dsize = len;
3267         outdata->dptr  = (uint8_t *)info;
3268
3269         return 0;
3270 }
3271
3272 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3273                                 struct ctdb_req_control_old *c,
3274                                 TDB_DATA *outdata)
3275 {
3276         int i, num, len;
3277         struct ctdb_iface_list_old *ifaces;
3278         struct ctdb_interface *cur;
3279
3280         /* count how many public ip structures we have */
3281         num = 0;
3282         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3283                 num++;
3284         }
3285
3286         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3287                 num*sizeof(struct ctdb_iface);
3288         ifaces = talloc_zero_size(outdata, len);
3289         CTDB_NO_MEMORY(ctdb, ifaces);
3290
3291         i = 0;
3292         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3293                 strcpy(ifaces->ifaces[i].name, cur->name);
3294                 ifaces->ifaces[i].link_state = cur->link_up;
3295                 ifaces->ifaces[i].references = cur->references;
3296                 i++;
3297         }
3298         ifaces->num = i;
3299         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3300                 i*sizeof(struct ctdb_iface);
3301
3302         outdata->dsize = len;
3303         outdata->dptr  = (uint8_t *)ifaces;
3304
3305         return 0;
3306 }
3307
3308 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3309                                     struct ctdb_req_control_old *c,
3310                                     TDB_DATA indata)
3311 {
3312         struct ctdb_iface *info;
3313         struct ctdb_interface *iface;
3314         bool link_up = false;
3315
3316         info = (struct ctdb_iface *)indata.dptr;
3317
3318         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3319                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3320                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3321                                   len, len, info->name));
3322                 return -1;
3323         }
3324
3325         switch (info->link_state) {
3326         case 0:
3327                 link_up = false;
3328                 break;
3329         case 1:
3330                 link_up = true;
3331                 break;
3332         default:
3333                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3334                                   (unsigned int)info->link_state));
3335                 return -1;
3336         }
3337
3338         if (info->references != 0) {
3339                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3340                                   (unsigned int)info->references));
3341                 return -1;
3342         }
3343
3344         iface = ctdb_find_iface(ctdb, info->name);
3345         if (iface == NULL) {
3346                 return -1;
3347         }
3348
3349         if (link_up == iface->link_up) {
3350                 return 0;
3351         }
3352
3353         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3354               ("iface[%s] has changed it's link status %s => %s\n",
3355                iface->name,
3356                iface->link_up?"up":"down",
3357                link_up?"up":"down"));
3358
3359         iface->link_up = link_up;
3360         return 0;
3361 }
3362
3363
3364 /* 
3365    structure containing the listening socket and the list of tcp connections
3366    that the ctdb daemon is to kill
3367 */
3368 struct ctdb_kill_tcp {
3369         struct ctdb_vnn *vnn;
3370         struct ctdb_context *ctdb;
3371         int capture_fd;
3372         struct tevent_fd *fde;
3373         trbt_tree_t *connections;
3374         void *private_data;
3375 };
3376
3377 /*
3378   a tcp connection that is to be killed
3379  */
3380 struct ctdb_killtcp_con {
3381         ctdb_sock_addr src_addr;
3382         ctdb_sock_addr dst_addr;
3383         int count;
3384         struct ctdb_kill_tcp *killtcp;
3385 };
3386
3387 /* this function is used to create a key to represent this socketpair
3388    in the killtcp tree.
3389    this key is used to insert and lookup matching socketpairs that are
3390    to be tickled and RST
3391 */
3392 #define KILLTCP_KEYLEN  10
3393 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3394 {
3395         static uint32_t key[KILLTCP_KEYLEN];
3396
3397         bzero(key, sizeof(key));
3398
3399         if (src->sa.sa_family != dst->sa.sa_family) {
3400                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3401                 return key;
3402         }
3403         
3404         switch (src->sa.sa_family) {
3405         case AF_INET:
3406                 key[0]  = dst->ip.sin_addr.s_addr;
3407                 key[1]  = src->ip.sin_addr.s_addr;
3408                 key[2]  = dst->ip.sin_port;
3409                 key[3]  = src->ip.sin_port;
3410                 break;
3411         case AF_INET6: {
3412                 uint32_t *dst6_addr32 =
3413                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3414                 uint32_t *src6_addr32 =
3415                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3416                 key[0]  = dst6_addr32[3];
3417                 key[1]  = src6_addr32[3];
3418                 key[2]  = dst6_addr32[2];
3419                 key[3]  = src6_addr32[2];
3420                 key[4]  = dst6_addr32[1];
3421                 key[5]  = src6_addr32[1];
3422                 key[6]  = dst6_addr32[0];
3423                 key[7]  = src6_addr32[0];
3424                 key[8]  = dst->ip6.sin6_port;
3425                 key[9]  = src->ip6.sin6_port;
3426                 break;
3427         }
3428         default:
3429                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3430                 return key;
3431         }
3432
3433         return key;
3434 }
3435
3436 /*
3437   called when we get a read event on the raw socket
3438  */
3439 static void capture_tcp_handler(struct tevent_context *ev,
3440                                 struct tevent_fd *fde,
3441                                 uint16_t flags, void *private_data)
3442 {
3443         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3444         struct ctdb_killtcp_con *con;
3445         ctdb_sock_addr src, dst;
3446         uint32_t ack_seq, seq;
3447
3448         if (!(flags & TEVENT_FD_READ)) {
3449                 return;
3450         }
3451
3452         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3453                                 killtcp->private_data,
3454                                 &src, &dst,
3455                                 &ack_seq, &seq) != 0) {
3456                 /* probably a non-tcp ACK packet */
3457                 return;
3458         }
3459
3460         /* check if we have this guy in our list of connections
3461            to kill
3462         */
3463         con = trbt_lookuparray32(killtcp->connections, 
3464                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3465         if (con == NULL) {
3466                 /* no this was some other packet we can just ignore */
3467                 return;
3468         }
3469
3470         /* This one has been tickled !
3471            now reset him and remove him from the list.
3472          */
3473         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3474