ctdb-daemon: Stop using tevent compatibility definitions
[vlendec/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29 #include "common/reqid.h"
30 #include "common/system.h"
31
32
/*
 * Timeout value built from the takeover_timeout tunable via
 * timeval_current_ofs().  Expands to an expression that refers to a
 * variable named "ctdb", which must be in scope at the point of use.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* First argument to timeval_current_ofs() when the ARP timer is re-armed. */
#define CTDB_ARP_INTERVAL 1
/* Number of announcement rounds after which the ARP timer stops. */
#define CTDB_ARP_REPEAT   3
37
38 /* Flags used in IP allocation algorithms. */
39 struct ctdb_ipflags {
40         bool noiptakeover;
41         bool noiphost;
42         enum ctdb_runstate runstate;
43 };
44
/*
 * One entry in the list of local network interfaces (ctdb->ifaces).
 */
struct ctdb_iface {
        /* List linkage, maintained through DLIST_ADD/DLIST_REMOVE. */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* Interface name; duplicated with this structure as talloc parent. */
        const char *name;
        /* Link state; initialised to true when the entry is created. */
        bool link_up;
        /* Number of public addresses currently assigned to this interface. */
        uint32_t references;
};
51
52 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
53 {
54         if (vnn->iface) {
55                 return vnn->iface->name;
56         }
57
58         return "__none__";
59 }
60
61 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
62 {
63         struct ctdb_iface *i;
64
65         /* Verify that we dont have an entry for this ip yet */
66         for (i=ctdb->ifaces;i;i=i->next) {
67                 if (strcmp(i->name, iface) == 0) {
68                         return 0;
69                 }
70         }
71
72         /* create a new structure for this interface */
73         i = talloc_zero(ctdb, struct ctdb_iface);
74         CTDB_NO_MEMORY_FATAL(ctdb, i);
75         i->name = talloc_strdup(i, iface);
76         CTDB_NO_MEMORY(ctdb, i->name);
77
78         i->link_up = true;
79
80         DLIST_ADD(ctdb->ifaces, i);
81
82         return 0;
83 }
84
85 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
86                                         const char *name)
87 {
88         int n;
89
90         for (n = 0; vnn->ifaces[n] != NULL; n++) {
91                 if (strcmp(name, vnn->ifaces[n]) == 0) {
92                         return true;
93                 }
94         }
95
96         return false;
97 }
98
99 /* If any interfaces now have no possible IPs then delete them.  This
100  * implementation is naive (i.e. simple) rather than clever
101  * (i.e. complex).  Given that this is run on delip and that operation
102  * is rare, this doesn't need to be efficient - it needs to be
103  * foolproof.  One alternative is reference counting, where the logic
104  * is distributed and can, therefore, be broken in multiple places.
105  * Another alternative is to build a red-black tree of interfaces that
106  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
107  * once) and then walking ctdb->ifaces once and deleting those not in
108  * the tree.  Let's go to one of those if the naive implementation
109  * causes problems...  :-)
110  */
111 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
112                                         struct ctdb_vnn *vnn)
113 {
114         struct ctdb_iface *i, *next;
115
116         /* For each interface, check if there's an IP using it. */
117         for (i = ctdb->ifaces; i != NULL; i = next) {
118                 struct ctdb_vnn *tv;
119                 bool found;
120                 next = i->next;
121
122                 /* Only consider interfaces named in the given VNN. */
123                 if (!vnn_has_interface_with_name(vnn, i->name)) {
124                         continue;
125                 }
126
127                 /* Is the "single IP" on this interface? */
128                 if ((ctdb->single_ip_vnn != NULL) &&
129                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
130                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
131                         /* Found, next interface please... */
132                         continue;
133                 }
134                 /* Search for a vnn with this interface. */
135                 found = false;
136                 for (tv=ctdb->vnn; tv; tv=tv->next) {
137                         if (vnn_has_interface_with_name(tv, i->name)) {
138                                 found = true;
139                                 break;
140                         }
141                 }
142
143                 if (!found) {
144                         /* None of the VNNs are using this interface. */
145                         DLIST_REMOVE(ctdb->ifaces, i);
146                         talloc_free(i);
147                 }
148         }
149 }
150
151
152 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
153                                           const char *iface)
154 {
155         struct ctdb_iface *i;
156
157         for (i=ctdb->ifaces;i;i=i->next) {
158                 if (strcmp(i->name, iface) == 0) {
159                         return i;
160                 }
161         }
162
163         return NULL;
164 }
165
166 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
167                                               struct ctdb_vnn *vnn)
168 {
169         int i;
170         struct ctdb_iface *cur = NULL;
171         struct ctdb_iface *best = NULL;
172
173         for (i=0; vnn->ifaces[i]; i++) {
174
175                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
176                 if (cur == NULL) {
177                         continue;
178                 }
179
180                 if (!cur->link_up) {
181                         continue;
182                 }
183
184                 if (best == NULL) {
185                         best = cur;
186                         continue;
187                 }
188
189                 if (cur->references < best->references) {
190                         best = cur;
191                         continue;
192                 }
193         }
194
195         return best;
196 }
197
198 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
199                                      struct ctdb_vnn *vnn)
200 {
201         struct ctdb_iface *best = NULL;
202
203         if (vnn->iface) {
204                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
205                                    "still assigned to iface '%s'\n",
206                                    ctdb_addr_to_str(&vnn->public_address),
207                                    ctdb_vnn_iface_string(vnn)));
208                 return 0;
209         }
210
211         best = ctdb_vnn_best_iface(ctdb, vnn);
212         if (best == NULL) {
213                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
214                                   "cannot assign to iface any iface\n",
215                                   ctdb_addr_to_str(&vnn->public_address)));
216                 return -1;
217         }
218
219         vnn->iface = best;
220         best->references++;
221         vnn->pnn = ctdb->pnn;
222
223         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
224                            "now assigned to iface '%s' refs[%d]\n",
225                            ctdb_addr_to_str(&vnn->public_address),
226                            ctdb_vnn_iface_string(vnn),
227                            best->references));
228         return 0;
229 }
230
231 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
232                                     struct ctdb_vnn *vnn)
233 {
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now unassigned (old iface '%s' refs[%d])\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            vnn->iface?vnn->iface->references:0));
239         if (vnn->iface) {
240                 vnn->iface->references--;
241         }
242         vnn->iface = NULL;
243         if (vnn->pnn == ctdb->pnn) {
244                 vnn->pnn = -1;
245         }
246 }
247
248 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
249                                struct ctdb_vnn *vnn)
250 {
251         int i;
252
253         if (vnn->delete_pending) {
254                 return false;
255         }
256
257         if (vnn->iface && vnn->iface->link_up) {
258                 return true;
259         }
260
261         for (i=0; vnn->ifaces[i]; i++) {
262                 struct ctdb_iface *cur;
263
264                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
265                 if (cur == NULL) {
266                         continue;
267                 }
268
269                 if (cur->link_up) {
270                         return true;
271                 }
272         }
273
274         return false;
275 }
276
277 struct ctdb_takeover_arp {
278         struct ctdb_context *ctdb;
279         uint32_t count;
280         ctdb_sock_addr addr;
281         struct ctdb_tcp_array *tcparray;
282         struct ctdb_vnn *vnn;
283 };
284
285
286 /*
287   lists of tcp endpoints
288  */
289 struct ctdb_tcp_list {
290         struct ctdb_tcp_list *prev, *next;
291         struct ctdb_tcp_connection connection;
292 };
293
294 /*
295   list of clients to kill on IP release
296  */
297 struct ctdb_client_ip {
298         struct ctdb_client_ip *prev, *next;
299         struct ctdb_context *ctdb;
300         ctdb_sock_addr addr;
301         uint32_t client_id;
302 };
303
304
305 /*
306   send a gratuitous arp
307  */
308 static void ctdb_control_send_arp(struct tevent_context *ev,
309                                   struct tevent_timer *te,
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
353                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
354                          ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
390                          timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/*
 * State handed to ctdb_do_takeip_callback() while a takeip event runs.
 */
struct ctdb_do_takeip_state {
        struct ctdb_req_control *c;     /* control to reply to on completion */
        struct ctdb_vnn *vnn;           /* public address being taken over */
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          CTDB_EVENT_TAKE_IP,
509                                          "%s %s %u",
510                                          ctdb_vnn_iface_string(vnn),
511                                          ctdb_addr_to_str(&vnn->public_address),
512                                          vnn->public_netmask_bits);
513
514         if (ret != 0) {
515                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
516                         ctdb_addr_to_str(&vnn->public_address),
517                         ctdb_vnn_iface_string(vnn)));
518                 talloc_free(state);
519                 return -1;
520         }
521
522         return 0;
523 }
524
/*
 * State handed to ctdb_do_updateip_callback() while an updateip event
 * runs.
 */
struct ctdb_do_updateip_state {
        struct ctdb_req_control *c;     /* control to reply to on completion */
        struct ctdb_iface *old;         /* interface the address is moving from */
        struct ctdb_vnn *vnn;           /* public address being moved */
};
530
531 /*
532   called when updateip event finishes
533  */
534 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
535                                       void *private_data)
536 {
537         struct ctdb_do_updateip_state *state =
538                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
539         int32_t ret;
540
541         if (status != 0) {
542                 if (status == -ETIME) {
543                         ctdb_ban_self(ctdb);
544                 }
545                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
546                         ctdb_addr_to_str(&state->vnn->public_address),
547                         state->old->name,
548                         ctdb_vnn_iface_string(state->vnn)));
549
550                 /*
551                  * All we can do is reset the old interface
552                  * and let the next run fix it
553                  */
554                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
555                 state->vnn->iface = state->old;
556                 state->vnn->iface->references++;
557
558                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
559                 talloc_free(state);
560                 return;
561         }
562
563         if (ctdb->do_checkpublicip) {
564
565         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
566         if (ret != 0) {
567                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
568                 talloc_free(state);
569                 return;
570         }
571
572         }
573
574         /* the control succeeded */
575         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
576         talloc_free(state);
577         return;
578 }
579
580 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
581 {
582         state->vnn->update_in_flight = false;
583         return 0;
584 }
585
586 /*
587   update (move) an ip address
588  */
589 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
590                                 struct ctdb_req_control *c,
591                                 struct ctdb_vnn *vnn)
592 {
593         int ret;
594         struct ctdb_do_updateip_state *state;
595         struct ctdb_iface *old = vnn->iface;
596         const char *new_name;
597
598         if (vnn->update_in_flight) {
599                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
600                                     "update for this IP already in flight\n",
601                                     ctdb_addr_to_str(&vnn->public_address),
602                                     vnn->public_netmask_bits));
603                 return -1;
604         }
605
606         ctdb_vnn_unassign_iface(ctdb, vnn);
607         ret = ctdb_vnn_assign_iface(ctdb, vnn);
608         if (ret != 0) {
609                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
610                                  "assin a usable interface (old iface '%s')\n",
611                                  ctdb_addr_to_str(&vnn->public_address),
612                                  vnn->public_netmask_bits,
613                                  old->name));
614                 return -1;
615         }
616
617         new_name = ctdb_vnn_iface_string(vnn);
618         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
619                 /* A benign update from one interface onto itself.
620                  * no need to run the eventscripts in this case, just return
621                  * success.
622                  */
623                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
624                 return 0;
625         }
626
627         state = talloc(vnn, struct ctdb_do_updateip_state);
628         CTDB_NO_MEMORY(ctdb, state);
629
630         state->c = talloc_steal(ctdb, c);
631         state->old = old;
632         state->vnn = vnn;
633
634         vnn->update_in_flight = true;
635         talloc_set_destructor(state, ctdb_updateip_destructor);
636
637         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
638                             "interface %s to %s\n",
639                             ctdb_addr_to_str(&vnn->public_address),
640                             vnn->public_netmask_bits,
641                             old->name,
642                             new_name));
643
644         ret = ctdb_event_script_callback(ctdb,
645                                          state,
646                                          ctdb_do_updateip_callback,
647                                          state,
648                                          CTDB_EVENT_UPDATE_IP,
649                                          "%s %s %s %u",
650                                          state->old->name,
651                                          new_name,
652                                          ctdb_addr_to_str(&vnn->public_address),
653                                          vnn->public_netmask_bits);
654         if (ret != 0) {
655                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
656                                  ctdb_addr_to_str(&vnn->public_address),
657                                  old->name, new_name));
658                 talloc_free(state);
659                 return -1;
660         }
661
662         return 0;
663 }
664
665 /*
666   Find the vnn of the node that has a public ip address
667   returns -1 if the address is not known as a public address
668  */
669 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
670 {
671         struct ctdb_vnn *vnn;
672
673         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
674                 if (ctdb_same_ip(&vnn->public_address, addr)) {
675                         return vnn;
676                 }
677         }
678
679         return NULL;
680 }
681
682 /*
683   take over an ip address
684  */
685 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
686                                  struct ctdb_req_control *c,
687                                  TDB_DATA indata,
688                                  bool *async_reply)
689 {
690         int ret;
691         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
692         struct ctdb_vnn *vnn;
693         bool have_ip = false;
694         bool do_updateip = false;
695         bool do_takeip = false;
696         struct ctdb_iface *best_iface = NULL;
697
698         if (pip->pnn != ctdb->pnn) {
699                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
700                                  "with pnn %d, but we're node %d\n",
701                                  ctdb_addr_to_str(&pip->addr),
702                                  pip->pnn, ctdb->pnn));
703                 return -1;
704         }
705
706         /* update out vnn list */
707         vnn = find_public_ip_vnn(ctdb, &pip->addr);
708         if (vnn == NULL) {
709                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
710                         ctdb_addr_to_str(&pip->addr)));
711                 return 0;
712         }
713
714         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
715                 have_ip = ctdb_sys_have_ip(&pip->addr);
716         }
717         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
718         if (best_iface == NULL) {
719                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
720                                  "a usable interface (old %s, have_ip %d)\n",
721                                  ctdb_addr_to_str(&vnn->public_address),
722                                  vnn->public_netmask_bits,
723                                  ctdb_vnn_iface_string(vnn),
724                                  have_ip));
725                 return -1;
726         }
727
728         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
729                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
730                 have_ip = false;
731         }
732
733
734         if (vnn->iface == NULL && have_ip) {
735                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
736                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
737                                  ctdb_addr_to_str(&vnn->public_address)));
738                 return 0;
739         }
740
741         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "and we have it on iface[%s], but it was assigned to node %d"
744                                   "and we are node %d, banning ourself\n",
745                                  ctdb_addr_to_str(&vnn->public_address),
746                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
747                 ctdb_ban_self(ctdb);
748                 return -1;
749         }
750
751         if (vnn->pnn == -1 && have_ip) {
752                 vnn->pnn = ctdb->pnn;
753                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
754                                   "and we already have it on iface[%s], update local daemon\n",
755                                  ctdb_addr_to_str(&vnn->public_address),
756                                   ctdb_vnn_iface_string(vnn)));
757                 return 0;
758         }
759
760         if (vnn->iface) {
761                 if (vnn->iface != best_iface) {
762                         if (!vnn->iface->link_up) {
763                                 do_updateip = true;
764                         } else if (vnn->iface->references > (best_iface->references + 1)) {
765                                 /* only move when the rebalance gains something */
766                                         do_updateip = true;
767                         }
768                 }
769         }
770
771         if (!have_ip) {
772                 if (do_updateip) {
773                         ctdb_vnn_unassign_iface(ctdb, vnn);
774                         do_updateip = false;
775                 }
776                 do_takeip = true;
777         }
778
779         if (do_takeip) {
780                 ret = ctdb_do_takeip(ctdb, c, vnn);
781                 if (ret != 0) {
782                         return -1;
783                 }
784         } else if (do_updateip) {
785                 ret = ctdb_do_updateip(ctdb, c, vnn);
786                 if (ret != 0) {
787                         return -1;
788                 }
789         } else {
790                 /*
791                  * The interface is up and the kernel known the ip
792                  * => do nothing
793                  */
794                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
795                         ctdb_addr_to_str(&pip->addr),
796                         vnn->public_netmask_bits,
797                         ctdb_vnn_iface_string(vnn)));
798                 return 0;
799         }
800
801         /* tell ctdb_control.c that we will be replying asynchronously */
802         *async_reply = true;
803
804         return 0;
805 }
806
807 /*
808   kill any clients that are registered with a IP that is being released
809  */
810 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
811 {
812         struct ctdb_client_ip *ip;
813
814         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
815                 ctdb_addr_to_str(addr)));
816
817         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
818                 ctdb_sock_addr tmp_addr;
819
820                 tmp_addr = ip->addr;
821                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
822                         ip->client_id,
823                         ctdb_addr_to_str(&ip->addr)));
824
825                 if (ctdb_same_ip(&tmp_addr, addr)) {
826                         struct ctdb_client *client = reqid_find(ctdb->idr,
827                                                                 ip->client_id,
828                                                                 struct ctdb_client);
829                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
830                                 ip->client_id,
831                                 ctdb_addr_to_str(&ip->addr),
832                                 client->pid));
833
834                         if (client->pid != 0) {
835                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
836                                         (unsigned)client->pid,
837                                         ctdb_addr_to_str(addr),
838                                         ip->client_id));
839                                 kill(client->pid, SIGKILL);
840                         }
841                 }
842         }
843 }
844
845 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
846 {
847         DLIST_REMOVE(ctdb->vnn, vnn);
848         ctdb_vnn_unassign_iface(ctdb, vnn);
849         ctdb_remove_orphaned_ifaces(ctdb, vnn);
850         talloc_free(vnn);
851 }
852
853 /*
854   called when releaseip event finishes
855  */
856 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
857                                 void *private_data)
858 {
859         struct takeover_callback_state *state = 
860                 talloc_get_type(private_data, struct takeover_callback_state);
861         TDB_DATA data;
862
863         if (status == -ETIME) {
864                 ctdb_ban_self(ctdb);
865         }
866
867         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
868                 if  (ctdb_sys_have_ip(state->addr)) {
869                         DEBUG(DEBUG_ERR,
870                               ("IP %s still hosted during release IP callback, failing\n",
871                                ctdb_addr_to_str(state->addr)));
872                         ctdb_request_control_reply(ctdb, state->c,
873                                                    NULL, -1, NULL);
874                         talloc_free(state);
875                         return;
876                 }
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* Process the IP if it has been marked for deletion */
896         if (state->vnn->delete_pending) {
897                 do_delete_ip(ctdb, state->vnn);
898                 state->vnn = NULL;
899         }
900
901         /* the control succeeded */
902         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
903         talloc_free(state);
904 }
905
906 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
907 {
908         if (state->vnn != NULL) {
909                 state->vnn->update_in_flight = false;
910         }
911         return 0;
912 }
913
914 /*
915   release an ip address
916  */
917 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
918                                 struct ctdb_req_control *c,
919                                 TDB_DATA indata, 
920                                 bool *async_reply)
921 {
922         int ret;
923         struct takeover_callback_state *state;
924         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
925         struct ctdb_vnn *vnn;
926         char *iface;
927
928         /* update our vnn list */
929         vnn = find_public_ip_vnn(ctdb, &pip->addr);
930         if (vnn == NULL) {
931                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
932                         ctdb_addr_to_str(&pip->addr)));
933                 return 0;
934         }
935         vnn->pnn = pip->pnn;
936
937         /* stop any previous arps */
938         talloc_free(vnn->takeover_ctx);
939         vnn->takeover_ctx = NULL;
940
941         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
942          * lazy multicast to drop an IP from any node that isn't the
943          * intended new node.  The following causes makes ctdbd ignore
944          * a release for any address it doesn't host.
945          */
946         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
947                 if (!ctdb_sys_have_ip(&pip->addr)) {
948                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
949                                 ctdb_addr_to_str(&pip->addr),
950                                 vnn->public_netmask_bits,
951                                 ctdb_vnn_iface_string(vnn)));
952                         ctdb_vnn_unassign_iface(ctdb, vnn);
953                         return 0;
954                 }
955         } else {
956                 if (vnn->iface == NULL) {
957                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
958                                            ctdb_addr_to_str(&pip->addr),
959                                            vnn->public_netmask_bits));
960                         return 0;
961                 }
962         }
963
964         /* There is a potential race between take_ip and us because we
965          * update the VNN via a callback that run when the
966          * eventscripts have been run.  Avoid the race by allowing one
967          * update to be in flight at a time.
968          */
969         if (vnn->update_in_flight) {
970                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
971                                     "update for this IP already in flight\n",
972                                     ctdb_addr_to_str(&vnn->public_address),
973                                     vnn->public_netmask_bits));
974                 return -1;
975         }
976
977         iface = strdup(ctdb_vnn_iface_string(vnn));
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         if (state == NULL) {
987                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
988                                __FILE__, __LINE__);
989                 free(iface);
990                 return -1;
991         }
992
993         state->c = talloc_steal(state, c);
994         state->addr = talloc(state, ctdb_sock_addr);       
995         if (state->addr == NULL) {
996                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
997                                __FILE__, __LINE__);
998                 free(iface);
999                 talloc_free(state);
1000                 return -1;
1001         }
1002         *state->addr = pip->addr;
1003         state->vnn   = vnn;
1004
1005         vnn->update_in_flight = true;
1006         talloc_set_destructor(state, ctdb_releaseip_destructor);
1007
1008         ret = ctdb_event_script_callback(ctdb, 
1009                                          state, release_ip_callback, state,
1010                                          CTDB_EVENT_RELEASE_IP,
1011                                          "%s %s %u",
1012                                          iface,
1013                                          ctdb_addr_to_str(&pip->addr),
1014                                          vnn->public_netmask_bits);
1015         free(iface);
1016         if (ret != 0) {
1017                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1018                         ctdb_addr_to_str(&pip->addr),
1019                         ctdb_vnn_iface_string(vnn)));
1020                 talloc_free(state);
1021                 return -1;
1022         }
1023
1024         /* tell the control that we will be reply asynchronously */
1025         *async_reply = true;
1026         return 0;
1027 }
1028
1029 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1030                                    ctdb_sock_addr *addr,
1031                                    unsigned mask, const char *ifaces,
1032                                    bool check_address)
1033 {
1034         struct ctdb_vnn      *vnn;
1035         uint32_t num = 0;
1036         char *tmp;
1037         const char *iface;
1038         int i;
1039         int ret;
1040
1041         tmp = strdup(ifaces);
1042         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1043                 if (!ctdb_sys_check_iface_exists(iface)) {
1044                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1045                         free(tmp);
1046                         return -1;
1047                 }
1048         }
1049         free(tmp);
1050
1051         /* Verify that we dont have an entry for this ip yet */
1052         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1053                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1054                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1055                                 ctdb_addr_to_str(addr)));
1056                         return -1;
1057                 }               
1058         }
1059
1060         /* create a new vnn structure for this ip address */
1061         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1062         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1063         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1064         tmp = talloc_strdup(vnn, ifaces);
1065         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1066         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1067                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1068                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1069                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1070                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1071                 num++;
1072         }
1073         talloc_free(tmp);
1074         vnn->ifaces[num] = NULL;
1075         vnn->public_address      = *addr;
1076         vnn->public_netmask_bits = mask;
1077         vnn->pnn                 = -1;
1078         if (check_address) {
1079                 if (ctdb_sys_have_ip(addr)) {
1080                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1081                         vnn->pnn = ctdb->pnn;
1082                 }
1083         }
1084
1085         for (i=0; vnn->ifaces[i]; i++) {
1086                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1087                 if (ret != 0) {
1088                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1089                                            "for public_address[%s]\n",
1090                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1091                         talloc_free(vnn);
1092                         return -1;
1093                 }
1094         }
1095
1096         DLIST_ADD(ctdb->vnn, vnn);
1097
1098         return 0;
1099 }
1100
1101 /*
1102   setup the public address lists from a file
1103 */
1104 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1105 {
1106         char **lines;
1107         int nlines;
1108         int i;
1109
1110         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1111         if (lines == NULL) {
1112                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1113                 return -1;
1114         }
1115         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1116                 nlines--;
1117         }
1118
1119         for (i=0;i<nlines;i++) {
1120                 unsigned mask;
1121                 ctdb_sock_addr addr;
1122                 const char *addrstr;
1123                 const char *ifaces;
1124                 char *tok, *line;
1125
1126                 line = lines[i];
1127                 while ((*line == ' ') || (*line == '\t')) {
1128                         line++;
1129                 }
1130                 if (*line == '#') {
1131                         continue;
1132                 }
1133                 if (strcmp(line, "") == 0) {
1134                         continue;
1135                 }
1136                 tok = strtok(line, " \t");
1137                 addrstr = tok;
1138                 tok = strtok(NULL, " \t");
1139                 if (tok == NULL) {
1140                         if (NULL == ctdb->default_public_interface) {
1141                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1142                                          i+1));
1143                                 talloc_free(lines);
1144                                 return -1;
1145                         }
1146                         ifaces = ctdb->default_public_interface;
1147                 } else {
1148                         ifaces = tok;
1149                 }
1150
1151                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1152                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1153                         talloc_free(lines);
1154                         return -1;
1155                 }
1156                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1157                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1158                         talloc_free(lines);
1159                         return -1;
1160                 }
1161         }
1162
1163
1164         talloc_free(lines);
1165         return 0;
1166 }
1167
1168 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1169                               const char *iface,
1170                               const char *ip)
1171 {
1172         struct ctdb_vnn *svnn;
1173         struct ctdb_iface *cur = NULL;
1174         bool ok;
1175         int ret;
1176
1177         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1178         CTDB_NO_MEMORY(ctdb, svnn);
1179
1180         svnn->ifaces = talloc_array(svnn, const char *, 2);
1181         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1182         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1183         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1184         svnn->ifaces[1] = NULL;
1185
1186         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1187         if (!ok) {
1188                 talloc_free(svnn);
1189                 return -1;
1190         }
1191
1192         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1193         if (ret != 0) {
1194                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1195                                    "for single_ip[%s]\n",
1196                                    svnn->ifaces[0],
1197                                    ctdb_addr_to_str(&svnn->public_address)));
1198                 talloc_free(svnn);
1199                 return -1;
1200         }
1201
1202         /* assume the single public ip interface is initially "good" */
1203         cur = ctdb_find_iface(ctdb, iface);
1204         if (cur == NULL) {
1205                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1206                 return -1;
1207         }
1208         cur->link_up = true;
1209
1210         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1211         if (ret != 0) {
1212                 talloc_free(svnn);
1213                 return -1;
1214         }
1215
1216         ctdb->single_ip_vnn = svnn;
1217         return 0;
1218 }
1219
1220 struct ctdb_public_ip_list {
1221         struct ctdb_public_ip_list *next;
1222         uint32_t pnn;
1223         ctdb_sock_addr addr;
1224 };
1225
1226 /* Given a physical node, return the number of
1227    public addresses that is currently assigned to this node.
1228 */
1229 static int node_ip_coverage(struct ctdb_context *ctdb, 
1230         int32_t pnn,
1231         struct ctdb_public_ip_list *ips)
1232 {
1233         int num=0;
1234
1235         for (;ips;ips=ips->next) {
1236                 if (ips->pnn == pnn) {
1237                         num++;
1238                 }
1239         }
1240         return num;
1241 }
1242
1243
1244 /* Can the given node host the given IP: is the public IP known to the
1245  * node and is NOIPHOST unset?
1246 */
1247 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1248                              struct ctdb_ipflags ipflags,
1249                              struct ctdb_public_ip_list *ip)
1250 {
1251         struct ctdb_all_public_ips *public_ips;
1252         int i;
1253
1254         if (ipflags.noiphost) {
1255                 return false;
1256         }
1257
1258         public_ips = ctdb->nodes[pnn]->available_public_ips;
1259
1260         if (public_ips == NULL) {
1261                 return false;
1262         }
1263
1264         for (i=0; i<public_ips->num; i++) {
1265                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1266                         /* yes, this node can serve this public ip */
1267                         return true;
1268                 }
1269         }
1270
1271         return false;
1272 }
1273
1274 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1275                                  struct ctdb_ipflags ipflags,
1276                                  struct ctdb_public_ip_list *ip)
1277 {
1278         if (ipflags.noiptakeover) {
1279                 return false;
1280         }
1281
1282         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1283 }
1284
1285 /* search the node lists list for a node to takeover this ip.
1286    pick the node that currently are serving the least number of ips
1287    so that the ips get spread out evenly.
1288 */
1289 static int find_takeover_node(struct ctdb_context *ctdb, 
1290                 struct ctdb_ipflags *ipflags,
1291                 struct ctdb_public_ip_list *ip,
1292                 struct ctdb_public_ip_list *all_ips)
1293 {
1294         int pnn, min=0, num;
1295         int i, numnodes;
1296
1297         numnodes = talloc_array_length(ipflags);
1298         pnn    = -1;
1299         for (i=0; i<numnodes; i++) {
1300                 /* verify that this node can serve this ip */
1301                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1302                         /* no it couldnt   so skip to the next node */
1303                         continue;
1304                 }
1305
1306                 num = node_ip_coverage(ctdb, i, all_ips);
1307                 /* was this the first node we checked ? */
1308                 if (pnn == -1) {
1309                         pnn = i;
1310                         min  = num;
1311                 } else {
1312                         if (num < min) {
1313                                 pnn = i;
1314                                 min  = num;
1315                         }
1316                 }
1317         }       
1318         if (pnn == -1) {
1319                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1320                         ctdb_addr_to_str(&ip->addr)));
1321
1322                 return -1;
1323         }
1324
1325         ip->pnn = pnn;
1326         return 0;
1327 }
1328
1329 #define IP_KEYLEN       4
1330 static uint32_t *ip_key(ctdb_sock_addr *ip)
1331 {
1332         static uint32_t key[IP_KEYLEN];
1333
1334         bzero(key, sizeof(key));
1335
1336         switch (ip->sa.sa_family) {
1337         case AF_INET:
1338                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1339                 break;
1340         case AF_INET6: {
1341                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1342                 key[0]  = htonl(s6_a32[0]);
1343                 key[1]  = htonl(s6_a32[1]);
1344                 key[2]  = htonl(s6_a32[2]);
1345                 key[3]  = htonl(s6_a32[3]);
1346                 break;
1347         }
1348         default:
1349                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1350                 return key;
1351         }
1352
1353         return key;
1354 }
1355
1356 static void *add_ip_callback(void *parm, void *data)
1357 {
1358         struct ctdb_public_ip_list *this_ip = parm; 
1359         struct ctdb_public_ip_list *prev_ip = data; 
1360
1361         if (prev_ip == NULL) {
1362                 return parm;
1363         }
1364         if (this_ip->pnn == -1) {
1365                 this_ip->pnn = prev_ip->pnn;
1366         }
1367
1368         return parm;
1369 }
1370
1371 static int getips_count_callback(void *param, void *data)
1372 {
1373         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1374         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1375
1376         new_ip->next = *ip_list;
1377         *ip_list     = new_ip;
1378         return 0;
1379 }
1380
1381 static struct ctdb_public_ip_list *
1382 create_merged_ip_list(struct ctdb_context *ctdb)
1383 {
1384         int i, j;
1385         struct ctdb_public_ip_list *ip_list;
1386         struct ctdb_all_public_ips *public_ips;
1387
1388         if (ctdb->ip_tree != NULL) {
1389                 talloc_free(ctdb->ip_tree);
1390                 ctdb->ip_tree = NULL;
1391         }
1392         ctdb->ip_tree = trbt_create(ctdb, 0);
1393
1394         for (i=0;i<ctdb->num_nodes;i++) {
1395                 public_ips = ctdb->nodes[i]->known_public_ips;
1396
1397                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1398                         continue;
1399                 }
1400
1401                 /* there were no public ips for this node */
1402                 if (public_ips == NULL) {
1403                         continue;
1404                 }               
1405
1406                 for (j=0;j<public_ips->num;j++) {
1407                         struct ctdb_public_ip_list *tmp_ip; 
1408
1409                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1410                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1411                         /* Do not use information about IP addresses hosted
1412                          * on other nodes, it may not be accurate */
1413                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1414                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1415                         } else {
1416                                 tmp_ip->pnn = -1;
1417                         }
1418                         tmp_ip->addr = public_ips->ips[j].addr;
1419                         tmp_ip->next = NULL;
1420
1421                         trbt_insertarray32_callback(ctdb->ip_tree,
1422                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1423                                 add_ip_callback,
1424                                 tmp_ip);
1425                 }
1426         }
1427
1428         ip_list = NULL;
1429         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1430
1431         return ip_list;
1432 }
1433
1434 /* 
1435  * This is the length of the longtest common prefix between the IPs.
1436  * It is calculated by XOR-ing the 2 IPs together and counting the
1437  * number of leading zeroes.  The implementation means that all
1438  * addresses end up being 128 bits long.
1439  *
1440  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1441  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1442  * lots of nodes and IP addresses?
1443  */
1444 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1445 {
1446         uint32_t ip1_k[IP_KEYLEN];
1447         uint32_t *t;
1448         int i;
1449         uint32_t x;
1450
1451         uint32_t distance = 0;
1452
1453         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1454         t = ip_key(ip2);
1455         for (i=0; i<IP_KEYLEN; i++) {
1456                 x = ip1_k[i] ^ t[i];
1457                 if (x == 0) {
1458                         distance += 32;
1459                 } else {
1460                         /* Count number of leading zeroes. 
1461                          * FIXME? This could be optimised...
1462                          */
1463                         while ((x & (1 << 31)) == 0) {
1464                                 x <<= 1;
1465                                 distance += 1;
1466                         }
1467                 }
1468         }
1469
1470         return distance;
1471 }
1472
1473 /* Calculate the IP distance for the given IP relative to IPs on the
1474    given node.  The ips argument is generally the all_ips variable
1475    used in the main part of the algorithm.
1476  */
1477 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1478                                   struct ctdb_public_ip_list *ips,
1479                                   int pnn)
1480 {
1481         struct ctdb_public_ip_list *t;
1482         uint32_t d;
1483
1484         uint32_t sum = 0;
1485
1486         for (t=ips; t != NULL; t=t->next) {
1487                 if (t->pnn != pnn) {
1488                         continue;
1489                 }
1490
1491                 /* Optimisation: We never calculate the distance
1492                  * between an address and itself.  This allows us to
1493                  * calculate the effect of removing an address from a
1494                  * node by simply calculating the distance between
1495                  * that address and all of the exitsing addresses.
1496                  * Moreover, we assume that we're only ever dealing
1497                  * with addresses from all_ips so we can identify an
1498                  * address via a pointer rather than doing a more
1499                  * expensive address comparison. */
1500                 if (&(t->addr) == ip) {
1501                         continue;
1502                 }
1503
1504                 d = ip_distance(ip, &(t->addr));
1505                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1506         }
1507
1508         return sum;
1509 }
1510
1511 /* Return the LCP2 imbalance metric for addresses currently assigned
1512    to the given node.
1513  */
1514 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1515 {
1516         struct ctdb_public_ip_list *t;
1517
1518         uint32_t imbalance = 0;
1519
1520         for (t=all_ips; t!=NULL; t=t->next) {
1521                 if (t->pnn != pnn) {
1522                         continue;
1523                 }
1524                 /* Pass the rest of the IPs rather than the whole
1525                    all_ips input list.
1526                 */
1527                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1528         }
1529
1530         return imbalance;
1531 }
1532
1533 /* Allocate any unassigned IPs just by looping through the IPs and
1534  * finding the best node for each.
1535  */
1536 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1537                                       struct ctdb_ipflags *ipflags,
1538                                       struct ctdb_public_ip_list *all_ips)
1539 {
1540         struct ctdb_public_ip_list *tmp_ip;
1541
1542         /* loop over all ip's and find a physical node to cover for 
1543            each unassigned ip.
1544         */
1545         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1546                 if (tmp_ip->pnn == -1) {
1547                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1548                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1549                                         ctdb_addr_to_str(&tmp_ip->addr)));
1550                         }
1551                 }
1552         }
1553 }
1554
1555 /* Basic non-deterministic rebalancing algorithm.
1556  */
1557 static void basic_failback(struct ctdb_context *ctdb,
1558                            struct ctdb_ipflags *ipflags,
1559                            struct ctdb_public_ip_list *all_ips,
1560                            int num_ips)
1561 {
1562         int i, numnodes;
1563         int maxnode, maxnum, minnode, minnum, num, retries;
1564         struct ctdb_public_ip_list *tmp_ip;
1565
1566         numnodes = talloc_array_length(ipflags);
1567         retries = 0;
1568
1569 try_again:
1570         maxnum=0;
1571         minnum=0;
1572
1573         /* for each ip address, loop over all nodes that can serve
1574            this ip and make sure that the difference between the node
1575            serving the most and the node serving the least ip's are
1576            not greater than 1.
1577         */
1578         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1579                 if (tmp_ip->pnn == -1) {
1580                         continue;
1581                 }
1582
1583                 /* Get the highest and lowest number of ips's served by any 
1584                    valid node which can serve this ip.
1585                 */
1586                 maxnode = -1;
1587                 minnode = -1;
1588                 for (i=0; i<numnodes; i++) {
1589                         /* only check nodes that can actually serve this ip */
1590                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1591                                 /* no it couldnt   so skip to the next node */
1592                                 continue;
1593                         }
1594
1595                         num = node_ip_coverage(ctdb, i, all_ips);
1596                         if (maxnode == -1) {
1597                                 maxnode = i;
1598                                 maxnum  = num;
1599                         } else {
1600                                 if (num > maxnum) {
1601                                         maxnode = i;
1602                                         maxnum  = num;
1603                                 }
1604                         }
1605                         if (minnode == -1) {
1606                                 minnode = i;
1607                                 minnum  = num;
1608                         } else {
1609                                 if (num < minnum) {
1610                                         minnode = i;
1611                                         minnum  = num;
1612                                 }
1613                         }
1614                 }
1615                 if (maxnode == -1) {
1616                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1617                                 ctdb_addr_to_str(&tmp_ip->addr)));
1618
1619                         continue;
1620                 }
1621
1622                 /* if the spread between the smallest and largest coverage by
1623                    a node is >=2 we steal one of the ips from the node with
1624                    most coverage to even things out a bit.
1625                    try to do this a limited number of times since we dont
1626                    want to spend too much time balancing the ip coverage.
1627                 */
1628                 if ( (maxnum > minnum+1)
1629                      && (retries < (num_ips + 5)) ){
1630                         struct ctdb_public_ip_list *tmp;
1631
1632                         /* Reassign one of maxnode's VNNs */
1633                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1634                                 if (tmp->pnn == maxnode) {
1635                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1636                                         retries++;
1637                                         goto try_again;;
1638                                 }
1639                         }
1640                 }
1641         }
1642 }
1643
1644 static void lcp2_init(struct ctdb_context *tmp_ctx,
1645                       struct ctdb_ipflags *ipflags,
1646                       struct ctdb_public_ip_list *all_ips,
1647                       uint32_t *force_rebalance_nodes,
1648                       uint32_t **lcp2_imbalances,
1649                       bool **rebalance_candidates)
1650 {
1651         int i, numnodes;
1652         struct ctdb_public_ip_list *tmp_ip;
1653
1654         numnodes = talloc_array_length(ipflags);
1655
1656         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1657         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1658         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1659         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1660
1661         for (i=0; i<numnodes; i++) {
1662                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1663                 /* First step: assume all nodes are candidates */
1664                 (*rebalance_candidates)[i] = true;
1665         }
1666
1667         /* 2nd step: if a node has IPs assigned then it must have been
1668          * healthy before, so we remove it from consideration.  This
1669          * is overkill but is all we have because we don't maintain
1670          * state between takeover runs.  An alternative would be to
1671          * keep state and invalidate it every time the recovery master
1672          * changes.
1673          */
1674         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1675                 if (tmp_ip->pnn != -1) {
1676                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1677                 }
1678         }
1679
1680         /* 3rd step: if a node is forced to re-balance then
1681            we allow failback onto the node */
1682         if (force_rebalance_nodes == NULL) {
1683                 return;
1684         }
1685         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1686                 uint32_t pnn = force_rebalance_nodes[i];
1687                 if (pnn >= numnodes) {
1688                         DEBUG(DEBUG_ERR,
1689                               (__location__ "unknown node %u\n", pnn));
1690                         continue;
1691                 }
1692
1693                 DEBUG(DEBUG_NOTICE,
1694                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1695                 (*rebalance_candidates)[pnn] = true;
1696         }
1697 }
1698
1699 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1700  * the IP/node combination that will cost the least.
1701  */
1702 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1703                                      struct ctdb_ipflags *ipflags,
1704                                      struct ctdb_public_ip_list *all_ips,
1705                                      uint32_t *lcp2_imbalances)
1706 {
1707         struct ctdb_public_ip_list *tmp_ip;
1708         int dstnode, numnodes;
1709
1710         int minnode;
1711         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1712         struct ctdb_public_ip_list *minip;
1713
1714         bool should_loop = true;
1715         bool have_unassigned = true;
1716
1717         numnodes = talloc_array_length(ipflags);
1718
1719         while (have_unassigned && should_loop) {
1720                 should_loop = false;
1721
1722                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1723                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1724
1725                 minnode = -1;
1726                 mindsum = 0;
1727                 minip = NULL;
1728
1729                 /* loop over each unassigned ip. */
1730                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1731                         if (tmp_ip->pnn != -1) {
1732                                 continue;
1733                         }
1734
1735                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1736                                 /* only check nodes that can actually takeover this ip */
1737                                 if (!can_node_takeover_ip(ctdb, dstnode,
1738                                                           ipflags[dstnode],
1739                                                           tmp_ip)) {
1740                                         /* no it couldnt   so skip to the next node */
1741                                         continue;
1742                                 }
1743
1744                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1745                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1746                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1747                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1748                                                    dstnode,
1749                                                    dstimbl - lcp2_imbalances[dstnode]));
1750
1751
1752                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1753                                         minnode = dstnode;
1754                                         minimbl = dstimbl;
1755                                         mindsum = dstdsum;
1756                                         minip = tmp_ip;
1757                                         should_loop = true;
1758                                 }
1759                         }
1760                 }
1761
1762                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1763
1764                 /* If we found one then assign it to the given node. */
1765                 if (minnode != -1) {
1766                         minip->pnn = minnode;
1767                         lcp2_imbalances[minnode] = minimbl;
1768                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1769                                           ctdb_addr_to_str(&(minip->addr)),
1770                                           minnode,
1771                                           mindsum));
1772                 }
1773
1774                 /* There might be a better way but at least this is clear. */
1775                 have_unassigned = false;
1776                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1777                         if (tmp_ip->pnn == -1) {
1778                                 have_unassigned = true;
1779                         }
1780                 }
1781         }
1782
1783         /* We know if we have an unassigned addresses so we might as
1784          * well optimise.
1785          */
1786         if (have_unassigned) {
1787                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1788                         if (tmp_ip->pnn == -1) {
1789                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1790                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1791                         }
1792                 }
1793         }
1794 }
1795
1796 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1797  * to move IPs from, determines the best IP/destination node
1798  * combination to move from the source node.
1799  */
1800 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1801                                     struct ctdb_ipflags *ipflags,
1802                                     struct ctdb_public_ip_list *all_ips,
1803                                     int srcnode,
1804                                     uint32_t *lcp2_imbalances,
1805                                     bool *rebalance_candidates)
1806 {
1807         int dstnode, mindstnode, numnodes;
1808         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1809         uint32_t minsrcimbl, mindstimbl;
1810         struct ctdb_public_ip_list *minip;
1811         struct ctdb_public_ip_list *tmp_ip;
1812
1813         /* Find an IP and destination node that best reduces imbalance. */
1814         srcimbl = 0;
1815         minip = NULL;
1816         minsrcimbl = 0;
1817         mindstnode = -1;
1818         mindstimbl = 0;
1819
1820         numnodes = talloc_array_length(ipflags);
1821
1822         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1823         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1824                            srcnode, lcp2_imbalances[srcnode]));
1825
1826         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1827                 /* Only consider addresses on srcnode. */
1828                 if (tmp_ip->pnn != srcnode) {
1829                         continue;
1830                 }
1831
1832                 /* What is this IP address costing the source node? */
1833                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1834                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1835
1836                 /* Consider this IP address would cost each potential
1837                  * destination node.  Destination nodes are limited to
1838                  * those that are newly healthy, since we don't want
1839                  * to do gratuitous failover of IPs just to make minor
1840                  * balance improvements.
1841                  */
1842                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1843                         if (!rebalance_candidates[dstnode]) {
1844                                 continue;
1845                         }
1846
1847                         /* only check nodes that can actually takeover this ip */
1848                         if (!can_node_takeover_ip(ctdb, dstnode,
1849                                                   ipflags[dstnode], tmp_ip)) {
1850                                 /* no it couldnt   so skip to the next node */
1851                                 continue;
1852                         }
1853
1854                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1855                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1856                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1857                                            srcnode, -srcdsum,
1858                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1859                                            dstnode, dstdsum));
1860
1861                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1862                             (dstdsum < srcdsum) &&                      \
1863                             ((mindstnode == -1) ||                              \
1864                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1865
1866                                 minip = tmp_ip;
1867                                 minsrcimbl = srcimbl;
1868                                 mindstnode = dstnode;
1869                                 mindstimbl = dstimbl;
1870                         }
1871                 }
1872         }
1873         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1874
1875         if (mindstnode != -1) {
1876                 /* We found a move that makes things better... */
1877                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1878                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1879                                   ctdb_addr_to_str(&(minip->addr)),
1880                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1881
1882
1883                 lcp2_imbalances[srcnode] = minsrcimbl;
1884                 lcp2_imbalances[mindstnode] = mindstimbl;
1885                 minip->pnn = mindstnode;
1886
1887                 return true;
1888         }
1889
1890         return false;
1891         
1892 }
1893
/* Pairs a node number with its LCP2 imbalance so that an array of
 * these can be sorted by imbalance without losing the node number.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.  Orders by
 * descending imbalance: returns -1 when a has the larger imbalance,
 * 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1912
1913 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1914  * node with the highest LCP2 imbalance, and then determines the best
1915  * IP/destination node combination to move from the source node.
1916  */
1917 static void lcp2_failback(struct ctdb_context *ctdb,
1918                           struct ctdb_ipflags *ipflags,
1919                           struct ctdb_public_ip_list *all_ips,
1920                           uint32_t *lcp2_imbalances,
1921                           bool *rebalance_candidates)
1922 {
1923         int i, numnodes;
1924         struct lcp2_imbalance_pnn * lips;
1925         bool again;
1926
1927         numnodes = talloc_array_length(ipflags);
1928
1929 try_again:
1930         /* Put the imbalances and nodes into an array, sort them and
1931          * iterate through candidates.  Usually the 1st one will be
1932          * used, so this doesn't cost much...
1933          */
1934         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1935         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1936         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1937         for (i=0; i<numnodes; i++) {
1938                 lips[i].imbalance = lcp2_imbalances[i];
1939                 lips[i].pnn = i;
1940                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1941         }
1942         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1943               lcp2_cmp_imbalance_pnn);
1944
1945         again = false;
1946         for (i=0; i<numnodes; i++) {
1947                 /* This means that all nodes had 0 or 1 addresses, so
1948                  * can't be imbalanced.
1949                  */
1950                 if (lips[i].imbalance == 0) {
1951                         break;
1952                 }
1953
1954                 if (lcp2_failback_candidate(ctdb,
1955                                             ipflags,
1956                                             all_ips,
1957                                             lips[i].pnn,
1958                                             lcp2_imbalances,
1959                                             rebalance_candidates)) {
1960                         again = true;
1961                         break;
1962                 }
1963         }
1964
1965         talloc_free(lips);
1966         if (again) {
1967                 goto try_again;
1968         }
1969 }
1970
1971 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1972                                     struct ctdb_ipflags *ipflags,
1973                                     struct ctdb_public_ip_list *all_ips)
1974 {
1975         struct ctdb_public_ip_list *tmp_ip;
1976
1977         /* verify that the assigned nodes can serve that public ip
1978            and set it to -1 if not
1979         */
1980         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1981                 if (tmp_ip->pnn == -1) {
1982                         continue;
1983                 }
1984                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1985                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1986                         /* this node can not serve this ip. */
1987                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1988                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1989                                            tmp_ip->pnn));
1990                         tmp_ip->pnn = -1;
1991                 }
1992         }
1993 }
1994
1995 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
1996                                        struct ctdb_ipflags *ipflags,
1997                                        struct ctdb_public_ip_list *all_ips)
1998 {
1999         struct ctdb_public_ip_list *tmp_ip;
2000         int i, numnodes;
2001
2002         numnodes = talloc_array_length(ipflags);
2003
2004         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2005        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2006         *  always be allocated the same way for a specific set of
2007         *  available/unavailable nodes.
2008         */
2009
2010         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2011                 tmp_ip->pnn = i % numnodes;
2012         }
2013
2014         /* IP failback doesn't make sense with deterministic
2015          * IPs, since the modulo step above implicitly fails
2016          * back IPs to their "home" node.
2017          */
2018         if (1 == ctdb->tunable.no_ip_failback) {
2019                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2020         }
2021
2022         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2023
2024         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2025
2026         /* No failback here! */
2027 }
2028
2029 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2030                                           struct ctdb_ipflags *ipflags,
2031                                           struct ctdb_public_ip_list *all_ips)
2032 {
2033         /* This should be pushed down into basic_failback. */
2034         struct ctdb_public_ip_list *tmp_ip;
2035         int num_ips = 0;
2036         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2037                 num_ips++;
2038         }
2039
2040         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2041
2042         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2043
2044         /* If we don't want IPs to fail back then don't rebalance IPs. */
2045         if (1 == ctdb->tunable.no_ip_failback) {
2046                 return;
2047         }
2048
2049         /* Now, try to make sure the ip adresses are evenly distributed
2050            across the nodes.
2051         */
2052         basic_failback(ctdb, ipflags, all_ips, num_ips);
2053 }
2054
2055 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2056                           struct ctdb_ipflags *ipflags,
2057                           struct ctdb_public_ip_list *all_ips,
2058                           uint32_t *force_rebalance_nodes)
2059 {
2060         uint32_t *lcp2_imbalances;
2061         bool *rebalance_candidates;
2062         int numnodes, num_rebalance_candidates, i;
2063
2064         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2065
2066         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2067
2068         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2069                   &lcp2_imbalances, &rebalance_candidates);
2070
2071         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2072
2073         /* If we don't want IPs to fail back then don't rebalance IPs. */
2074         if (1 == ctdb->tunable.no_ip_failback) {
2075                 goto finished;
2076         }
2077
2078         /* It is only worth continuing if we have suitable target
2079          * nodes to transfer IPs to.  This check is much cheaper than
2080          * continuing on...
2081          */
2082         numnodes = talloc_array_length(ipflags);
2083         num_rebalance_candidates = 0;
2084         for (i=0; i<numnodes; i++) {
2085                 if (rebalance_candidates[i]) {
2086                         num_rebalance_candidates++;
2087                 }
2088         }
2089         if (num_rebalance_candidates == 0) {
2090                 goto finished;
2091         }
2092
2093         /* Now, try to make sure the ip adresses are evenly distributed
2094            across the nodes.
2095         */
2096         lcp2_failback(ctdb, ipflags, all_ips,
2097                       lcp2_imbalances, rebalance_candidates);
2098
2099 finished:
2100         talloc_free(tmp_ctx);
2101 }
2102
2103 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2104 {
2105         int i;
2106
2107         for (i=0;i<nodemap->num;i++) {
2108                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2109                         /* Found one completely healthy node */
2110                         return false;
2111                 }
2112         }
2113
2114         return true;
2115 }
2116
2117 /* The calculation part of the IP allocation algorithm. */
2118 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2119                                    struct ctdb_ipflags *ipflags,
2120                                    struct ctdb_public_ip_list **all_ips_p,
2121                                    uint32_t *force_rebalance_nodes)
2122 {
2123         /* since nodes only know about those public addresses that
2124            can be served by that particular node, no single node has
2125            a full list of all public addresses that exist in the cluster.
2126            Walk over all node structures and create a merged list of
2127            all public addresses that exist in the cluster.
2128
2129            keep the tree of ips around as ctdb->ip_tree
2130         */
2131         *all_ips_p = create_merged_ip_list(ctdb);
2132
2133         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2134                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2135         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2136                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2137         } else {
2138                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2139         }
2140
2141         /* at this point ->pnn is the node which will own each IP
2142            or -1 if there is no node that can cover this ip
2143         */
2144
2145         return;
2146 }
2147
2148 struct get_tunable_callback_data {
2149         const char *tunable;
2150         uint32_t *out;
2151         bool fatal;
2152 };
2153
2154 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2155                                  int32_t res, TDB_DATA outdata,
2156                                  void *callback)
2157 {
2158         struct get_tunable_callback_data *cd =
2159                 (struct get_tunable_callback_data *)callback;
2160         int size;
2161
2162         if (res != 0) {
2163                 /* Already handled in fail callback */
2164                 return;
2165         }
2166
2167         if (outdata.dsize != sizeof(uint32_t)) {
2168                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2169                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2170                                  (int)outdata.dsize));
2171                 cd->fatal = true;
2172                 return;
2173         }
2174
2175         size = talloc_array_length(cd->out);
2176         if (pnn >= size) {
2177                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2178                                  cd->tunable, pnn, size));
2179                 return;
2180         }
2181
2182                 
2183         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2184 }
2185
2186 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2187                                        int32_t res, TDB_DATA outdata,
2188                                        void *callback)
2189 {
2190         struct get_tunable_callback_data *cd =
2191                 (struct get_tunable_callback_data *)callback;
2192
2193         switch (res) {
2194         case -ETIME:
2195                 DEBUG(DEBUG_ERR,
2196                       ("Timed out getting tunable \"%s\" from node %d\n",
2197                        cd->tunable, pnn));
2198                 cd->fatal = true;
2199                 break;
2200         case -EINVAL:
2201         case -1:
2202                 DEBUG(DEBUG_WARNING,
2203                       ("Tunable \"%s\" not implemented on node %d\n",
2204                        cd->tunable, pnn));
2205                 break;
2206         default:
2207                 DEBUG(DEBUG_ERR,
2208                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2209                        cd->tunable, pnn));
2210                 cd->fatal = true;
2211         }
2212 }
2213
2214 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2215                                         TALLOC_CTX *tmp_ctx,
2216                                         struct ctdb_node_map *nodemap,
2217                                         const char *tunable,
2218                                         uint32_t default_value)
2219 {
2220         TDB_DATA data;
2221         struct ctdb_control_get_tunable *t;
2222         uint32_t *nodes;
2223         uint32_t *tvals;
2224         struct get_tunable_callback_data callback_data;
2225         int i;
2226
2227         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2228         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2229         for (i=0; i<nodemap->num; i++) {
2230                 tvals[i] = default_value;
2231         }
2232                 
2233         callback_data.out = tvals;
2234         callback_data.tunable = tunable;
2235         callback_data.fatal = false;
2236
2237         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2238         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2239         t = (struct ctdb_control_get_tunable *)data.dptr;
2240         t->length = strlen(tunable)+1;
2241         memcpy(t->name, tunable, t->length);
2242         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2243         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2244                                       nodes, 0, TAKEOVER_TIMEOUT(),
2245                                       false, data,
2246                                       get_tunable_callback,
2247                                       get_tunable_fail_callback,
2248                                       &callback_data) != 0) {
2249                 if (callback_data.fatal) {
2250                         talloc_free(tvals);
2251                         tvals = NULL;
2252                 }
2253         }
2254         talloc_free(nodes);
2255         talloc_free(data.dptr);
2256
2257         return tvals;
2258 }
2259
2260 struct get_runstate_callback_data {
2261         enum ctdb_runstate *out;
2262         bool fatal;
2263 };
2264
2265 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2266                                   int32_t res, TDB_DATA outdata,
2267                                   void *callback_data)
2268 {
2269         struct get_runstate_callback_data *cd =
2270                 (struct get_runstate_callback_data *)callback_data;
2271         int size;
2272
2273         if (res != 0) {
2274                 /* Already handled in fail callback */
2275                 return;
2276         }
2277
2278         if (outdata.dsize != sizeof(uint32_t)) {
2279                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2280                                  pnn, (int)sizeof(uint32_t),
2281                                  (int)outdata.dsize));
2282                 cd->fatal = true;
2283                 return;
2284         }
2285
2286         size = talloc_array_length(cd->out);
2287         if (pnn >= size) {
2288                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2289                                  pnn, size));
2290                 return;
2291         }
2292
2293         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2294 }
2295
2296 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2297                                        int32_t res, TDB_DATA outdata,
2298                                        void *callback)
2299 {
2300         struct get_runstate_callback_data *cd =
2301                 (struct get_runstate_callback_data *)callback;
2302
2303         switch (res) {
2304         case -ETIME:
2305                 DEBUG(DEBUG_ERR,
2306                       ("Timed out getting runstate from node %d\n", pnn));
2307                 cd->fatal = true;
2308                 break;
2309         default:
2310                 DEBUG(DEBUG_WARNING,
2311                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2312                        pnn));
2313         }
2314 }
2315
2316 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2317                                                     TALLOC_CTX *tmp_ctx,
2318                                                     struct ctdb_node_map *nodemap,
2319                                                     enum ctdb_runstate default_value)
2320 {
2321         uint32_t *nodes;
2322         enum ctdb_runstate *rs;
2323         struct get_runstate_callback_data callback_data;
2324         int i;
2325
2326         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2327         CTDB_NO_MEMORY_NULL(ctdb, rs);
2328         for (i=0; i<nodemap->num; i++) {
2329                 rs[i] = default_value;
2330         }
2331
2332         callback_data.out = rs;
2333         callback_data.fatal = false;
2334
2335         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2336         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2337                                       nodes, 0, TAKEOVER_TIMEOUT(),
2338                                       true, tdb_null,
2339                                       get_runstate_callback,
2340                                       get_runstate_fail_callback,
2341                                       &callback_data) != 0) {
2342                 if (callback_data.fatal) {
2343                         free(rs);
2344                         rs = NULL;
2345                 }
2346         }
2347         talloc_free(nodes);
2348
2349         return rs;
2350 }
2351
2352 /* Set internal flags for IP allocation:
2353  *   Clear ip flags
2354  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2355  *   Set NOIPHOST ip flag for each INACTIVE node
2356  *   if all nodes are disabled:
2357  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2358  *   else
2359  *     Set NOIPHOST ip flags for disabled nodes
2360  */
2361 static struct ctdb_ipflags *
2362 set_ipflags_internal(struct ctdb_context *ctdb,
2363                      TALLOC_CTX *tmp_ctx,
2364                      struct ctdb_node_map *nodemap,
2365                      uint32_t *tval_noiptakeover,
2366                      uint32_t *tval_noiphostonalldisabled,
2367                      enum ctdb_runstate *runstate)
2368 {
2369         int i;
2370         struct ctdb_ipflags *ipflags;
2371
2372         /* Clear IP flags - implicit due to talloc_zero */
2373         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2374         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2375
2376         for (i=0;i<nodemap->num;i++) {
2377                 /* Can not take IPs on node with NoIPTakeover set */
2378                 if (tval_noiptakeover[i] != 0) {
2379                         ipflags[i].noiptakeover = true;
2380                 }
2381
2382                 /* Can not host IPs on node not in RUNNING state */
2383                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2384                         ipflags[i].noiphost = true;
2385                         continue;
2386                 }
2387                 /* Can not host IPs on INACTIVE node */
2388                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2389                         ipflags[i].noiphost = true;
2390                 }
2391                 /* Remember the runstate */
2392                 ipflags[i].runstate = runstate[i];
2393         }
2394
2395         if (all_nodes_are_disabled(nodemap)) {
2396                 /* If all nodes are disabled, can not host IPs on node
2397                  * with NoIPHostOnAllDisabled set
2398                  */
2399                 for (i=0;i<nodemap->num;i++) {
2400                         if (tval_noiphostonalldisabled[i] != 0) {
2401                                 ipflags[i].noiphost = true;
2402                         }
2403                 }
2404         } else {
2405                 /* If some nodes are not disabled, then can not host
2406                  * IPs on DISABLED node
2407                  */
2408                 for (i=0;i<nodemap->num;i++) {
2409                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2410                                 ipflags[i].noiphost = true;
2411                         }
2412                 }
2413         }
2414
2415         return ipflags;
2416 }
2417
2418 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2419                                         TALLOC_CTX *tmp_ctx,
2420                                         struct ctdb_node_map *nodemap)
2421 {
2422         uint32_t *tval_noiptakeover;
2423         uint32_t *tval_noiphostonalldisabled;
2424         struct ctdb_ipflags *ipflags;
2425         enum ctdb_runstate *runstate;
2426
2427
2428         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2429                                                    "NoIPTakeover", 0);
2430         if (tval_noiptakeover == NULL) {
2431                 return NULL;
2432         }
2433
2434         tval_noiphostonalldisabled =
2435                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2436                                        "NoIPHostOnAllDisabled", 0);
2437         if (tval_noiphostonalldisabled == NULL) {
2438                 /* Caller frees tmp_ctx */
2439                 return NULL;
2440         }
2441
2442         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2443          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2444          * reasonable behaviour on a mixed cluster during upgrade.
2445          */
2446         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2447                                            CTDB_RUNSTATE_RUNNING);
2448         if (runstate == NULL) {
2449                 /* Caller frees tmp_ctx */
2450                 return NULL;
2451         }
2452
2453         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2454                                        tval_noiptakeover,
2455                                        tval_noiphostonalldisabled,
2456                                        runstate);
2457
2458         talloc_free(tval_noiptakeover);
2459         talloc_free(tval_noiphostonalldisabled);
2460         talloc_free(runstate);
2461
2462         return ipflags;
2463 }
2464
2465 struct iprealloc_callback_data {
2466         bool *retry_nodes;
2467         int retry_count;
2468         client_async_callback fail_callback;
2469         void *fail_callback_data;
2470         struct ctdb_node_map *nodemap;
2471 };
2472
2473 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2474                                         int32_t res, TDB_DATA outdata,
2475                                         void *callback)
2476 {
2477         int numnodes;
2478         struct iprealloc_callback_data *cd =
2479                 (struct iprealloc_callback_data *)callback;
2480
2481         numnodes = talloc_array_length(cd->retry_nodes);
2482         if (pnn > numnodes) {
2483                 DEBUG(DEBUG_ERR,
2484                       ("ipreallocated failure from node %d, "
2485                        "but only %d nodes in nodemap\n",
2486                        pnn, numnodes));
2487                 return;
2488         }
2489
2490         /* Can't run the "ipreallocated" event on a INACTIVE node */
2491         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2492                 DEBUG(DEBUG_WARNING,
2493                       ("ipreallocated failed on inactive node %d, ignoring\n",
2494                        pnn));
2495                 return;
2496         }
2497
2498         switch (res) {
2499         case -ETIME:
2500                 /* If the control timed out then that's a real error,
2501                  * so call the real fail callback
2502                  */
2503                 if (cd->fail_callback) {
2504                         cd->fail_callback(ctdb, pnn, res, outdata,
2505                                           cd->fail_callback_data);
2506                 } else {
2507                         DEBUG(DEBUG_WARNING,
2508                               ("iprealloc timed out but no callback registered\n"));
2509                 }
2510                 break;
2511         default:
2512                 /* If not a timeout then either the ipreallocated
2513                  * eventscript (or some setup) failed.  This might
2514                  * have failed because the IPREALLOCATED control isn't
2515                  * implemented - right now there is no way of knowing
2516                  * because the error codes are all folded down to -1.
2517                  * Consider retrying using EVENTSCRIPT control...
2518                  */
2519                 DEBUG(DEBUG_WARNING,
2520                       ("ipreallocated failure from node %d, flagging retry\n",
2521                        pnn));
2522                 cd->retry_nodes[pnn] = true;
2523                 cd->retry_count++;
2524         }
2525 }
2526
2527 struct takeover_callback_data {
2528         bool *node_failed;
2529         client_async_callback fail_callback;
2530         void *fail_callback_data;
2531         struct ctdb_node_map *nodemap;
2532 };
2533
2534 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2535                                        uint32_t node_pnn, int32_t res,
2536                                        TDB_DATA outdata, void *callback_data)
2537 {
2538         struct takeover_callback_data *cd =
2539                 talloc_get_type_abort(callback_data,
2540                                       struct takeover_callback_data);
2541         int i;
2542
2543         for (i = 0; i < cd->nodemap->num; i++) {
2544                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2545                         break;
2546                 }
2547         }
2548
2549         if (i == cd->nodemap->num) {
2550                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2551                 return;
2552         }
2553
2554         if (!cd->node_failed[i]) {
2555                 cd->node_failed[i] = true;
2556                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2557                                   cd->fail_callback_data);
2558         }
2559 }
2560
2561 /*
2562   make any IP alias changes for public addresses that are necessary 
2563  */
2564 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2565                       uint32_t *force_rebalance_nodes,
2566                       client_async_callback fail_callback, void *callback_data)
2567 {
2568         int i, j, ret;
2569         struct ctdb_public_ip ip;
2570         uint32_t *nodes;
2571         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2572         TDB_DATA data;
2573         struct timeval timeout;
2574         struct client_async_data *async_data;
2575         struct ctdb_client_control_state *state;
2576         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2577         struct ctdb_ipflags *ipflags;
2578         struct takeover_callback_data *takeover_data;
2579         struct iprealloc_callback_data iprealloc_data;
2580         bool *retry_data;
2581         bool can_host_ips;
2582
2583         /*
2584          * ip failover is completely disabled, just send out the 
2585          * ipreallocated event.
2586          */
2587         if (ctdb->tunable.disable_ip_failover != 0) {
2588                 goto ipreallocated;
2589         }
2590
2591         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2592         if (ipflags == NULL) {
2593                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2594                 talloc_free(tmp_ctx);
2595                 return -1;
2596         }
2597
2598         /* Short-circuit IP allocation if no nodes are in the RUNNING
2599          * runstate yet, since no nodes will be able to host IPs */
2600         can_host_ips = false;
2601         for (i=0; i<nodemap->num; i++) {
2602                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2603                         can_host_ips = true;
2604                 }
2605         }
2606         if (!can_host_ips) {
2607                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2608                 return 0;
2609         }
2610
2611         /* Do the IP reassignment calculations */
2612         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2613
2614         /* Now tell all nodes to release any public IPs should not
2615          * host.  This will be a NOOP on nodes that don't currently
2616          * hold the given IP.
2617          */
2618         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2619         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2620
2621         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2622                                                        bool, nodemap->num);
2623         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2624         takeover_data->fail_callback = fail_callback;
2625         takeover_data->fail_callback_data = callback_data;
2626         takeover_data->nodemap = nodemap;
2627
2628         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2629         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2630
2631         async_data->fail_callback = takeover_run_fail_callback;
2632         async_data->callback_data = takeover_data;
2633
2634         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2635
2636         /* Send a RELEASE_IP to all nodes that should not be hosting
2637          * each IP.  For each IP, all but one of these will be
2638          * redundant.  However, the redundant ones are used to tell
2639          * nodes which node should be hosting the IP so that commands
2640          * like "ctdb ip" can display a particular nodes idea of who
2641          * is hosting what. */
2642         for (i=0;i<nodemap->num;i++) {
2643                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2644                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2645                         continue;
2646                 }
2647
2648                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2649                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2650                                 /* This node should be serving this
2651                                    vnn so dont tell it to release the ip
2652                                 */
2653                                 continue;
2654                         }
2655                         ip.pnn  = tmp_ip->pnn;
2656                         ip.addr = tmp_ip->addr;
2657
2658                         timeout = TAKEOVER_TIMEOUT();
2659                         data.dsize = sizeof(ip);
2660                         data.dptr  = (uint8_t *)&ip;
2661                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2662                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2663                                                   data, async_data,
2664                                                   &timeout, NULL);
2665                         if (state == NULL) {
2666                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2667                                 talloc_free(tmp_ctx);
2668                                 return -1;
2669                         }
2670
2671                         ctdb_client_async_add(async_data, state);
2672                 }
2673         }
2674         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2675                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2676                 talloc_free(tmp_ctx);
2677                 return -1;
2678         }
2679         talloc_free(async_data);
2680
2681
2682         /* For each IP, send a TAKOVER_IP to the node that should be
2683          * hosting it.  Many of these will often be redundant (since
2684          * the allocation won't have changed) but they can be useful
2685          * to recover from inconsistencies. */
2686         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2687         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2688
2689         async_data->fail_callback = fail_callback;
2690         async_data->callback_data = callback_data;
2691
2692         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2693                 if (tmp_ip->pnn == -1) {
2694                         /* this IP won't be taken over */
2695                         continue;
2696                 }
2697
2698                 ip.pnn  = tmp_ip->pnn;
2699                 ip.addr = tmp_ip->addr;
2700
2701                 timeout = TAKEOVER_TIMEOUT();
2702                 data.dsize = sizeof(ip);
2703                 data.dptr  = (uint8_t *)&ip;
2704                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2705                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2706                                           data, async_data, &timeout, NULL);
2707                 if (state == NULL) {
2708                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2709                         talloc_free(tmp_ctx);
2710                         return -1;
2711                 }
2712
2713                 ctdb_client_async_add(async_data, state);
2714         }
2715         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2716                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2717                 talloc_free(tmp_ctx);
2718                 return -1;
2719         }
2720
2721 ipreallocated:
2722         /*
2723          * Tell all nodes to run eventscripts to process the
2724          * "ipreallocated" event.  This can do a lot of things,
2725          * including restarting services to reconfigure them if public
2726          * IPs have moved.  Once upon a time this event only used to
2727          * update natgw.
2728          */
2729         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2730         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2731         iprealloc_data.retry_nodes = retry_data;
2732         iprealloc_data.retry_count = 0;
2733         iprealloc_data.fail_callback = fail_callback;
2734         iprealloc_data.fail_callback_data = callback_data;
2735         iprealloc_data.nodemap = nodemap;
2736
2737         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2738         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2739                                         nodes, 0, TAKEOVER_TIMEOUT(),
2740                                         false, tdb_null,
2741                                         NULL, iprealloc_fail_callback,
2742                                         &iprealloc_data);
2743         if (ret != 0) {
2744                 /* If the control failed then we should retry to any
2745                  * nodes flagged by iprealloc_fail_callback using the
2746                  * EVENTSCRIPT control.  This is a best-effort at
2747                  * backward compatiblity when running a mixed cluster
2748                  * where some nodes have not yet been upgraded to
2749                  * support the IPREALLOCATED control.
2750                  */
2751                 DEBUG(DEBUG_WARNING,
2752                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2753
2754                 nodes = talloc_array(tmp_ctx, uint32_t,
2755                                      iprealloc_data.retry_count);
2756                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2757
2758                 j = 0;
2759                 for (i=0; i<nodemap->num; i++) {
2760                         if (iprealloc_data.retry_nodes[i]) {
2761                                 nodes[j] = i;
2762                                 j++;
2763                         }
2764                 }
2765
2766                 data.dptr  = discard_const("ipreallocated");
2767                 data.dsize = strlen((char *)data.dptr) + 1; 
2768                 ret = ctdb_client_async_control(ctdb,
2769                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2770                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2771                                                 false, data,
2772                                                 NULL, fail_callback,
2773                                                 callback_data);
2774                 if (ret != 0) {
2775                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2776                 }
2777         }
2778
2779         talloc_free(tmp_ctx);
2780         return ret;
2781 }
2782
2783
2784 /*
2785   destroy a ctdb_client_ip structure
2786  */
2787 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2788 {
2789         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2790                 ctdb_addr_to_str(&ip->addr),
2791                 ntohs(ip->addr.ip.sin_port),
2792                 ip->client_id));
2793
2794         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2795         return 0;
2796 }
2797
2798 /*
2799   called by a client to inform us of a TCP connection that it is managing
2800   that should tickled with an ACK when IP takeover is done
2801  */
2802 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2803                                 TDB_DATA indata)
2804 {
2805         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2806         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2807         struct ctdb_tcp_list *tcp;
2808         struct ctdb_tcp_connection t;
2809         int ret;
2810         TDB_DATA data;
2811         struct ctdb_client_ip *ip;
2812         struct ctdb_vnn *vnn;
2813         ctdb_sock_addr addr;
2814
2815         /* If we don't have public IPs, tickles are useless */
2816         if (ctdb->vnn == NULL) {
2817                 return 0;
2818         }
2819
2820         tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2821
2822         addr = tcp_sock->src;
2823         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2824         addr = tcp_sock->dest;
2825         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2826
2827         ZERO_STRUCT(addr);
2828         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2829         vnn = find_public_ip_vnn(ctdb, &addr);
2830         if (vnn == NULL) {
2831                 switch (addr.sa.sa_family) {
2832                 case AF_INET:
2833                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2834                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2835                                         ctdb_addr_to_str(&addr)));
2836                         }
2837                         break;
2838                 case AF_INET6:
2839                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2840                                 ctdb_addr_to_str(&addr)));
2841                         break;
2842                 default:
2843                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2844                 }
2845
2846                 return 0;
2847         }
2848
2849         if (vnn->pnn != ctdb->pnn) {
2850                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2851                         ctdb_addr_to_str(&addr),
2852                         client_id, client->pid));
2853                 /* failing this call will tell smbd to die */
2854                 return -1;
2855         }
2856
2857         ip = talloc(client, struct ctdb_client_ip);
2858         CTDB_NO_MEMORY(ctdb, ip);
2859
2860         ip->ctdb      = ctdb;
2861         ip->addr      = addr;
2862         ip->client_id = client_id;
2863         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2864         DLIST_ADD(ctdb->client_ip_list, ip);
2865
2866         tcp = talloc(client, struct ctdb_tcp_list);
2867         CTDB_NO_MEMORY(ctdb, tcp);
2868
2869         tcp->connection.src_addr = tcp_sock->src;
2870         tcp->connection.dst_addr = tcp_sock->dest;
2871
2872         DLIST_ADD(client->tcp_list, tcp);
2873
2874         t.src_addr = tcp_sock->src;
2875         t.dst_addr = tcp_sock->dest;
2876
2877         data.dptr = (uint8_t *)&t;
2878         data.dsize = sizeof(t);
2879
2880         switch (addr.sa.sa_family) {
2881         case AF_INET:
2882                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2883                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2884                         ctdb_addr_to_str(&tcp_sock->src),
2885                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2886                 break;
2887         case AF_INET6:
2888                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2889                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2890                         ctdb_addr_to_str(&tcp_sock->src),
2891                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2892                 break;
2893         default:
2894                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2895         }
2896
2897
2898         /* tell all nodes about this tcp connection */
2899         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2900                                        CTDB_CONTROL_TCP_ADD,
2901                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2902         if (ret != 0) {
2903                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2904                 return -1;
2905         }
2906
2907         return 0;
2908 }
2909
2910 /*
2911   find a tcp address on a list
2912  */
2913 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2914                                            struct ctdb_tcp_connection *tcp)
2915 {
2916         int i;
2917
2918         if (array == NULL) {
2919                 return NULL;
2920         }
2921
2922         for (i=0;i<array->num;i++) {
2923                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2924                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2925                         return &array->connections[i];
2926                 }
2927         }
2928         return NULL;
2929 }
2930
2931
2932
2933 /*
2934   called by a daemon to inform us of a TCP connection that one of its
2935   clients is managing and that should be tickled with an ACK when IP
2936   takeover is done
2937  */
2938 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2939 {
2940         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2941         struct ctdb_tcp_array *tcparray;
2942         struct ctdb_tcp_connection tcp;
2943         struct ctdb_vnn *vnn;
2944
2945         /* If we don't have public IPs, tickles are useless */
2946         if (ctdb->vnn == NULL) {
2947                 return 0;
2948         }
2949
2950         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2951         if (vnn == NULL) {
2952                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2953                         ctdb_addr_to_str(&p->dst_addr)));
2954
2955                 return -1;
2956         }
2957
2958
2959         tcparray = vnn->tcp_array;
2960
2961         /* If this is the first tickle */
2962         if (tcparray == NULL) {
2963                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2964                 CTDB_NO_MEMORY(ctdb, tcparray);
2965                 vnn->tcp_array = tcparray;
2966
2967                 tcparray->num = 0;
2968                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2969                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2970
2971                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2972                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2973                 tcparray->num++;