ctdbd: Removed bogus comment in ctdb_find_iface()
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Absolute timeout derived from the takeover_timeout tunable.  The
 * expansion refers to a variable called "ctdb", so one must be in
 * scope wherever this macro is used.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* Gratuitous ARP schedule: CTDB_ARP_INTERVAL is the first argument
 * given to timeval_current_ofs() when re-arming the ARP timer and
 * CTDB_ARP_REPEAT is the number of rounds sent before giving up.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3

/* These flags are ONLY valid within IP allocation code and must be
 * cleared to avoid confusing other recovery daemon functions
 */
/* can not takeover additional IPs */
#define NODE_FLAGS_NOIPTAKEOVER         0x01000000
/* can not host IPs */
#define NODE_FLAGS_NOIPHOST             0x02000000
41
/* A local network interface that public addresses may be placed on.
 * Instances are linked into the list headed by ctdb->ifaces.
 */
struct ctdb_iface {
        /* doubly-linked list pointers */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name, compared with strcmp() when searching */
        const char *name;
        /* whether the link is currently regarded as usable */
        bool link_up;
        /* number of VNNs currently assigned to this interface */
        uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = ctdb->done_startup;
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->iface && vnn->iface->link_up) {
264                 return true;
265         }
266
267         for (i=0; vnn->ifaces[i]; i++) {
268                 struct ctdb_iface *cur;
269
270                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271                 if (cur == NULL) {
272                         continue;
273                 }
274
275                 if (cur->link_up) {
276                         return true;
277                 }
278         }
279
280         return false;
281 }
282
283 struct ctdb_takeover_arp {
284         struct ctdb_context *ctdb;
285         uint32_t count;
286         ctdb_sock_addr addr;
287         struct ctdb_tcp_array *tcparray;
288         struct ctdb_vnn *vnn;
289 };
290
291
292 /*
293   lists of tcp endpoints
294  */
295 struct ctdb_tcp_list {
296         struct ctdb_tcp_list *prev, *next;
297         struct ctdb_tcp_connection connection;
298 };
299
300 /*
301   list of clients to kill on IP release
302  */
303 struct ctdb_client_ip {
304         struct ctdb_client_ip *prev, *next;
305         struct ctdb_context *ctdb;
306         ctdb_sock_addr addr;
307         uint32_t client_id;
308 };
309
310
311 /*
312   send a gratuitous arp
313  */
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
315                                   struct timeval t, void *private_data)
316 {
317         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
318                                                         struct ctdb_takeover_arp);
319         int i, ret;
320         struct ctdb_tcp_array *tcparray;
321         const char *iface = ctdb_vnn_iface_string(arp->vnn);
322
323         ret = ctdb_sys_send_arp(&arp->addr, iface);
324         if (ret != 0) {
325                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326                                   iface, strerror(errno)));
327         }
328
329         tcparray = arp->tcparray;
330         if (tcparray) {
331                 for (i=0;i<tcparray->num;i++) {
332                         struct ctdb_tcp_connection *tcon;
333
334                         tcon = &tcparray->connections[i];
335                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
337                                 ctdb_addr_to_str(&tcon->src_addr),
338                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339                         ret = ctdb_sys_send_tcp(
340                                 &tcon->src_addr, 
341                                 &tcon->dst_addr,
342                                 0, 0, 0);
343                         if (ret != 0) {
344                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345                                         ctdb_addr_to_str(&tcon->src_addr)));
346                         }
347                 }
348         }
349
350         arp->count++;
351
352         if (arp->count == CTDB_ARP_REPEAT) {
353                 talloc_free(arp);
354                 return;
355         }
356
357         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
358                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
359                         ctdb_control_send_arp, arp);
360 }
361
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363                                        struct ctdb_vnn *vnn)
364 {
365         struct ctdb_takeover_arp *arp;
366         struct ctdb_tcp_array *tcparray;
367
368         if (!vnn->takeover_ctx) {
369                 vnn->takeover_ctx = talloc_new(vnn);
370                 if (!vnn->takeover_ctx) {
371                         return -1;
372                 }
373         }
374
375         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376         if (!arp) {
377                 return -1;
378         }
379
380         arp->ctdb = ctdb;
381         arp->addr = vnn->public_address;
382         arp->vnn  = vnn;
383
384         tcparray = vnn->tcp_array;
385         if (tcparray) {
386                 /* add all of the known tcp connections for this IP to the
387                    list of tcp connections to send tickle acks for */
388                 arp->tcparray = talloc_steal(arp, tcparray);
389
390                 vnn->tcp_array = NULL;
391                 vnn->tcp_update_needed = true;
392         }
393
394         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395                         timeval_zero(), ctdb_control_send_arp, arp);
396
397         return 0;
398 }
399
400 struct takeover_callback_state {
401         struct ctdb_req_control *c;
402         ctdb_sock_addr *addr;
403         struct ctdb_vnn *vnn;
404 };
405
/* State carried from ctdb_do_takeip() to ctdb_do_takeip_callback() */
struct ctdb_do_takeip_state {
        /* the control request to reply to once the event finishes */
        struct ctdb_req_control *c;
        /* the VNN being taken over */
        struct ctdb_vnn *vnn;
};
410
411 /*
412   called when takeip event finishes
413  */
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415                                     void *private_data)
416 {
417         struct ctdb_do_takeip_state *state =
418                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419         int32_t ret;
420         TDB_DATA data;
421
422         if (status != 0) {
423                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424         
425                 if (status == -ETIME) {
426                         ctdb_ban_self(ctdb);
427                 }
428                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429                                  ctdb_addr_to_str(&state->vnn->public_address),
430                                  ctdb_vnn_iface_string(state->vnn)));
431                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433                 node->flags |= NODE_FLAGS_UNHEALTHY;
434                 talloc_free(state);
435                 return;
436         }
437
438         if (ctdb->do_checkpublicip) {
439
440         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441         if (ret != 0) {
442                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443                 talloc_free(state);
444                 return;
445         }
446
447         }
448
449         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450         data.dsize = strlen((char *)data.dptr) + 1;
451         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456         /* the control succeeded */
457         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458         talloc_free(state);
459         return;
460 }
461
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463 {
464         state->vnn->update_in_flight = false;
465         return 0;
466 }
467
468 /*
469   take over an ip address
470  */
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472                               struct ctdb_req_control *c,
473                               struct ctdb_vnn *vnn)
474 {
475         int ret;
476         struct ctdb_do_takeip_state *state;
477
478         if (vnn->update_in_flight) {
479                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480                                     "update for this IP already in flight\n",
481                                     ctdb_addr_to_str(&vnn->public_address),
482                                     vnn->public_netmask_bits));
483                 return -1;
484         }
485
486         ret = ctdb_vnn_assign_iface(ctdb, vnn);
487         if (ret != 0) {
488                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489                                  "assign a usable interface\n",
490                                  ctdb_addr_to_str(&vnn->public_address),
491                                  vnn->public_netmask_bits));
492                 return -1;
493         }
494
495         state = talloc(vnn, struct ctdb_do_takeip_state);
496         CTDB_NO_MEMORY(ctdb, state);
497
498         state->c = talloc_steal(ctdb, c);
499         state->vnn   = vnn;
500
501         vnn->update_in_flight = true;
502         talloc_set_destructor(state, ctdb_takeip_destructor);
503
504         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505                             ctdb_addr_to_str(&vnn->public_address),
506                             vnn->public_netmask_bits,
507                             ctdb_vnn_iface_string(vnn)));
508
509         ret = ctdb_event_script_callback(ctdb,
510                                          state,
511                                          ctdb_do_takeip_callback,
512                                          state,
513                                          false,
514                                          CTDB_EVENT_TAKE_IP,
515                                          "%s %s %u",
516                                          ctdb_vnn_iface_string(vnn),
517                                          ctdb_addr_to_str(&vnn->public_address),
518                                          vnn->public_netmask_bits);
519
520         if (ret != 0) {
521                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
522                         ctdb_addr_to_str(&vnn->public_address),
523                         ctdb_vnn_iface_string(vnn)));
524                 talloc_free(state);
525                 return -1;
526         }
527
528         return 0;
529 }
530
/* State carried from ctdb_do_updateip() to ctdb_do_updateip_callback() */
struct ctdb_do_updateip_state {
        /* the control request to reply to once the event finishes */
        struct ctdb_req_control *c;
        /* interface the address was on before the move; restored on failure */
        struct ctdb_iface *old;
        /* the VNN being moved */
        struct ctdb_vnn *vnn;
};
536
537 /*
538   called when updateip event finishes
539  */
540 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541                                       void *private_data)
542 {
543         struct ctdb_do_updateip_state *state =
544                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545         int32_t ret;
546
547         if (status != 0) {
548                 if (status == -ETIME) {
549                         ctdb_ban_self(ctdb);
550                 }
551                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552                         ctdb_addr_to_str(&state->vnn->public_address),
553                         state->old->name,
554                         ctdb_vnn_iface_string(state->vnn)));
555
556                 /*
557                  * All we can do is reset the old interface
558                  * and let the next run fix it
559                  */
560                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561                 state->vnn->iface = state->old;
562                 state->vnn->iface->references++;
563
564                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565                 talloc_free(state);
566                 return;
567         }
568
569         if (ctdb->do_checkpublicip) {
570
571         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572         if (ret != 0) {
573                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574                 talloc_free(state);
575                 return;
576         }
577
578         }
579
580         /* the control succeeded */
581         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582         talloc_free(state);
583         return;
584 }
585
586 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
587 {
588         state->vnn->update_in_flight = false;
589         return 0;
590 }
591
592 /*
593   update (move) an ip address
594  */
595 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596                                 struct ctdb_req_control *c,
597                                 struct ctdb_vnn *vnn)
598 {
599         int ret;
600         struct ctdb_do_updateip_state *state;
601         struct ctdb_iface *old = vnn->iface;
602         const char *new_name;
603
604         if (vnn->update_in_flight) {
605                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606                                     "update for this IP already in flight\n",
607                                     ctdb_addr_to_str(&vnn->public_address),
608                                     vnn->public_netmask_bits));
609                 return -1;
610         }
611
612         ctdb_vnn_unassign_iface(ctdb, vnn);
613         ret = ctdb_vnn_assign_iface(ctdb, vnn);
614         if (ret != 0) {
615                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616                                  "assin a usable interface (old iface '%s')\n",
617                                  ctdb_addr_to_str(&vnn->public_address),
618                                  vnn->public_netmask_bits,
619                                  old->name));
620                 return -1;
621         }
622
623         new_name = ctdb_vnn_iface_string(vnn);
624         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625                 /* A benign update from one interface onto itself.
626                  * no need to run the eventscripts in this case, just return
627                  * success.
628                  */
629                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630                 return 0;
631         }
632
633         state = talloc(vnn, struct ctdb_do_updateip_state);
634         CTDB_NO_MEMORY(ctdb, state);
635
636         state->c = talloc_steal(ctdb, c);
637         state->old = old;
638         state->vnn = vnn;
639
640         vnn->update_in_flight = true;
641         talloc_set_destructor(state, ctdb_updateip_destructor);
642
643         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644                             "interface %s to %s\n",
645                             ctdb_addr_to_str(&vnn->public_address),
646                             vnn->public_netmask_bits,
647                             old->name,
648                             new_name));
649
650         ret = ctdb_event_script_callback(ctdb,
651                                          state,
652                                          ctdb_do_updateip_callback,
653                                          state,
654                                          false,
655                                          CTDB_EVENT_UPDATE_IP,
656                                          "%s %s %s %u",
657                                          state->old->name,
658                                          new_name,
659                                          ctdb_addr_to_str(&vnn->public_address),
660                                          vnn->public_netmask_bits);
661         if (ret != 0) {
662                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
663                                  ctdb_addr_to_str(&vnn->public_address),
664                                  old->name, new_name));
665                 talloc_free(state);
666                 return -1;
667         }
668
669         return 0;
670 }
671
672 /*
673   Find the vnn of the node that has a public ip address
674   returns -1 if the address is not known as a public address
675  */
676 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
677 {
678         struct ctdb_vnn *vnn;
679
680         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681                 if (ctdb_same_ip(&vnn->public_address, addr)) {
682                         return vnn;
683                 }
684         }
685
686         return NULL;
687 }
688
689 /*
690   take over an ip address
691  */
692 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693                                  struct ctdb_req_control *c,
694                                  TDB_DATA indata,
695                                  bool *async_reply)
696 {
697         int ret;
698         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699         struct ctdb_vnn *vnn;
700         bool have_ip = false;
701         bool do_updateip = false;
702         bool do_takeip = false;
703         struct ctdb_iface *best_iface = NULL;
704
705         if (pip->pnn != ctdb->pnn) {
706                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707                                  "with pnn %d, but we're node %d\n",
708                                  ctdb_addr_to_str(&pip->addr),
709                                  pip->pnn, ctdb->pnn));
710                 return -1;
711         }
712
713         /* update out vnn list */
714         vnn = find_public_ip_vnn(ctdb, &pip->addr);
715         if (vnn == NULL) {
716                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717                         ctdb_addr_to_str(&pip->addr)));
718                 return 0;
719         }
720
721         if (ctdb->do_checkpublicip) {
722                 have_ip = ctdb_sys_have_ip(&pip->addr);
723         }
724         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725         if (best_iface == NULL) {
726                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727                                  "a usable interface (old %s, have_ip %d)\n",
728                                  ctdb_addr_to_str(&vnn->public_address),
729                                  vnn->public_netmask_bits,
730                                  ctdb_vnn_iface_string(vnn),
731                                  have_ip));
732                 return -1;
733         }
734
735         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737                 have_ip = false;
738         }
739
740
741         if (vnn->iface == NULL && have_ip) {
742                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744                                  ctdb_addr_to_str(&vnn->public_address)));
745                 return 0;
746         }
747
748         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750                                   "and we have it on iface[%s], but it was assigned to node %d"
751                                   "and we are node %d, banning ourself\n",
752                                  ctdb_addr_to_str(&vnn->public_address),
753                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754                 ctdb_ban_self(ctdb);
755                 return -1;
756         }
757
758         if (vnn->pnn == -1 && have_ip) {
759                 vnn->pnn = ctdb->pnn;
760                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761                                   "and we already have it on iface[%s], update local daemon\n",
762                                  ctdb_addr_to_str(&vnn->public_address),
763                                   ctdb_vnn_iface_string(vnn)));
764                 return 0;
765         }
766
767         if (vnn->iface) {
768                 if (vnn->iface != best_iface) {
769                         if (!vnn->iface->link_up) {
770                                 do_updateip = true;
771                         } else if (vnn->iface->references > (best_iface->references + 1)) {
772                                 /* only move when the rebalance gains something */
773                                         do_updateip = true;
774                         }
775                 }
776         }
777
778         if (!have_ip) {
779                 if (do_updateip) {
780                         ctdb_vnn_unassign_iface(ctdb, vnn);
781                         do_updateip = false;
782                 }
783                 do_takeip = true;
784         }
785
786         if (do_takeip) {
787                 ret = ctdb_do_takeip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else if (do_updateip) {
792                 ret = ctdb_do_updateip(ctdb, c, vnn);
793                 if (ret != 0) {
794                         return -1;
795                 }
796         } else {
797                 /*
798                  * The interface is up and the kernel known the ip
799                  * => do nothing
800                  */
801                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802                         ctdb_addr_to_str(&pip->addr),
803                         vnn->public_netmask_bits,
804                         ctdb_vnn_iface_string(vnn)));
805                 return 0;
806         }
807
808         /* tell ctdb_control.c that we will be replying asynchronously */
809         *async_reply = true;
810
811         return 0;
812 }
813
814 /*
815   takeover an ip address old v4 style
816  */
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
818                                 struct ctdb_req_control *c,
819                                 TDB_DATA indata, 
820                                 bool *async_reply)
821 {
822         TDB_DATA data;
823         
824         data.dsize = sizeof(struct ctdb_public_ip);
825         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826         CTDB_NO_MEMORY(ctdb, data.dptr);
827         
828         memcpy(data.dptr, indata.dptr, indata.dsize);
829         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
830 }
831
832 /*
833   kill any clients that are registered with a IP that is being released
834  */
835 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
836 {
837         struct ctdb_client_ip *ip;
838
839         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
840                 ctdb_addr_to_str(addr)));
841
842         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
843                 ctdb_sock_addr tmp_addr;
844
845                 tmp_addr = ip->addr;
846                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
847                         ip->client_id,
848                         ctdb_addr_to_str(&ip->addr)));
849
850                 if (ctdb_same_ip(&tmp_addr, addr)) {
851                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
852                                                                      ip->client_id, 
853                                                                      struct ctdb_client);
854                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
855                                 ip->client_id,
856                                 ctdb_addr_to_str(&ip->addr),
857                                 client->pid));
858
859                         if (client->pid != 0) {
860                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
861                                         (unsigned)client->pid,
862                                         ctdb_addr_to_str(addr),
863                                         ip->client_id));
864                                 ctdb_kill(ctdb, client->pid, SIGKILL);
865                         }
866                 }
867         }
868 }
869
870 /*
871   called when releaseip event finishes
872  */
873 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
874                                 void *private_data)
875 {
876         struct takeover_callback_state *state = 
877                 talloc_get_type(private_data, struct takeover_callback_state);
878         TDB_DATA data;
879
880         if (status == -ETIME) {
881                 ctdb_ban_self(ctdb);
882         }
883
884         /* send a message to all clients of this node telling them
885            that the cluster has been reconfigured and they should
886            release any sockets on this IP */
887         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
888         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
889         data.dsize = strlen((char *)data.dptr)+1;
890
891         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
892
893         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
894
895         /* kill clients that have registered with this IP */
896         release_kill_clients(ctdb, state->addr);
897
898         ctdb_vnn_unassign_iface(ctdb, state->vnn);
899
900         /* the control succeeded */
901         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
902         talloc_free(state);
903 }
904
905 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
906 {
907         state->vnn->update_in_flight = false;
908         return 0;
909 }
910
911 /*
912   release an ip address
913  */
914 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
915                                 struct ctdb_req_control *c,
916                                 TDB_DATA indata, 
917                                 bool *async_reply)
918 {
919         int ret;
920         struct takeover_callback_state *state;
921         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
922         struct ctdb_vnn *vnn;
923         char *iface;
924
925         /* update our vnn list */
926         vnn = find_public_ip_vnn(ctdb, &pip->addr);
927         if (vnn == NULL) {
928                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
929                         ctdb_addr_to_str(&pip->addr)));
930                 return 0;
931         }
932         vnn->pnn = pip->pnn;
933
934         /* stop any previous arps */
935         talloc_free(vnn->takeover_ctx);
936         vnn->takeover_ctx = NULL;
937
938         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
939          * lazy multicast to drop an IP from any node that isn't the
940          * intended new node.  The following causes makes ctdbd ignore
941          * a release for any address it doesn't host.
942          */
943         if (ctdb->do_checkpublicip) {
944                 if (!ctdb_sys_have_ip(&pip->addr)) {
945                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
946                                 ctdb_addr_to_str(&pip->addr),
947                                 vnn->public_netmask_bits,
948                                 ctdb_vnn_iface_string(vnn)));
949                         ctdb_vnn_unassign_iface(ctdb, vnn);
950                         return 0;
951                 }
952         } else {
953                 if (vnn->iface == NULL) {
954                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
955                                            ctdb_addr_to_str(&pip->addr),
956                                            vnn->public_netmask_bits));
957                         return 0;
958                 }
959         }
960
961         /* There is a potential race between take_ip and us because we
962          * update the VNN via a callback that run when the
963          * eventscripts have been run.  Avoid the race by allowing one
964          * update to be in flight at a time.
965          */
966         if (vnn->update_in_flight) {
967                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
968                                     "update for this IP already in flight\n",
969                                     ctdb_addr_to_str(&vnn->public_address),
970                                     vnn->public_netmask_bits));
971                 return -1;
972         }
973
974         if (ctdb->do_checkpublicip) {
975                 iface = ctdb_sys_find_ifname(&pip->addr);
976                 if (iface == NULL) {
977                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
978                         return 0;
979                 }
980         } else {
981                 iface = strdup(ctdb_vnn_iface_string(vnn));
982         }
983
984         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
985                 ctdb_addr_to_str(&pip->addr),
986                 vnn->public_netmask_bits,
987                 iface,
988                 pip->pnn));
989
990         state = talloc(ctdb, struct takeover_callback_state);
991         CTDB_NO_MEMORY(ctdb, state);
992
993         state->c = talloc_steal(state, c);
994         state->addr = talloc(state, ctdb_sock_addr);       
995         CTDB_NO_MEMORY(ctdb, state->addr);
996         *state->addr = pip->addr;
997         state->vnn   = vnn;
998
999         vnn->update_in_flight = true;
1000         talloc_set_destructor(state, ctdb_releaseip_destructor);
1001
1002         ret = ctdb_event_script_callback(ctdb, 
1003                                          state, release_ip_callback, state,
1004                                          false,
1005                                          CTDB_EVENT_RELEASE_IP,
1006                                          "%s %s %u",
1007                                          iface,
1008                                          ctdb_addr_to_str(&pip->addr),
1009                                          vnn->public_netmask_bits);
1010         free(iface);
1011         if (ret != 0) {
1012                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1013                         ctdb_addr_to_str(&pip->addr),
1014                         ctdb_vnn_iface_string(vnn)));
1015                 talloc_free(state);
1016                 return -1;
1017         }
1018
1019         /* tell the control that we will be reply asynchronously */
1020         *async_reply = true;
1021         return 0;
1022 }
1023
1024 /*
1025   release an ip address old v4 style
1026  */
1027 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1028                                 struct ctdb_req_control *c,
1029                                 TDB_DATA indata, 
1030                                 bool *async_reply)
1031 {
1032         TDB_DATA data;
1033         
1034         data.dsize = sizeof(struct ctdb_public_ip);
1035         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1036         CTDB_NO_MEMORY(ctdb, data.dptr);
1037         
1038         memcpy(data.dptr, indata.dptr, indata.dsize);
1039         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1040 }
1041
1042
1043 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1044                                    ctdb_sock_addr *addr,
1045                                    unsigned mask, const char *ifaces,
1046                                    bool check_address)
1047 {
1048         struct ctdb_vnn      *vnn;
1049         uint32_t num = 0;
1050         char *tmp;
1051         const char *iface;
1052         int i;
1053         int ret;
1054
1055         tmp = strdup(ifaces);
1056         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1057                 if (!ctdb_sys_check_iface_exists(iface)) {
1058                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1059                         free(tmp);
1060                         return -1;
1061                 }
1062         }
1063         free(tmp);
1064
1065         /* Verify that we dont have an entry for this ip yet */
1066         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1067                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1068                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1069                                 ctdb_addr_to_str(addr)));
1070                         return -1;
1071                 }               
1072         }
1073
1074         /* create a new vnn structure for this ip address */
1075         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1076         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1077         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1078         tmp = talloc_strdup(vnn, ifaces);
1079         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1080         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1081                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1082                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1083                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1084                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1085                 num++;
1086         }
1087         talloc_free(tmp);
1088         vnn->ifaces[num] = NULL;
1089         vnn->public_address      = *addr;
1090         vnn->public_netmask_bits = mask;
1091         vnn->pnn                 = -1;
1092         if (check_address) {
1093                 if (ctdb_sys_have_ip(addr)) {
1094                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1095                         vnn->pnn = ctdb->pnn;
1096                 }
1097         }
1098
1099         for (i=0; vnn->ifaces[i]; i++) {
1100                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1101                 if (ret != 0) {
1102                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1103                                            "for public_address[%s]\n",
1104                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1105                         talloc_free(vnn);
1106                         return -1;
1107                 }
1108         }
1109
1110         DLIST_ADD(ctdb->vnn, vnn);
1111
1112         return 0;
1113 }
1114
1115 /*
1116   setup the event script directory
1117 */
1118 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1119 {
1120         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1121         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1122         return 0;
1123 }
1124
1125 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1126                                   struct timeval t, void *private_data)
1127 {
1128         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1129                                                         struct ctdb_context);
1130         struct ctdb_vnn *vnn;
1131
1132         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1133                 int i;
1134
1135                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1136                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1137                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1138                                         vnn->ifaces[i],
1139                                         ctdb_addr_to_str(&vnn->public_address)));
1140                         }
1141                 }
1142         }
1143
1144         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1145                 timeval_current_ofs(30, 0), 
1146                 ctdb_check_interfaces_event, ctdb);
1147 }
1148
1149
1150 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1151 {
1152         if (ctdb->check_public_ifaces_ctx != NULL) {
1153                 talloc_free(ctdb->check_public_ifaces_ctx);
1154                 ctdb->check_public_ifaces_ctx = NULL;
1155         }
1156
1157         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1158         if (ctdb->check_public_ifaces_ctx == NULL) {
1159                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1160         }
1161
1162         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1163                 timeval_current_ofs(30, 0), 
1164                 ctdb_check_interfaces_event, ctdb);
1165
1166         return 0;
1167 }
1168
1169
1170 /*
1171   setup the public address lists from a file
1172 */
1173 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1174 {
1175         char **lines;
1176         int nlines;
1177         int i;
1178
1179         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1180         if (lines == NULL) {
1181                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1182                 return -1;
1183         }
1184         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1185                 nlines--;
1186         }
1187
1188         for (i=0;i<nlines;i++) {
1189                 unsigned mask;
1190                 ctdb_sock_addr addr;
1191                 const char *addrstr;
1192                 const char *ifaces;
1193                 char *tok, *line;
1194
1195                 line = lines[i];
1196                 while ((*line == ' ') || (*line == '\t')) {
1197                         line++;
1198                 }
1199                 if (*line == '#') {
1200                         continue;
1201                 }
1202                 if (strcmp(line, "") == 0) {
1203                         continue;
1204                 }
1205                 tok = strtok(line, " \t");
1206                 addrstr = tok;
1207                 tok = strtok(NULL, " \t");
1208                 if (tok == NULL) {
1209                         if (NULL == ctdb->default_public_interface) {
1210                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1211                                          i+1));
1212                                 talloc_free(lines);
1213                                 return -1;
1214                         }
1215                         ifaces = ctdb->default_public_interface;
1216                 } else {
1217                         ifaces = tok;
1218                 }
1219
1220                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1221                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1226                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1227                         talloc_free(lines);
1228                         return -1;
1229                 }
1230         }
1231
1232
1233         talloc_free(lines);
1234         return 0;
1235 }
1236
1237 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1238                               const char *iface,
1239                               const char *ip)
1240 {
1241         struct ctdb_vnn *svnn;
1242         struct ctdb_iface *cur = NULL;
1243         bool ok;
1244         int ret;
1245
1246         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1247         CTDB_NO_MEMORY(ctdb, svnn);
1248
1249         svnn->ifaces = talloc_array(svnn, const char *, 2);
1250         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1251         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1252         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1253         svnn->ifaces[1] = NULL;
1254
1255         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1256         if (!ok) {
1257                 talloc_free(svnn);
1258                 return -1;
1259         }
1260
1261         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1262         if (ret != 0) {
1263                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1264                                    "for single_ip[%s]\n",
1265                                    svnn->ifaces[0],
1266                                    ctdb_addr_to_str(&svnn->public_address)));
1267                 talloc_free(svnn);
1268                 return -1;
1269         }
1270
1271         /* assume the single public ip interface is initially "good" */
1272         cur = ctdb_find_iface(ctdb, iface);
1273         if (cur == NULL) {
1274                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1275                 return -1;
1276         }
1277         cur->link_up = true;
1278
1279         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1280         if (ret != 0) {
1281                 talloc_free(svnn);
1282                 return -1;
1283         }
1284
1285         ctdb->single_ip_vnn = svnn;
1286         return 0;
1287 }
1288
1289 /* Given a physical node, return the number of
1290    public addresses that is currently assigned to this node.
1291 */
1292 static int node_ip_coverage(struct ctdb_context *ctdb, 
1293         int32_t pnn,
1294         struct ctdb_public_ip_list *ips)
1295 {
1296         int num=0;
1297
1298         for (;ips;ips=ips->next) {
1299                 if (ips->pnn == pnn) {
1300                         num++;
1301                 }
1302         }
1303         return num;
1304 }
1305
1306
1307 /* Can the given node host the given IP: is the public IP known to the
1308  * node and is NOIPHOST unset?
1309 */
1310 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1311                              struct ctdb_node_map *nodemap,
1312                              struct ctdb_public_ip_list *ip)
1313 {
1314         struct ctdb_all_public_ips *public_ips;
1315         int i;
1316
1317         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPHOST) {
1318                 return false;
1319         }
1320
1321         public_ips = ctdb->nodes[pnn]->available_public_ips;
1322
1323         if (public_ips == NULL) {
1324                 return false;
1325         }
1326
1327         for (i=0;i<public_ips->num;i++) {
1328                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1329                         /* yes, this node can serve this public ip */
1330                         return true;
1331                 }
1332         }
1333
1334         return false;
1335 }
1336
1337 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1338                                  struct ctdb_node_map *nodemap,
1339                                  struct ctdb_public_ip_list *ip)
1340 {
1341         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPTAKEOVER) {
1342                 return false;
1343         }
1344
1345         return can_node_host_ip(ctdb, pnn, nodemap, ip);
1346 }
1347
1348 /* search the node lists list for a node to takeover this ip.
1349    pick the node that currently are serving the least number of ips
1350    so that the ips get spread out evenly.
1351 */
1352 static int find_takeover_node(struct ctdb_context *ctdb, 
1353                 struct ctdb_node_map *nodemap,
1354                 struct ctdb_public_ip_list *ip,
1355                 struct ctdb_public_ip_list *all_ips)
1356 {
1357         int pnn, min=0, num;
1358         int i;
1359
1360         pnn    = -1;
1361         for (i=0;i<nodemap->num;i++) {
1362                 /* verify that this node can serve this ip */
1363                 if (!can_node_takeover_ip(ctdb, i, nodemap, ip)) {
1364                         /* no it couldnt   so skip to the next node */
1365                         continue;
1366                 }
1367
1368                 num = node_ip_coverage(ctdb, i, all_ips);
1369                 /* was this the first node we checked ? */
1370                 if (pnn == -1) {
1371                         pnn = i;
1372                         min  = num;
1373                 } else {
1374                         if (num < min) {
1375                                 pnn = i;
1376                                 min  = num;
1377                         }
1378                 }
1379         }       
1380         if (pnn == -1) {
1381                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1382                         ctdb_addr_to_str(&ip->addr)));
1383
1384                 return -1;
1385         }
1386
1387         ip->pnn = pnn;
1388         return 0;
1389 }
1390
1391 #define IP_KEYLEN       4
1392 static uint32_t *ip_key(ctdb_sock_addr *ip)
1393 {
1394         static uint32_t key[IP_KEYLEN];
1395
1396         bzero(key, sizeof(key));
1397
1398         switch (ip->sa.sa_family) {
1399         case AF_INET:
1400                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1401                 break;
1402         case AF_INET6: {
1403                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1404                 key[0]  = htonl(s6_a32[0]);
1405                 key[1]  = htonl(s6_a32[1]);
1406                 key[2]  = htonl(s6_a32[2]);
1407                 key[3]  = htonl(s6_a32[3]);
1408                 break;
1409         }
1410         default:
1411                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1412                 return key;
1413         }
1414
1415         return key;
1416 }
1417
1418 static void *add_ip_callback(void *parm, void *data)
1419 {
1420         struct ctdb_public_ip_list *this_ip = parm; 
1421         struct ctdb_public_ip_list *prev_ip = data; 
1422
1423         if (prev_ip == NULL) {
1424                 return parm;
1425         }
1426         if (this_ip->pnn == -1) {
1427                 this_ip->pnn = prev_ip->pnn;
1428         }
1429
1430         return parm;
1431 }
1432
1433 static int getips_count_callback(void *param, void *data)
1434 {
1435         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1436         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1437
1438         new_ip->next = *ip_list;
1439         *ip_list     = new_ip;
1440         return 0;
1441 }
1442
1443 static struct ctdb_public_ip_list *
1444 create_merged_ip_list(struct ctdb_context *ctdb)
1445 {
1446         int i, j;
1447         struct ctdb_public_ip_list *ip_list;
1448         struct ctdb_all_public_ips *public_ips;
1449
1450         if (ctdb->ip_tree != NULL) {
1451                 talloc_free(ctdb->ip_tree);
1452                 ctdb->ip_tree = NULL;
1453         }
1454         ctdb->ip_tree = trbt_create(ctdb, 0);
1455
1456         for (i=0;i<ctdb->num_nodes;i++) {
1457                 public_ips = ctdb->nodes[i]->known_public_ips;
1458
1459                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1460                         continue;
1461                 }
1462
1463                 /* there were no public ips for this node */
1464                 if (public_ips == NULL) {
1465                         continue;
1466                 }               
1467
1468                 for (j=0;j<public_ips->num;j++) {
1469                         struct ctdb_public_ip_list *tmp_ip; 
1470
1471                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1472                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1473                         /* Do not use information about IP addresses hosted
1474                          * on other nodes, it may not be accurate */
1475                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1476                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1477                         } else {
1478                                 tmp_ip->pnn = -1;
1479                         }
1480                         tmp_ip->addr = public_ips->ips[j].addr;
1481                         tmp_ip->next = NULL;
1482
1483                         trbt_insertarray32_callback(ctdb->ip_tree,
1484                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1485                                 add_ip_callback,
1486                                 tmp_ip);
1487                 }
1488         }
1489
1490         ip_list = NULL;
1491         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1492
1493         return ip_list;
1494 }
1495
1496 /* 
1497  * This is the length of the longtest common prefix between the IPs.
1498  * It is calculated by XOR-ing the 2 IPs together and counting the
1499  * number of leading zeroes.  The implementation means that all
1500  * addresses end up being 128 bits long.
1501  *
1502  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1503  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1504  * lots of nodes and IP addresses?
1505  */
1506 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1507 {
1508         uint32_t ip1_k[IP_KEYLEN];
1509         uint32_t *t;
1510         int i;
1511         uint32_t x;
1512
1513         uint32_t distance = 0;
1514
1515         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1516         t = ip_key(ip2);
1517         for (i=0; i<IP_KEYLEN; i++) {
1518                 x = ip1_k[i] ^ t[i];
1519                 if (x == 0) {
1520                         distance += 32;
1521                 } else {
1522                         /* Count number of leading zeroes. 
1523                          * FIXME? This could be optimised...
1524                          */
1525                         while ((x & (1 << 31)) == 0) {
1526                                 x <<= 1;
1527                                 distance += 1;
1528                         }
1529                 }
1530         }
1531
1532         return distance;
1533 }
1534
1535 /* Calculate the IP distance for the given IP relative to IPs on the
1536    given node.  The ips argument is generally the all_ips variable
1537    used in the main part of the algorithm.
1538  */
1539 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1540                                   struct ctdb_public_ip_list *ips,
1541                                   int pnn)
1542 {
1543         struct ctdb_public_ip_list *t;
1544         uint32_t d;
1545
1546         uint32_t sum = 0;
1547
1548         for (t=ips; t != NULL; t=t->next) {
1549                 if (t->pnn != pnn) {
1550                         continue;
1551                 }
1552
1553                 /* Optimisation: We never calculate the distance
1554                  * between an address and itself.  This allows us to
1555                  * calculate the effect of removing an address from a
1556                  * node by simply calculating the distance between
1557                  * that address and all of the exitsing addresses.
1558                  * Moreover, we assume that we're only ever dealing
1559                  * with addresses from all_ips so we can identify an
1560                  * address via a pointer rather than doing a more
1561                  * expensive address comparison. */
1562                 if (&(t->addr) == ip) {
1563                         continue;
1564                 }
1565
1566                 d = ip_distance(ip, &(t->addr));
1567                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1568         }
1569
1570         return sum;
1571 }
1572
1573 /* Return the LCP2 imbalance metric for addresses currently assigned
1574    to the given node.
1575  */
1576 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1577 {
1578         struct ctdb_public_ip_list *t;
1579
1580         uint32_t imbalance = 0;
1581
1582         for (t=all_ips; t!=NULL; t=t->next) {
1583                 if (t->pnn != pnn) {
1584                         continue;
1585                 }
1586                 /* Pass the rest of the IPs rather than the whole
1587                    all_ips input list.
1588                 */
1589                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1590         }
1591
1592         return imbalance;
1593 }
1594
1595 /* Allocate any unassigned IPs just by looping through the IPs and
1596  * finding the best node for each.
1597  */
1598 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1599                                       struct ctdb_node_map *nodemap,
1600                                       struct ctdb_public_ip_list *all_ips)
1601 {
1602         struct ctdb_public_ip_list *tmp_ip;
1603
1604         /* loop over all ip's and find a physical node to cover for 
1605            each unassigned ip.
1606         */
1607         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1608                 if (tmp_ip->pnn == -1) {
1609                         if (find_takeover_node(ctdb, nodemap, tmp_ip, all_ips)) {
1610                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1611                                         ctdb_addr_to_str(&tmp_ip->addr)));
1612                         }
1613                 }
1614         }
1615 }
1616
1617 /* Basic non-deterministic rebalancing algorithm.
1618  */
1619 static void basic_failback(struct ctdb_context *ctdb,
1620                            struct ctdb_node_map *nodemap,
1621                            struct ctdb_public_ip_list *all_ips,
1622                            int num_ips)
1623 {
1624         int i;
1625         int maxnode, maxnum, minnode, minnum, num, retries;
1626         struct ctdb_public_ip_list *tmp_ip;
1627
1628         retries = 0;
1629
1630 try_again:
1631         maxnum=0;
1632         minnum=0;
1633
1634         /* for each ip address, loop over all nodes that can serve
1635            this ip and make sure that the difference between the node
1636            serving the most and the node serving the least ip's are
1637            not greater than 1.
1638         */
1639         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1640                 if (tmp_ip->pnn == -1) {
1641                         continue;
1642                 }
1643
1644                 /* Get the highest and lowest number of ips's served by any 
1645                    valid node which can serve this ip.
1646                 */
1647                 maxnode = -1;
1648                 minnode = -1;
1649                 for (i=0;i<nodemap->num;i++) {
1650                         /* only check nodes that can actually serve this ip */
1651                         if (!can_node_takeover_ip(ctdb, i, nodemap, tmp_ip)) {
1652                                 /* no it couldnt   so skip to the next node */
1653                                 continue;
1654                         }
1655
1656                         num = node_ip_coverage(ctdb, i, all_ips);
1657                         if (maxnode == -1) {
1658                                 maxnode = i;
1659                                 maxnum  = num;
1660                         } else {
1661                                 if (num > maxnum) {
1662                                         maxnode = i;
1663                                         maxnum  = num;
1664                                 }
1665                         }
1666                         if (minnode == -1) {
1667                                 minnode = i;
1668                                 minnum  = num;
1669                         } else {
1670                                 if (num < minnum) {
1671                                         minnode = i;
1672                                         minnum  = num;
1673                                 }
1674                         }
1675                 }
1676                 if (maxnode == -1) {
1677                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1678                                 ctdb_addr_to_str(&tmp_ip->addr)));
1679
1680                         continue;
1681                 }
1682
1683                 /* if the spread between the smallest and largest coverage by
1684                    a node is >=2 we steal one of the ips from the node with
1685                    most coverage to even things out a bit.
1686                    try to do this a limited number of times since we dont
1687                    want to spend too much time balancing the ip coverage.
1688                 */
1689                 if ( (maxnum > minnum+1)
1690                      && (retries < (num_ips + 5)) ){
1691                         struct ctdb_public_ip_list *tmp;
1692
1693                         /* Reassign one of maxnode's VNNs */
1694                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1695                                 if (tmp->pnn == maxnode) {
1696                                         (void)find_takeover_node(ctdb, nodemap, tmp, all_ips);
1697                                         retries++;
1698                                         goto try_again;;
1699                                 }
1700                         }
1701                 }
1702         }
1703 }
1704
/* One pending "force rebalance" request, kept in a singly linked list. */
struct ctdb_rebalancenodes {
        /* Next request in the list, or NULL for the last one. */
        struct ctdb_rebalancenodes *next;
        /* Number of the node that should be made a rebalance candidate. */
        uint32_t pnn;
};
/* Head of the pending request list.  lcp2_forcerebalance() pushes new
 * entries at the head; lcp2_init() walks the list, frees each entry and
 * leaves the list empty.
 */
static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1710
1711
1712 /* set this flag to force the node to be rebalanced even if it just didnt
1713    become healthy again.
1714 */
1715 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1716 {
1717         struct ctdb_rebalancenodes *rebalance;
1718
1719         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1720                 if (rebalance->pnn == pnn) {
1721                         return;
1722                 }
1723         }
1724
1725         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1726         rebalance->pnn = pnn;
1727         rebalance->next = force_rebalance_list;
1728         force_rebalance_list = rebalance;
1729 }
1730
1731 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1732  * that we can unit test it.
1733  */
1734 static void lcp2_init(struct ctdb_context * tmp_ctx,
1735                struct ctdb_node_map * nodemap,
1736                struct ctdb_public_ip_list *all_ips,
1737                uint32_t **lcp2_imbalances,
1738                bool **rebalance_candidates)
1739 {
1740         int i;
1741         struct ctdb_public_ip_list *tmp_ip;
1742
1743         *rebalance_candidates = talloc_array(tmp_ctx, bool, nodemap->num);
1744         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1745         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1746         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1747
1748         for (i=0;i<nodemap->num;i++) {
1749                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1750                 /* First step: assume all nodes are candidates */
1751                 (*rebalance_candidates)[i] = true;
1752         }
1753
1754         /* 2nd step: if a node has IPs assigned then it must have been
1755          * healthy before, so we remove it from consideration.  This
1756          * is overkill but is all we have because we don't maintain
1757          * state between takeover runs.  An alternative would be to
1758          * keep state and invalidate it every time the recovery master
1759          * changes.
1760          */
1761         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1762                 if (tmp_ip->pnn != -1) {
1763                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1764                 }
1765         }
1766
1767         /* 3rd step: if a node is forced to re-balance then
1768            we allow failback onto the node */
1769         while (force_rebalance_list != NULL) {
1770                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1771
1772                 if (force_rebalance_list->pnn <= nodemap->num) {
1773                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1774                 }
1775
1776                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1777                 talloc_free(force_rebalance_list);
1778                 force_rebalance_list = next;
1779         }
1780 }
1781
1782 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1783  * the IP/node combination that will cost the least.
1784  */
1785 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1786                                      struct ctdb_node_map *nodemap,
1787                                      struct ctdb_public_ip_list *all_ips,
1788                                      uint32_t *lcp2_imbalances)
1789 {
1790         struct ctdb_public_ip_list *tmp_ip;
1791         int dstnode;
1792
1793         int minnode;
1794         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1795         struct ctdb_public_ip_list *minip;
1796
1797         bool should_loop = true;
1798         bool have_unassigned = true;
1799
1800         while (have_unassigned && should_loop) {
1801                 should_loop = false;
1802
1803                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1804                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1805
1806                 minnode = -1;
1807                 mindsum = 0;
1808                 minip = NULL;
1809
1810                 /* loop over each unassigned ip. */
1811                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1812                         if (tmp_ip->pnn != -1) {
1813                                 continue;
1814                         }
1815
1816                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1817                                 /* only check nodes that can actually takeover this ip */
1818                                 if (!can_node_takeover_ip(ctdb, dstnode,
1819                                                           nodemap, tmp_ip)) {
1820                                         /* no it couldnt   so skip to the next node */
1821                                         continue;
1822                                 }
1823
1824                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1825                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1826                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1827                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1828                                                    dstnode,
1829                                                    dstimbl - lcp2_imbalances[dstnode]));
1830
1831
1832                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1833                                         minnode = dstnode;
1834                                         minimbl = dstimbl;
1835                                         mindsum = dstdsum;
1836                                         minip = tmp_ip;
1837                                         should_loop = true;
1838                                 }
1839                         }
1840                 }
1841
1842                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1843
1844                 /* If we found one then assign it to the given node. */
1845                 if (minnode != -1) {
1846                         minip->pnn = minnode;
1847                         lcp2_imbalances[minnode] = minimbl;
1848                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1849                                           ctdb_addr_to_str(&(minip->addr)),
1850                                           minnode,
1851                                           mindsum));
1852                 }
1853
1854                 /* There might be a better way but at least this is clear. */
1855                 have_unassigned = false;
1856                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1857                         if (tmp_ip->pnn == -1) {
1858                                 have_unassigned = true;
1859                         }
1860                 }
1861         }
1862
1863         /* We know if we have an unassigned addresses so we might as
1864          * well optimise.
1865          */
1866         if (have_unassigned) {
1867                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1868                         if (tmp_ip->pnn == -1) {
1869                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1870                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1871                         }
1872                 }
1873         }
1874 }
1875
1876 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1877  * to move IPs from, determines the best IP/destination node
1878  * combination to move from the source node.
1879  */
1880 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1881                                     struct ctdb_node_map *nodemap,
1882                                     struct ctdb_public_ip_list *all_ips,
1883                                     int srcnode,
1884                                     uint32_t candimbl,
1885                                     uint32_t *lcp2_imbalances,
1886                                     bool *rebalance_candidates)
1887 {
1888         int dstnode, mindstnode;
1889         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1890         uint32_t minsrcimbl, mindstimbl;
1891         struct ctdb_public_ip_list *minip;
1892         struct ctdb_public_ip_list *tmp_ip;
1893
1894         /* Find an IP and destination node that best reduces imbalance. */
1895         minip = NULL;
1896         minsrcimbl = 0;
1897         mindstnode = -1;
1898         mindstimbl = 0;
1899
1900         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1901         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1902
1903         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1904                 /* Only consider addresses on srcnode. */
1905                 if (tmp_ip->pnn != srcnode) {
1906                         continue;
1907                 }
1908
1909                 /* What is this IP address costing the source node? */
1910                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1911                 srcimbl = candimbl - srcdsum;
1912
1913                 /* Consider this IP address would cost each potential
1914                  * destination node.  Destination nodes are limited to
1915                  * those that are newly healthy, since we don't want
1916                  * to do gratuitous failover of IPs just to make minor
1917                  * balance improvements.
1918                  */
1919                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1920                         if (!rebalance_candidates[dstnode]) {
1921                                 continue;
1922                         }
1923
1924                         /* only check nodes that can actually takeover this ip */
1925                         if (!can_node_takeover_ip(ctdb, dstnode,
1926                                                   nodemap, tmp_ip)) {
1927                                 /* no it couldnt   so skip to the next node */
1928                                 continue;
1929                         }
1930
1931                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1932                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1933                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1934                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1935                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1936                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1937
1938                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1939                             ((mindstnode == -1) ||                              \
1940                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1941
1942                                 minip = tmp_ip;
1943                                 minsrcimbl = srcimbl;
1944                                 mindstnode = dstnode;
1945                                 mindstimbl = dstimbl;
1946                         }
1947                 }
1948         }
1949         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1950
1951         if (mindstnode != -1) {
1952                 /* We found a move that makes things better... */
1953                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1954                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1955                                   ctdb_addr_to_str(&(minip->addr)),
1956                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1957
1958
1959                 lcp2_imbalances[srcnode] = srcimbl;
1960                 lcp2_imbalances[mindstnode] = mindstimbl;
1961                 minip->pnn = mindstnode;
1962
1963                 return true;
1964         }
1965
1966         return false;
1967         
1968 }
1969
/* A node number paired with its LCP2 imbalance, used for sorting. */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders by
 * imbalance, largest first: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.  The pnn field
 * is not consulted.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn * lhs = a;
        const struct lcp2_imbalance_pnn * rhs = b;

        /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
        return (rhs->imbalance > lhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
1988
1989 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1990  * node with the highest LCP2 imbalance, and then determines the best
1991  * IP/destination node combination to move from the source node.
1992  */
1993 static void lcp2_failback(struct ctdb_context *ctdb,
1994                           struct ctdb_node_map *nodemap,
1995                           struct ctdb_public_ip_list *all_ips,
1996                           uint32_t *lcp2_imbalances,
1997                           bool *rebalance_candidates)
1998 {
1999         int i, num_rebalance_candidates;
2000         struct lcp2_imbalance_pnn * lips;
2001         bool again;
2002
2003 try_again:
2004
2005         /* It is only worth continuing if we have suitable target
2006          * nodes to transfer IPs to.  This check is much cheaper than
2007          * continuing on...
2008          */
2009         num_rebalance_candidates = 0;
2010         for (i = 0; i < nodemap->num; i++) {
2011                 if (rebalance_candidates[i]) {
2012                         num_rebalance_candidates++;
2013                 }
2014         }
2015         if (num_rebalance_candidates == 0) {
2016                 return;
2017         }
2018
2019         /* Put the imbalances and nodes into an array, sort them and
2020          * iterate through candidates.  Usually the 1st one will be
2021          * used, so this doesn't cost much...
2022          */
2023         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2024         for (i = 0; i < nodemap->num; i++) {
2025                 lips[i].imbalance = lcp2_imbalances[i];
2026                 lips[i].pnn = i;
2027         }
2028         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2029               lcp2_cmp_imbalance_pnn);
2030
2031         again = false;
2032         for (i = 0; i < nodemap->num; i++) {
2033                 /* This means that all nodes had 0 or 1 addresses, so
2034                  * can't be imbalanced.
2035                  */
2036                 if (lips[i].imbalance == 0) {
2037                         break;
2038                 }
2039
2040                 if (lcp2_failback_candidate(ctdb,
2041                                             nodemap,
2042                                             all_ips,
2043                                             lips[i].pnn,
2044                                             lips[i].imbalance,
2045                                             lcp2_imbalances,
2046                                             rebalance_candidates)) {
2047                         again = true;
2048                         break;
2049                 }
2050         }
2051
2052         talloc_free(lips);
2053         if (again) {
2054                 goto try_again;
2055         }
2056 }
2057
2058 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2059                                     struct ctdb_node_map *nodemap,
2060                                     struct ctdb_public_ip_list *all_ips)
2061 {
2062         struct ctdb_public_ip_list *tmp_ip;
2063
2064         /* verify that the assigned nodes can serve that public ip
2065            and set it to -1 if not
2066         */
2067         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2068                 if (tmp_ip->pnn == -1) {
2069                         continue;
2070                 }
2071                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2072                                       nodemap, tmp_ip) != 0) {
2073                         /* this node can not serve this ip. */
2074                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2075                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2076                                            tmp_ip->pnn));
2077                         tmp_ip->pnn = -1;
2078                 }
2079         }
2080 }
2081
2082 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2083                                        struct ctdb_node_map *nodemap,
2084                                        struct ctdb_public_ip_list *all_ips)
2085 {
2086         struct ctdb_public_ip_list *tmp_ip;
2087         int i;
2088
2089         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2090        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2091         *  always be allocated the same way for a specific set of
2092         *  available/unavailable nodes.
2093         */
2094
2095         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2096                 tmp_ip->pnn = i%nodemap->num;
2097         }
2098
2099         /* IP failback doesn't make sense with deterministic
2100          * IPs, since the modulo step above implicitly fails
2101          * back IPs to their "home" node.
2102          */
2103         if (1 == ctdb->tunable.no_ip_failback) {
2104                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2105         }
2106
2107         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2108
2109         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2110
2111         /* No failback here! */
2112 }
2113
2114 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2115                                           struct ctdb_node_map *nodemap,
2116                                           struct ctdb_public_ip_list *all_ips)
2117 {
2118         /* This should be pushed down into basic_failback. */
2119         struct ctdb_public_ip_list *tmp_ip;
2120         int num_ips = 0;
2121         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2122                 num_ips++;
2123         }
2124
2125         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2126
2127         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2128
2129         /* If we don't want IPs to fail back then don't rebalance IPs. */
2130         if (1 == ctdb->tunable.no_ip_failback) {
2131                 return;
2132         }
2133
2134         /* Now, try to make sure the ip adresses are evenly distributed
2135            across the nodes.
2136         */
2137         basic_failback(ctdb, nodemap, all_ips, num_ips);
2138 }
2139
2140 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2141                           struct ctdb_node_map *nodemap,
2142                           struct ctdb_public_ip_list *all_ips)
2143 {
2144         uint32_t *lcp2_imbalances;
2145         bool *rebalance_candidates;
2146
2147         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2148
2149         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2150
2151         lcp2_init(tmp_ctx, nodemap, all_ips,
2152                   &lcp2_imbalances, &rebalance_candidates);
2153
2154         lcp2_allocate_unassigned(ctdb, nodemap, all_ips, lcp2_imbalances);
2155
2156         /* If we don't want IPs to fail back then don't rebalance IPs. */
2157         if (1 == ctdb->tunable.no_ip_failback) {
2158                 goto finished;
2159         }
2160
2161         /* Now, try to make sure the ip adresses are evenly distributed
2162            across the nodes.
2163         */
2164         lcp2_failback(ctdb, nodemap, all_ips,
2165                       lcp2_imbalances, rebalance_candidates);
2166
2167 finished:
2168         talloc_free(tmp_ctx);
2169 }
2170
2171 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2172 {
2173         int i, num_healthy;
2174
2175         /* Count how many completely healthy nodes we have */
2176         num_healthy = 0;
2177         for (i=0;i<nodemap->num;i++) {
2178                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2179                         num_healthy++;
2180                 }
2181         }
2182
2183         return num_healthy == 0;
2184 }
2185
2186 /* The calculation part of the IP allocation algorithm. */
2187 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2188                                    struct ctdb_node_map *nodemap,
2189                                    struct ctdb_public_ip_list **all_ips_p)
2190 {
2191         /* since nodes only know about those public addresses that
2192            can be served by that particular node, no single node has
2193            a full list of all public addresses that exist in the cluster.
2194            Walk over all node structures and create a merged list of
2195            all public addresses that exist in the cluster.
2196
2197            keep the tree of ips around as ctdb->ip_tree
2198         */
2199         *all_ips_p = create_merged_ip_list(ctdb);
2200
2201         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2202                 ip_alloc_lcp2(ctdb, nodemap, *all_ips_p);
2203         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2204                 ip_alloc_deterministic_ips(ctdb, nodemap, *all_ips_p);
2205         } else {
2206                 ip_alloc_nondeterministic_ips(ctdb, nodemap, *all_ips_p);
2207         }
2208
2209         /* at this point ->pnn is the node which will own each IP
2210            or -1 if there is no node that can cover this ip
2211         */
2212
2213         return;
2214 }
2215
/* State passed to get_tunable_callback() while one tunable is being
 * collected from the nodes.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being queried; used in log messages. */
        const char *tunable;
        /* Array the callback stores each node's value into, indexed
         * by pnn.  The callback derives its length from
         * talloc_get_size().
         */
        uint32_t *out;
};
2220
2221 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2222                                  int32_t res, TDB_DATA outdata,
2223                                  void *callback)
2224 {
2225         struct get_tunable_callback_data *cd =
2226                 (struct get_tunable_callback_data *)callback;
2227         int size;
2228
2229         if (res != 0) {
2230                 DEBUG(DEBUG_ERR,
2231                       ("Failure to read \"%s\" tunable from remote node %d\n",
2232                        cd->tunable, pnn));
2233                 return;
2234         }
2235
2236         if (outdata.dsize != sizeof(uint32_t)) {
2237                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2238                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2239                                  (int)outdata.dsize));
2240                 return;
2241         }
2242
2243         size = talloc_get_size(cd->out) / sizeof(uint32_t);
2244         if (pnn >= size) {
2245                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2246                                  cd->tunable, pnn, size));
2247                 return;
2248         }
2249
2250                 
2251         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2252 }
2253
2254 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2255                                         TALLOC_CTX *tmp_ctx,
2256                                         struct ctdb_node_map *nodemap,
2257                                         const char *tunable)
2258 {
2259         TDB_DATA data;
2260         struct ctdb_control_get_tunable *t;
2261         uint32_t *nodes;
2262         uint32_t *tvals;
2263         struct get_tunable_callback_data callback_data;
2264
2265         tvals = talloc_zero_array(tmp_ctx, uint32_t, nodemap->num);
2266         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2267         callback_data.out = tvals;
2268         callback_data.tunable = tunable;
2269
2270         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2271         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2272         t = (struct ctdb_control_get_tunable *)data.dptr;
2273         t->length = strlen(tunable)+1;
2274         memcpy(t->name, tunable, t->length);
2275         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2276         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2277                                       nodes, 0, TAKEOVER_TIMEOUT(),
2278                                       false, data,
2279                                       get_tunable_callback, NULL,
2280                                       &callback_data) != 0) {
2281                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get %s tunable failed\n", tunable));
2282         }
2283         talloc_free(nodes);
2284         talloc_free(data.dptr);
2285
2286         return tvals;
2287 }
2288
2289 static void clear_ipflags(struct ctdb_node_map *nodemap)
2290 {
2291         int i;
2292
2293         for (i=0;i<nodemap->num;i++) {
2294                 nodemap->nodes[i].flags &=
2295                         ~(NODE_FLAGS_NOIPTAKEOVER|NODE_FLAGS_NOIPHOST);
2296         }
2297 }
2298
2299
2300 /* Set internal flags for IP allocation:
2301  *   Clear ip flags
2302  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2303  *   Set NOIPHOST ip flag for each INACTIVE node
2304  *   if all nodes are disabled:
2305  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2306  *   else
2307  *     Set NOIPHOST ip flags for disabled nodes
2308  */
2309 static void set_ipflags_internal(struct ctdb_node_map *nodemap,
2310                                  uint32_t *tval_noiptakeover,
2311                                  uint32_t *tval_noiphostonalldisabled)
2312 {
2313         int i;
2314
2315         clear_ipflags(nodemap);
2316
2317         for (i=0;i<nodemap->num;i++) {
2318                 /* Can not take IPs on node with NoIPTakeover set */
2319                 if (tval_noiptakeover[i] != 0) {
2320                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPTAKEOVER;
2321                 }
2322
2323                 /* Can not host IPs on INACTIVE node */
2324                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2325                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2326                 }
2327         }
2328
2329         if (all_nodes_are_disabled(nodemap)) {
2330                 /* If all nodes are disabled, can not host IPs on node
2331                  * with NoIPHostOnAllDisabled set
2332                  */
2333                 for (i=0;i<nodemap->num;i++) {
2334                         if (tval_noiphostonalldisabled[i] != 0) {
2335                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2336                         }
2337                 }
2338         } else {
2339                 /* If some nodes are not disabled, then can not host
2340                  * IPs on DISABLED node
2341                  */
2342                 for (i=0;i<nodemap->num;i++) {
2343                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2344                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2345                         }
2346                 }
2347         }
2348 }
2349
2350 static bool set_ipflags(struct ctdb_context *ctdb,
2351                         TALLOC_CTX *tmp_ctx,
2352                         struct ctdb_node_map *nodemap)
2353 {
2354         uint32_t *tval_noiptakeover;
2355         uint32_t *tval_noiphostonalldisabled;
2356
2357         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2358                                                    "NoIPTakeover");
2359         if (tval_noiptakeover == NULL) {
2360                 return false;
2361         }
2362
2363         tval_noiphostonalldisabled =
2364                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2365                                        "NoIPHostOnAllDisabled");
2366         if (tval_noiphostonalldisabled == NULL) {
2367                 return false;
2368         }
2369
2370         set_ipflags_internal(nodemap,
2371                              tval_noiptakeover, tval_noiphostonalldisabled);
2372
2373         talloc_free(tval_noiptakeover);
2374         talloc_free(tval_noiphostonalldisabled);
2375
2376         return true;
2377 }
2378
2379 /*
2380   make any IP alias changes for public addresses that are necessary 
2381  */
2382 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2383                       client_async_callback fail_callback, void *callback_data)
2384 {
2385         int i;
2386         struct ctdb_public_ip ip;
2387         struct ctdb_public_ipv4 ipv4;
2388         uint32_t *nodes;
2389         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2390         TDB_DATA data;
2391         struct timeval timeout;
2392         struct client_async_data *async_data;
2393         struct ctdb_client_control_state *state;
2394         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2395         uint32_t disable_timeout;
2396
2397         /*
2398          * ip failover is completely disabled, just send out the 
2399          * ipreallocated event.
2400          */
2401         if (ctdb->tunable.disable_ip_failover != 0) {
2402                 goto ipreallocated;
2403         }
2404
2405         if (!set_ipflags(ctdb, tmp_ctx, nodemap)) {
2406                 DEBUG(DEBUG_ERR,("Failed to set IP flags from tunables\n"));
2407                 return -1;
2408         }
2409
2410         ZERO_STRUCT(ip);
2411
2412         /* Do the IP reassignment calculations */
2413         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2414
2415         /* The IP flags need to be cleared because they should never
2416          * be seen outside the IP allocation code.
2417          */
2418         clear_ipflags(nodemap);
2419
2420         /* The recovery daemon does regular sanity checks of the IPs.
2421          * However, sometimes it is overzealous and thinks changes are
2422          * required when they're already underway.  This stops the
2423          * checks for a while before we start moving IPs.
2424          */
2425         disable_timeout = ctdb->tunable.takeover_timeout;
2426         data.dptr  = (uint8_t*)&disable_timeout;
2427         data.dsize = sizeof(disable_timeout);
2428         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2429                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2430                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2431         }
2432
2433         /* now tell all nodes to delete any alias that they should not
2434            have.  This will be a NOOP on nodes that don't currently
2435            hold the given alias */
2436         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2437         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2438
2439         async_data->fail_callback = fail_callback;
2440         async_data->callback_data = callback_data;
2441
2442         for (i=0;i<nodemap->num;i++) {
2443                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2444                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2445                         continue;
2446                 }
2447
2448                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2449                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2450                                 /* This node should be serving this
2451                                    vnn so dont tell it to release the ip
2452                                 */
2453                                 continue;
2454                         }
2455                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2456                                 ipv4.pnn = tmp_ip->pnn;
2457                                 ipv4.sin = tmp_ip->addr.ip;
2458
2459                                 timeout = TAKEOVER_TIMEOUT();
2460                                 data.dsize = sizeof(ipv4);
2461                                 data.dptr  = (uint8_t *)&ipv4;
2462                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2463                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2464                                                 data, async_data,
2465                                                 &timeout, NULL);
2466                         } else {
2467                                 ip.pnn  = tmp_ip->pnn;
2468                                 ip.addr = tmp_ip->addr;
2469
2470                                 timeout = TAKEOVER_TIMEOUT();
2471                                 data.dsize = sizeof(ip);
2472                                 data.dptr  = (uint8_t *)&ip;
2473                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2474                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2475                                                 data, async_data,
2476                                                 &timeout, NULL);
2477                         }
2478
2479                         if (state == NULL) {
2480                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2481                                 talloc_free(tmp_ctx);
2482                                 return -1;
2483                         }
2484                 
2485                         ctdb_client_async_add(async_data, state);
2486                 }
2487         }
2488         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2489                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2490                 talloc_free(tmp_ctx);
2491                 return -1;
2492         }
2493         talloc_free(async_data);
2494
2495
2496         /* tell all nodes to get their own IPs */
2497         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2498         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2499
2500         async_data->fail_callback = fail_callback;
2501         async_data->callback_data = callback_data;
2502
2503         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2504                 if (tmp_ip->pnn == -1) {
2505                         /* this IP won't be taken over */
2506                         continue;
2507                 }
2508
2509                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2510                         ipv4.pnn = tmp_ip->pnn;
2511                         ipv4.sin = tmp_ip->addr.ip;
2512
2513                         timeout = TAKEOVER_TIMEOUT();
2514                         data.dsize = sizeof(ipv4);
2515                         data.dptr  = (uint8_t *)&ipv4;
2516                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2517                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2518                                         data, async_data,
2519                                         &timeout, NULL);
2520                 } else {
2521                         ip.pnn  = tmp_ip->pnn;
2522                         ip.addr = tmp_ip->addr;
2523
2524                         timeout = TAKEOVER_TIMEOUT();
2525                         data.dsize = sizeof(ip);
2526                         data.dptr  = (uint8_t *)&ip;
2527                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2528                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2529                                         data, async_data,
2530                                         &timeout, NULL);
2531                 }
2532                 if (state == NULL) {
2533                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2534                         talloc_free(tmp_ctx);
2535                         return -1;
2536                 }
2537                 
2538                 ctdb_client_async_add(async_data, state);
2539         }
2540         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2541                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2542                 talloc_free(tmp_ctx);
2543                 return -1;
2544         }
2545
2546 ipreallocated:
2547         /* 
2548          * Tell all nodes to run eventscripts to process the
2549          * "ipreallocated" event.  This can do a lot of things,
2550          * including restarting services to reconfigure them if public
2551          * IPs have moved.  Once upon a time this event only used to
2552          * update natwg.
2553          */
2554         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2555         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2556                                       nodes, 0, TAKEOVER_TIMEOUT(),
2557                                       false, tdb_null,
2558                                       NULL, fail_callback,
2559                                       callback_data) != 0) {
2560                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2561         }
2562
2563         talloc_free(tmp_ctx);
2564         return 0;
2565 }
2566
2567
2568 /*
2569   destroy a ctdb_client_ip structure
2570  */
2571 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2572 {
2573         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2574                 ctdb_addr_to_str(&ip->addr),
2575                 ntohs(ip->addr.ip.sin_port),
2576                 ip->client_id));
2577
2578         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2579         return 0;
2580 }
2581
/*
  called by a client to inform us of a TCP connection that it is managing
  that should be tickled with an ACK when IP takeover is done.
  we handle both the old ipv4 style of packets as well as the new ipv4/6
  pdus.

  The payload format is selected purely by indata.dsize; any other size
  is rejected with -1.

  Returns 0 when the connection was registered and the TCP_ADD control
  was broadcast, and also when the destination is not one of our public
  addresses (nothing is registered in that case).  Returns -1 for a
  malformed payload, for a public address this node does not hold, on
  allocation failure, or when the broadcast could not be sent.
 */
int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
                                TDB_DATA indata)
{
        struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
        struct ctdb_control_tcp *old_addr = NULL;
        struct ctdb_control_tcp_addr new_addr;
        struct ctdb_control_tcp_addr *tcp_sock = NULL;
        struct ctdb_tcp_list *tcp;
        struct ctdb_tcp_connection t;
        int ret;
        TDB_DATA data;
        struct ctdb_client_ip *ip;
        struct ctdb_vnn *vnn;
        ctdb_sock_addr addr;

        switch (indata.dsize) {
        case sizeof(struct ctdb_control_tcp):
                /* old ipv4-only payload: convert it into the new
                   layout using a zeroed local copy */
                old_addr = (struct ctdb_control_tcp *)indata.dptr;
                ZERO_STRUCT(new_addr);
                tcp_sock = &new_addr;
                tcp_sock->src.ip  = old_addr->src;
                tcp_sock->dest.ip = old_addr->dest;
                break;
        case sizeof(struct ctdb_control_tcp_addr):
                /* new payload: used in place, directly from indata */
                tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
                break;
        default:
                DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
                                 "to ctdb_control_tcp_client. size was %d but "
                                 "only allowed sizes are %lu and %lu\n",
                                 (int)indata.dsize,
                                 (long unsigned)sizeof(struct ctdb_control_tcp),
                                 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
                return -1;
        }

        /* Each address is copied to a scratch variable first and the
           original slot in tcp_sock is passed as the second argument.
           NOTE(review): presumably ctdb_canonicalize_ip() writes the
           canonical form back into tcp_sock - confirm against its
           definition */
        addr = tcp_sock->src;
        ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
        addr = tcp_sock->dest;
        ctdb_canonicalize_ip(&addr, &tcp_sock->dest);

        /* from here on addr holds the destination address */
        ZERO_STRUCT(addr);
        memcpy(&addr, &tcp_sock->dest, sizeof(addr));
        vnn = find_public_ip_vnn(ctdb, &addr);
        if (vnn == NULL) {
                /* not a public address: log it (except for ipv4
                   loopback) and report success without registering */
                switch (addr.sa.sa_family) {
                case AF_INET:
                        if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
                                DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
                                        ctdb_addr_to_str(&addr)));
                        }
                        break;
                case AF_INET6:
                        DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
                                ctdb_addr_to_str(&addr)));
                        break;
                default:
                        DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
                }

                return 0;
        }

        if (vnn->pnn != ctdb->pnn) {
                DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
                        ctdb_addr_to_str(&addr),
                        client_id, client->pid));
                /* failing this call will tell smbd to die */
                return -1;
        }

        /* record which client is using this public address; the entry
           is a talloc child of the client and unlinks itself from
           ctdb->client_ip_list through its destructor */
        ip = talloc(client, struct ctdb_client_ip);
        CTDB_NO_MEMORY(ctdb, ip);

        ip->ctdb      = ctdb;
        ip->addr      = addr;
        ip->client_id = client_id;
        talloc_set_destructor(ip, ctdb_client_ip_destructor);
        DLIST_ADD(ctdb->client_ip_list, ip);

        /* remember the connection on the client's own tcp_list */
        tcp = talloc(client, struct ctdb_tcp_list);
        CTDB_NO_MEMORY(ctdb, tcp);

        tcp->connection.src_addr = tcp_sock->src;
        tcp->connection.dst_addr = tcp_sock->dest;

        DLIST_ADD(client->tcp_list, tcp);

        /* payload for the TCP_ADD broadcast below */
        t.src_addr = tcp_sock->src;
        t.dst_addr = tcp_sock->dest;

        data.dptr = (uint8_t *)&t;
        data.dsize = sizeof(t);

        switch (addr.sa.sa_family) {
        case AF_INET:
                DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
                        (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
                        ctdb_addr_to_str(&tcp_sock->src),
                        (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
                break;
        case AF_INET6:
                DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
                        (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
                        ctdb_addr_to_str(&tcp_sock->src),
                        (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
                break;
        default:
                DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
        }


        /* tell all nodes about this tcp connection */
        ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
                                       CTDB_CONTROL_TCP_ADD,
                                       0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
                return -1;
        }

        return 0;
}
2711
2712 /*
2713   find a tcp address on a list
2714  */
2715 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2716                                            struct ctdb_tcp_connection *tcp)
2717 {
2718         int i;
2719
2720         if (array == NULL) {
2721                 return NULL;
2722         }
2723
2724         for (i=0;i<array->num;i++) {
2725                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2726                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2727                         return &array->connections[i];
2728                 }
2729         }
2730         return NULL;
2731 }
2732
2733
2734
2735 /*
2736   called by a daemon to inform us of a TCP connection that one of its
2737   clients managing that should tickled with an ACK when IP takeover is
2738   done
2739  */
2740 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2741 {
2742         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2743         struct ctdb_tcp_array *tcparray;
2744         struct ctdb_tcp_connection tcp;
2745         struct ctdb_vnn *vnn;
2746
2747         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2748         if (vnn == NULL) {
2749                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2750                         ctdb_addr_to_str(&p->dst_addr)));
2751
2752                 return -1;
2753         }
2754
2755
2756         tcparray = vnn->tcp_array;
2757
2758         /* If this is the first tickle */
2759         if (tcparray == NULL) {
2760                 tcparray = talloc_size(ctdb->nodes, 
2761                         offsetof(struct ctdb_tcp_array, connections) +
2762                         sizeof(struct ctdb_tcp_connection) * 1);
2763                 CTDB_NO_MEMORY(ctdb, tcparray);
2764                 vnn->tcp_array = tcparray;
2765
2766                 tcparray->num = 0;
2767                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2768                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2769
2770                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2771                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2772                 tcparray->num++;
2773
2774                 if (tcp_update_needed) {
2775                         vnn->tcp_update_needed = true;
2776                 }
2777                 return 0;
2778         }
2779
2780
2781         /* Do we already have this tickle ?*/
2782         tcp.src_addr = p->src_addr;
2783         tcp.dst_addr = p->dst_addr;
2784         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2785                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2786                         ctdb_addr_to_str(&tcp.dst_addr),
2787                         ntohs(tcp.dst_addr.ip.sin_port),
2788                         vnn->pnn));
2789                 return 0;
2790         }
2791
2792         /* A new tickle, we must add it to the array */
2793         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2794                                         struct ctdb_tcp_connection,
2795                                         tcparray->num+1);
2796         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2797
2798         vnn->tcp_array = tcparray;
2799         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2800         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2801         tcparray->num++;
2802                                 
2803         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2804                 ctdb_addr_to_str(&tcp.dst_addr),
2805                 ntohs(tcp.dst_addr.ip.sin_port),
2806                 vnn->pnn));
2807
2808         if (tcp_update_needed) {
2809                 vnn->tcp_update_needed = true;
2810         }
2811
2812         return 0;
2813 }
2814
2815
2816 /*
2817   called by a daemon to inform us of a TCP connection that one of its
2818   clients managing that should tickled with an ACK when IP takeover is
2819   done
2820  */
2821 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2822 {
2823         struct ctdb_tcp_connection *tcpp;
2824         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2825
2826         if (vnn == NULL) {
2827                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2828                         ctdb_addr_to_str(&conn->dst_addr)));
2829                 return;
2830         }
2831
2832         /* if the array is empty we cant remove it
2833            and we dont need to do anything
2834          */
2835         if (vnn->tcp_array == NULL) {
2836                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2837                         ctdb_addr_to_str(&conn->dst_addr),
2838                         ntohs(conn->dst_addr.ip.sin_port)));
2839                 return;
2840         }
2841
2842
2843         /* See if we know this connection
2844            if we dont know this connection  then we dont need to do anything
2845          */
2846         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2847         if (tcpp == NULL) {
2848                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2849                         ctdb_addr_to_str(&conn->dst_addr),
2850                         ntohs(conn->dst_addr.ip.sin_port)));
2851                 return;
2852         }
2853
2854
2855         /* We need to remove this entry from the array.
2856            Instead of allocating a new array and copying data to it
2857            we cheat and just copy the last entry in the existing array
2858            to the entry that is to be removed and just shring the 
2859            ->num field
2860          */
2861         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2862         vnn->tcp_array->num--;
2863
2864         /* If we deleted the last entry we also need to remove the entire array
2865          */
2866         if (vnn->tcp_array->num == 0) {
2867                 talloc_free(vnn->tcp_array);
2868                 vnn->tcp_array = NULL;
2869         }               
2870
2871         vnn->tcp_update_needed = true;
2872
2873         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2874                 ctdb_addr_to_str(&conn->src_addr),
2875                 ntohs(conn->src_addr.ip.sin_port)));
2876 }
2877
2878
2879 /*
2880   called by a daemon to inform us of a TCP connection that one of its
2881   clients used are no longer needed in the tickle database
2882  */
2883 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2884 {
2885         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2886
2887         ctdb_remove_tcp_connection(ctdb, conn);
2888
2889         return 0;
2890 }
2891
2892
/*
  called when a daemon restarts - intended to send all tickles for all
  public addresses we are serving immediately to the new node.

  Currently a stub: the vnn argument is unused, nothing is sent and 0
  is always returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
/*XXX here we should send all tickles we are serving to the new node */
        return 0;
}
2902
2903
2904 /*
2905   called when a client structure goes away - hook to remove
2906   elements from the tcp_list in all daemons
2907  */
2908 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2909 {
2910         while (client->tcp_list) {
2911                 struct ctdb_tcp_list *tcp = client->tcp_list;
2912                 DLIST_REMOVE(client->tcp_list, tcp);
2913                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2914         }
2915 }
2916
2917
2918 /*
2919   release all IPs on shutdown
2920  */
2921 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2922 {
2923         struct ctdb_vnn *vnn;
2924
2925         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2926                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2927                         ctdb_vnn_unassign_iface(ctdb, vnn);
2928                         continue;
2929                 }
2930                 if (!vnn->iface) {
2931                         continue;
2932                 }
2933                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2934                                   ctdb_vnn_iface_string(vnn),
2935                                   ctdb_addr_to_str(&vnn->public_address),
2936                                   vnn->public_netmask_bits);
2937                 release_kill_clients(ctdb, &vnn->public_address);
2938                 ctdb_vnn_unassign_iface(ctdb, vnn);
2939         }
2940 }
2941
2942
2943 /*
2944   get list of public IPs
2945  */
2946 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2947                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2948 {
2949         int i, num, len;
2950         struct ctdb_all_public_ips *ips;
2951         struct ctdb_vnn *vnn;
2952         bool only_available = false;
2953
2954         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2955                 only_available = true;
2956         }
2957
2958         /* count how many public ip structures we have */
2959         num = 0;
2960         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2961                 num++;
2962         }
2963
2964         len = offsetof(struct ctdb_all_public_ips, ips) + 
2965                 num*sizeof(struct ctdb_public_ip);
2966         ips = talloc_zero_size(outdata, len);
2967         CTDB_NO_MEMORY(ctdb, ips);
2968
2969         i = 0;
2970         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2971                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2972                         continue;
2973                 }
2974                 ips->ips[i].pnn  = vnn->pnn;
2975                 ips->ips[i].addr = vnn->public_address;
2976                 i++;
2977         }
2978         ips->num = i;
2979         len = offsetof(struct ctdb_all_public_ips, ips) +
2980                 i*sizeof(struct ctdb_public_ip);
2981
2982         outdata->dsize = len;
2983         outdata->dptr  = (uint8_t *)ips;
2984
2985         return 0;
2986 }
2987
2988
2989 /*
2990   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2991  */
2992 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2993                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2994 {
2995         int i, num, len;
2996         struct ctdb_all_public_ipsv4 *ips;
2997         struct ctdb_vnn *vnn;
2998
2999         /* count how many public ip structures we have */
3000         num = 0;
3001         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3002                 if (vnn->public_address.sa.sa_family != AF_INET) {
3003                         continue;
3004                 }
3005                 num++;
3006         }
3007
3008         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3009                 num*sizeof(struct ctdb_public_ipv4);
3010         ips = talloc_zero_size(outdata, len);
3011         CTDB_NO_MEMORY(ctdb, ips);
3012
3013         outdata->dsize = len;
3014         outdata->dptr  = (uint8_t *)ips;
3015
3016         ips->num = num;
3017         i = 0;
3018         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3019                 if (vnn->public_address.sa.sa_family != AF_INET) {
3020                         continue;
3021                 }
3022                 ips->ips[i].pnn = vnn->pnn;
3023                 ips->ips[i].sin = vnn->public_address.ip;
3024                 i++;
3025         }
3026
3027         return 0;
3028 }
3029
3030 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3031                                         struct ctdb_req_control *c,
3032                                         TDB_DATA indata,
3033                                         TDB_DATA *outdata)
3034 {
3035         int i, num, len;
3036         ctdb_sock_addr *addr;
3037         struct ctdb_control_public_ip_info *info;
3038         struct ctdb_vnn *vnn;
3039
3040         addr = (ctdb_sock_addr *)indata.dptr;
3041
3042         vnn = find_public_ip_vnn(ctdb, addr);
3043         if (vnn == NULL) {
3044                 /* if it is not a public ip   it could be our 'single ip' */
3045                 if (ctdb->single_ip_vnn) {
3046                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3047                                 vnn = ctdb->single_ip_vnn;
3048                         }
3049                 }
3050         }
3051         if (vnn == NULL) {
3052                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3053                                  "'%s'not a public address\n",
3054                                  ctdb_addr_to_str(addr)));
3055                 return -1;
3056         }
3057
3058         /* count how many public ip structures we have */
3059         num = 0;
3060         for (;vnn->ifaces[num];) {
3061                 num++;
3062         }
3063
3064         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3065                 num*sizeof(struct ctdb_control_iface_info);
3066         info = talloc_zero_size(outdata, len);
3067         CTDB_NO_MEMORY(ctdb, info);
3068
3069         info->ip.addr = vnn->public_address;
3070         info->ip.pnn = vnn->pnn;
3071         info->active_idx = 0xFFFFFFFF;
3072
3073         for (i=0; vnn->ifaces[i]; i++) {
3074                 struct ctdb_iface *cur;
3075
3076                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3077                 if (cur == NULL) {
3078                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3079                                            vnn->ifaces[i]));
3080                         return -1;
3081                 }
3082                 if (vnn->iface == cur) {
3083                         info->active_idx = i;
3084                 }
3085                 strcpy(info->ifaces[i].name, cur->name);
3086                 info->ifaces[i].link_state = cur->link_up;
3087                 info->ifaces[i].references = cur->references;
3088         }
3089         info->num = i;
3090         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3091                 i*sizeof(struct ctdb_control_iface_info);
3092
3093         outdata->dsize = len;
3094         outdata->dptr  = (uint8_t *)info;
3095
3096         return 0;
3097 }
3098
3099 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3100                                 struct ctdb_req_control *c,
3101                                 TDB_DATA *outdata)
3102 {
3103         int i, num, len;
3104         struct ctdb_control_get_ifaces *ifaces;
3105         struct ctdb_iface *cur;
3106
3107         /* count how many public ip structures we have */
3108         num = 0;
3109         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3110                 num++;
3111         }
3112
3113         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3114                 num*sizeof(struct ctdb_control_iface_info);
3115         ifaces = talloc_zero_size(outdata, len);
3116         CTDB_NO_MEMORY(ctdb, ifaces);
3117
3118         i = 0;
3119         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3120                 strcpy(ifaces->ifaces[i].name, cur->name);
3121                 ifaces->ifaces[i].link_state = cur->link_up;
3122                 ifaces->ifaces[i].references = cur->references;
3123                 i++;
3124         }
3125         ifaces->num = i;
3126         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3127                 i*sizeof(struct ctdb_control_iface_info);
3128
3129         outdata->dsize = len;
3130         outdata->dptr  = (uint8_t *)ifaces;
3131
3132         return 0;
3133 }
3134
3135 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3136                                     struct ctdb_req_control *c,
3137                                     TDB_DATA indata)
3138 {
3139         struct ctdb_control_iface_info *info;
3140         struct ctdb_iface *iface;
3141         bool link_up = false;
3142
3143         info = (struct ctdb_control_iface_info *)indata.dptr;
3144
3145         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3146                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3147                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3148                                   len, len, info->name));
3149                 return -1;
3150         }
3151
3152         switch (info->link_state) {
3153         case 0:
3154                 link_up = false;
3155                 break;
3156         case 1:
3157                 link_up = true;
3158                 break;
3159         default:
3160                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3161                                   (unsigned int)info->link_state));
3162                 return -1;
3163         }
3164
3165         if (info->references != 0) {
3166                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3167                                   (unsigned int)info->references));
3168                 return -1;
3169         }
3170
3171         iface = ctdb_find_iface(ctdb, info->name);
3172         if (iface == NULL) {
3173                 return -1;
3174         }
3175
3176         if (link_up == iface->link_up) {
3177                 return 0;
3178         }
3179
3180         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3181               ("iface[%s] has changed it's link status %s => %s\n",
3182                iface->name,
3183                iface->link_up?"up":"down",
3184                link_up?"up":"down"));
3185
3186         iface->link_up = link_up;
3187         return 0;
3188 }
3189
3190
3191 /* 
3192    structure containing the listening socket and the list of tcp connections
3193    that the ctdb daemon is to kill
3194 */
3195 struct ctdb_kill_tcp {
3196         struct ctdb_vnn *vnn;
3197         struct ctdb_context *ctdb;
3198         int capture_fd;
3199         struct fd_event *fde;
3200         trbt_tree_t *connections;
3201         void *private_data;
3202 };
3203
3204 /*
3205   a tcp connection that is to be killed
3206  */
3207 struct ctdb_killtcp_con {
3208         ctdb_sock_addr src_addr;
3209         ctdb_sock_addr dst_addr;
3210         int count;
3211         struct ctdb_kill_tcp *killtcp;
3212 };
3213
3214 /* this function is used to create a key to represent this socketpair
3215    in the killtcp tree.
3216    this key is used to insert and lookup matching socketpairs that are
3217    to be tickled and RST
3218 */
3219 #define KILLTCP_KEYLEN  10
3220 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3221 {
3222         static uint32_t key[KILLTCP_KEYLEN];
3223
3224         bzero(key, sizeof(key));
3225
3226         if (src->sa.sa_family != dst->sa.sa_family) {
3227                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3228                 return key;
3229         }
3230         
3231         switch (src->sa.sa_family) {
3232         case AF_INET:
3233                 key[0]  = dst->ip.sin_addr.s_addr;
3234                 key[1]  = src->ip.sin_addr.s_addr;
3235                 key[2]  = dst->ip.sin_port;
3236                 key[3]  = src->ip.sin_port;
3237                 break;
3238         case AF_INET6: {
3239                 uint32_t *dst6_addr32 =
3240                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3241                 uint32_t *src6_addr32 =
3242                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3243                 key[0]  = dst6_addr32[3];
3244                 key[1]  = src6_addr32[3];
3245                 key[2]  = dst6_addr32[2];
3246                 key[3]  = src6_addr32[2];
3247                 key[4]  = dst6_addr32[1];
3248                 key[5]  = src6_addr32[1];
3249                 key[6]  = dst6_addr32[0];
3250                 key[7]  = src6_addr32[0];
3251                 key[8]  = dst->ip6.sin6_port;
3252                 key[9]  = src->ip6.sin6_port;
3253                 break;
3254         }
3255         default:
3256                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3257                 return key;
3258         }
3259
3260         return key;
3261 }
3262
3263 /*
3264   called when we get a read event on the raw socket
3265  */
3266 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3267                                 uint16_t flags, void *private_data)
3268 {
3269         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3270         struct ctdb_killtcp_con *con;
3271         ctdb_sock_addr src, dst;
3272         uint32_t ack_seq, seq;
3273
3274         if (!(flags & EVENT_FD_READ)) {
3275                 return;
3276         }
3277
3278         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3279                                 killtcp->private_data,
3280                                 &src, &dst,
3281                                 &ack_seq, &seq) != 0) {
3282                 /* probably a non-tcp ACK packet */
3283                 return;
3284         }
3285
3286         /* check if we have this guy in our list of connections
3287            to kill
3288         */
3289         con = trbt_lookuparray32(killtcp->connections, 
3290                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3291         if (con == NULL) {
3292                 /* no this was some other packet we can just ignore */
3293                 return;
3294         }
3295
3296         /* This one has been tickled !
3297            now reset him and remove him from the list.
3298          */
3299         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3300                 ntohs(con->dst_addr.ip.sin_port),
3301                 ctdb_addr_to_str(&con->src_addr),
3302                 ntohs(con->src_addr.ip.sin_port)));
3303
3304         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3305         talloc_free(con);
3306 }
3307
3308
3309 /* when traversing the list of all tcp connections to send tickle acks to
3310    (so that we can capture the ack coming back and kill the connection
3311     by a RST)
3312    this callback is called for each connection we are currently trying to kill
3313 */
3314 static int tickle_connection_traverse(void *param, void *data)
3315 {
3316         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3317
3318         /* have tried too many times, just give up */
3319         if (con->count >= 5) {
3320                 /* can't delete in traverse: reparent to delete_cons */
3321                 talloc_steal(param, con);
3322                 return 0;
3323         }
3324
3325         /* othervise, try tickling it again */
3326         con->count++;
3327         ctdb_sys_send_tcp(
3328                 (ctdb_sock_addr *)&con->dst_addr,
3329                 (ctdb_sock_addr *)&con->src_addr,
3330                 0, 0, 0);
3331         return 0;
3332 }
3333
3334
3335 /* 
3336    called every second until all sentenced connections have been reset
3337  */
3338 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3339                                               struct timeval t, void *private_data)
3340 {
3341         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3342         void *delete_cons = talloc_new(NULL);
3343
3344         /* loop over all connections sending tickle ACKs */
3345         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3346
3347         /* now we've finished traverse, it's safe to do deletion. */
3348         talloc_free(delete_cons);
3349
3350         /* If there are no more connections to kill we can remove the
3351            entire killtcp structure
3352          */
3353         if ( (killtcp->connections == NULL) || 
3354              (killtcp->connections->root == NULL) ) {
3355                 talloc_free(killtcp);
3356                 return;
3357         }
3358
3359         /* try tickling them again in a seconds time
3360          */
3361         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3362                         ctdb_tickle_sentenced_connections, killtcp);
3363 }
3364
3365 /*
3366   destroy the killtcp structure
3367  */
3368 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3369 {
3370         struct ctdb_vnn *tmpvnn;
3371
3372         /* verify that this vnn is still active */
3373         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3374                 if (tmpvnn == killtcp->vnn) {
3375                         break;
3376                 }
3377         }
3378
3379         if (tmpvnn == NULL) {
3380                 return 0;
3381         }
3382
3383         if (killtcp->vnn->killtcp != killtcp) {
3384                 return 0;
3385         }
3386
3387         killtcp->vnn->killtcp = NULL;
3388
3389         return 0;
3390 }
3391
3392
/* insert callback for the killtcp tree: unconditionally replace any
   existing connection structure with the new one by returning the new
   data (parm) and ignoring the old data.

   the old entry is deliberately not freed here; per the original note
   it is talloc_stolen by the same tree node and goes away when the new
   data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;
        return replacement;
}
3404
3405 /*
3406   add a tcp socket to the list of connections we want to RST
3407  */
3408 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3409                                        ctdb_sock_addr *s,
3410                                        ctdb_sock_addr *d)
3411 {
3412         ctdb_sock_addr src, dst;
3413         struct ctdb_kill_tcp *killtcp;
3414         struct ctdb_killtcp_con *con;
3415         struct ctdb_vnn *vnn;
3416
3417         ctdb_canonicalize_ip(s, &src);
3418         ctdb_canonicalize_ip(d, &dst);
3419
3420         vnn = find_public_ip_vnn(ctdb, &dst);
3421         if (vnn == NULL) {
3422                 vnn = find_public_ip_vnn(ctdb, &src);
3423         }
3424         if (vnn == NULL) {
3425                 /* if it is not a public ip   it could be our 'single ip' */
3426                 if (ctdb->single_ip_vnn) {
3427                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3428                                 vnn = ctdb->single_ip_vnn;
3429                         }
3430                 }
3431         }
3432         if (vnn == NULL) {
3433                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3434                 return -1;
3435         }
3436
3437         killtcp = vnn->killtcp;
3438         
3439         /* If this is the first connection to kill we must allocate
3440            a new structure
3441          */
3442         if (killtcp == NULL) {
3443                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3444                 CTDB_NO_MEMORY(ctdb, killtcp);
3445
3446                 killtcp->vnn         = vnn;
3447                 killtcp->ctdb        = ctdb;
3448                 killtcp->capture_fd  = -1;
3449                 killtcp->connections = trbt_create(killtcp, 0);
3450
3451                 vnn->killtcp         = killtcp;
3452                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3453         }
3454
3455
3456
3457         /* create a structure that describes this connection we want to
3458            RST and store it in killtcp->connections
3459         */
3460         con = talloc(killtcp, struct ctdb_killtcp_con);
3461         CTDB_NO_MEMORY(ctdb, con);
3462         con->src_addr = src;
3463         con->dst_addr = dst;
3464         con->count    = 0;
3465         con->killtcp  = killtcp;
3466
3467
3468         trbt_insertarray32_callback(killtcp->connections,
3469                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3470                         add_killtcp_callback, con);
3471
3472         /* 
3473            If we dont have a socket to listen on yet we must create it
3474          */
3475         if (killtcp->capture_fd == -1) {
3476                 const char *iface = ctdb_vnn_iface_string(vnn);
3477                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3478                 if (killtcp->capture_fd == -1) {
3479                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3480                                           "socket on iface '%s' for killtcp (%s)\n",
3481                                           iface, strerror(errno)));
3482                         goto failed;
3483                 }
3484         }
3485
3486
3487         if (killtcp->fde == NULL) {
3488                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3489                                             EVENT_FD_READ,
3490                                             capture_tcp_handler, killtcp);
3491                 tevent_fd_set_auto_close(killtcp->fde);
3492
3493                 /* We also need to set up some events to tickle all these connections
3494                    until they are all reset
3495                 */
3496                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3497                                 ctdb_tickle_sentenced_connections, killtcp);
3498         }
3499
3500         /* tickle him once now */
3501         ctdb_sys_send_tcp(
3502                 &con->dst_addr,
3503                 &con->src_addr,
3504                 0, 0, 0);
3505
3506         return 0;
3507
3508 failed:
3509         talloc_free(vnn->killtcp);
3510         vnn->killtcp = NULL;
3511         return -1;
3512 }
3513
3514 /*
3515   kill a TCP connection.
3516  */
3517 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3518 {
3519         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3520
3521         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3522 }
3523
3524 /*
3525   called by a daemon to inform us of the entire list of TCP tickles for
3526   a particular public address.
3527   this control should only be sent by the node that is currently serving
3528   that public address.
3529  */
3530 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3531 {
3532         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3533         struct ctdb_tcp_array *tcparray;
3534         struct ctdb_vnn *vnn;
3535
3536         /* We must at least have tickles.num or else we cant verify the size
3537            of the received data blob
3538          */
3539         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3540                                         tickles.connections)) {
3541                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3542                 return -1;
3543         }
3544
3545         /* verify that the size of data matches what we expect */
3546         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3547                                 tickles.connections)
3548                          + sizeof(struct ctdb_tcp_connection)
3549                                  * list->tickles.num) {
3550                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3551                 return -1;
3552         }       
3553
3554         vnn = find_public_ip_vnn(ctdb, &list->addr);
3555         if (vnn == NULL) {
3556                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3557                         ctdb_addr_to_str(&list->addr)));
3558
3559                 return 1;
3560         }
3561
3562         /* remove any old ticklelist we might have */
3563         talloc_free(vnn->tcp_array);
3564         vnn->tcp_array = NULL;
3565
3566         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3567         CTDB_NO_MEMORY(ctdb, tcparray);
3568
3569         tcparray->num = list->tickles.num;
3570
3571         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3572         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3573
3574         memcpy(tcparray->connections, &list->tickles.connections[0], 
3575                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3576
3577         /* We now have a new fresh tickle list array for this vnn */
3578         vnn->tcp_array = talloc_steal(vnn, tcparray);
3579         
3580         return 0;
3581 }
3582
3583 /*
3584   called to return the full list of tickles for the puclic address associated 
3585   with the provided vnn
3586  */
3587 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3588 {
3589         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3590         struct ctdb_control_tcp_tickle_list *list;
3591         struct ctdb_tcp_array *tcparray;
3592         int num;
3593         struct ctdb_vnn *vnn;
3594
3595         vnn = find_public_ip_vnn(ctdb, addr);
3596         if (vnn == NULL) {
3597                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3598                         ctdb_addr_to_str(addr)));
3599
3600                 return 1;
3601         }
3602
3603         tcparray = vnn->tcp_array;
3604         if (tcparray) {
3605                 num = tcparray->num;
3606         } else {
3607                 num = 0;
3608         }
3609
3610         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3611                                 tickles.connections)
3612                         + sizeof(struct ctdb_tcp_connection) * num;
3613
3614         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3615         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3616         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3617
3618         list->addr = *addr;
3619         list->tickles.num = num;
3620         if (num) {
3621                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3622                         sizeof(struct ctdb_tcp_connection) * num);
3623         }
3624
3625         return 0;
3626 }
3627
3628
3629 /*
3630   set the list of all tcp tickles for a public address
3631  */
3632 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3633                               struct timeval timeout, uint32_t destnode, 
3634                               ctdb_sock_addr *addr,
3635                               struct ctdb_tcp_array *tcparray)
3636 {
3637         int ret, num;
3638         TDB_DATA data;
3639         struct ctdb_control_tcp_tickle_list *list;
3640
3641         if (tcparray) {
3642                 num = tcparray->num;
3643         } else {
3644                 num = 0;
3645         }
3646
3647         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3648                                 tickles.connections) +
3649                         sizeof(struct ctdb_tcp_connection) * num;
3650         data.dptr = talloc_size(ctdb, data.dsize);
3651         CTDB_NO_MEMORY(ctdb, data.dptr);
3652
3653         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3654         list->addr = *addr;
3655         list->tickles.num = num;
3656         if (tcparray) {
3657                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3658         }
3659
3660         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3661                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3662                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3663         if (ret != 0) {
3664                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3665                 return -1;
3666         }
3667
3668         talloc_free(data.dptr);
3669
3670         return ret;
3671 }
3672
3673
3674 /*
3675   perform tickle updates if required
3676  */
3677 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3678                                 struct timed_event *te, 
3679                                 struct timeval t, void *private_data)
3680 {
3681         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3682         int ret;
3683         struct ctdb_vnn *vnn;
3684
3685         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3686                 /* we only send out updates for public addresses that 
3687                    we have taken over
3688                  */
3689                 if (ctdb->pnn != vnn->pnn) {
3690                         continue;
3691                 }
3692                 /* We only send out the updates if we need to */
3693                 if (!vnn->tcp_update_needed) {
3694                         continue;
3695                 }
3696                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3697                                 TAKEOVER_TIMEOUT(),
3698                                 CTDB_BROADCAST_CONNECTED,
3699                                 &vnn->public_address,
3700                                 vnn->tcp_array);
3701                 if (ret != 0) {
3702                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3703                                 ctdb_addr_to_str(&vnn->public_address)));
3704                 }
3705         }
3706
3707         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3708                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3709                              ctdb_update_tcp_tickles, ctdb);
3710 }               
3711         
3712
3713 /*
3714   start periodic update of tcp tickles
3715  */
3716 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3717 {
3718         ctdb->tickle_update_context = talloc_new(ctdb);
3719
3720         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3721                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3722                              ctdb_update_tcp_tickles, ctdb);
3723 }
3724
3725
3726
3727
3728 struct control_gratious_arp {
3729         struct ctdb_context *ctdb;
3730         ctdb_sock_addr addr;
3731         const char *iface;
3732         int count;
3733 };
3734
3735 /*
3736   send a control_gratuitous arp
3737  */
3738 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3739                                   struct timeval t, void *private_data)
3740 {
3741         int ret;
3742         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3743                                                         struct control_gratious_arp);
3744
3745         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3746         if (ret != 0) {
3747                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3748                                  arp->iface, strerror(errno)));
3749         }
3750
3751
3752         arp->count++;
3753         if (arp->count == CTDB_ARP_REPEAT) {
3754                 talloc_free(arp);
3755                 return;
3756         }
3757
3758         event_add_timed(arp->ctdb->ev, arp, 
3759                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3760                         send_gratious_arp, arp);
3761 }
3762
3763
3764 /*
3765   send a gratious arp 
3766  */
3767 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3768 {
3769         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3770         struct control_gratious_arp *arp;
3771
3772         /* verify the size of indata */
3773         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3774                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3775                                  (unsigned)indata.dsize, 
3776                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3777                 return -1;
3778         }
3779         if (indata.dsize != 
3780                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3781                 + gratious_arp->len ) ){
3782
3783                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3784                         "but should be %u bytes\n", 
3785                          (unsigned)indata.dsize, 
3786                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3787                 return -1;
3788         }
3789
3790
3791         arp = talloc(ctdb, struct control_gratious_arp);
3792         CTDB_NO_MEMORY(ctdb, arp);
3793
3794         arp->ctdb  = ctdb;
3795         arp->addr   = gratious_arp->addr;
3796         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3797         CTDB_NO_MEMORY(ctdb, arp->iface);
3798         arp->count = 0;
3799         
3800         event_add_timed(arp->ctdb->ev, arp, 
3801                         timeval_zero(), send_gratious_arp, arp);
3802
3803         return 0;
3804 }
3805
3806 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3807 {
3808         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3809         int ret;
3810
3811         /* verify the size of indata */
3812         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3813                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3814                 return -1;
3815         }
3816         if (indata.dsize != 
3817                 ( offsetof(struct ctdb_control_ip_iface, iface)
3818                 + pub->len ) ){
3819
3820                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3821                         "but should be %u bytes\n", 
3822                          (unsigned)indata.dsize, 
3823                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3824                 return -1;
3825         }
3826
3827         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3828
3829         if (ret != 0) {
3830                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3831                 return -1;
3832         }
3833
3834         return 0;
3835 }
3836
3837 /*
3838   called when releaseip event finishes for del_public_address
3839  */
3840 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
3841                                 void *private_data)
3842 {
3843         talloc_free(private_data);
3844 }
3845
3846 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3847 {
3848         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3849         struct ctdb_vnn *vnn;
3850         int ret;
3851
3852         /* verify the size of indata */
3853         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3854                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3855                 return -1;
3856         }
3857         if (indata.dsize != 
3858                 ( offsetof(struct ctdb_control_ip_iface, iface)
3859                 + pub->len ) ){
3860
3861                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3862                         "but should be %u bytes\n", 
3863                          (unsigned)indata.dsize, 
3864                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3865                 return -1;
3866         }
3867
3868         /* walk over all public addresses until we find a match */
3869         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3870                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3871                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3872
3873                         DLIST_REMOVE(ctdb->vnn, vnn);
3874                         talloc_steal(mem_ctx, vnn);
3875                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3876                         if (vnn->pnn != ctdb->pnn) {
3877                                 if (vnn->iface != NULL) {
3878                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3879                                 }
3880                                 talloc_free(mem_ctx);
3881                                 return 0;
3882                         }
3883                         vnn->pnn = -1;
3884
3885                         ret = ctdb_event_script_callback(ctdb, 
3886                                          mem_ctx, delete_ip_callback, mem_ctx,
3887                                          false,
3888                                          CTDB_EVENT_RELEASE_IP,
3889                                          "%s %s %u",
3890                                          ctdb_vnn_iface_string(vnn),
3891                                          ctdb_addr_to_str(&vnn->public_address),
3892                                          vnn->public_netmask_bits);
3893                         if (vnn->iface != NULL) {
3894                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3895                         }
3896                         if (ret != 0) {
3897                                 return -1;
3898                         }
3899                         return 0;
3900                 }
3901         }
3902
3903         return -1;
3904 }
3905
3906
/*
  state carried from ctdb_control_ipreallocated() to
  ctdb_ipreallocated_callback()
 */
struct ipreallocated_callback_state {
        /* the control request that gets the deferred reply */
        struct ctdb_req_control *c;
};
3910
3911 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3912                                         int status, void *p)
3913 {
3914         struct ipreallocated_callback_state *state =
3915                 talloc_get_type(p, struct ipreallocated_callback_state);
3916
3917         if (status != 0) {
3918                 DEBUG(DEBUG_ERR,
3919                       (" \"ipreallocated\" event script failed (status %d)\n",
3920                        status));
3921                 if (status == -ETIME) {
3922                         ctdb_ban_self(ctdb);
3923                 }
3924         }
3925
3926         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3927         talloc_free(state);
3928 }
3929
3930 /* A control to run the ipreallocated event */
3931 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3932                                    struct ctdb_req_control *c,
3933                                    bool *async_reply)
3934 {
3935         int ret;
3936         struct ipreallocated_callback_state *state;
3937
3938         state = talloc(ctdb, struct ipreallocated_callback_state);
3939         CTDB_NO_MEMORY(ctdb, state);
3940
3941         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3942
3943         ret = ctdb_event_script_callback(ctdb, state,
3944                                          ctdb_ipreallocated_callback, state,
3945                                          false, CTDB_EVENT_IPREALLOCATED,
3946                                          "%s", "");
3947
3948         if (ret != 0) {
3949                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3950                 talloc_free(state);
3951                 return -1;
3952         }
3953
3954         /* tell the control that we will be reply asynchronously */
3955         state->c    = talloc_steal(state, c);
3956         *async_reply = true;
3957
3958         return 0;
3959 }
3960
3961
3962 /* This function is called from the recovery daemon to verify that a remote
3963    node has the expected ip allocation.
3964    This is verified against ctdb->ip_tree
3965 */
3966 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3967 {
3968         struct ctdb_public_ip_list *tmp_ip; 
3969         int i;
3970
3971         if (ctdb->ip_tree == NULL) {
3972                 /* dont know the expected allocation yet, assume remote node
3973                    is correct. */
3974                 return 0;
3975         }
3976
3977         if (ips == NULL) {
3978                 return 0;
3979         }
3980
3981         for (i=0; i<ips->num; i++) {
3982                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3983                 if (tmp_ip == NULL) {
3984                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3985                         return -1;
3986                 }
3987
3988                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3989                         continue;
3990                 }
3991
3992                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3993                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3994                         return -1;
3995                 }
3996         }
3997
3998         return 0;
3999 }
4000
4001 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4002 {
4003         struct ctdb_public_ip_list *tmp_ip; 
4004
4005         if (ctdb->ip_tree == NULL) {
4006                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4007                 return -1;
4008         }
4009
4010         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4011         if (tmp_ip == NULL) {
4012                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4013                 return -1;
4014         }
4015
4016         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4017         tmp_ip->pnn = ip->pnn;
4018
4019         return 0;
4020 }
4021
4022
/* State for one in-progress "reload public ips" control. */
struct ctdb_reloadips_handle {
        /* daemon context this handle belongs to */
        struct ctdb_context *ctdb;
        /* control request still to be answered, NULL when none is pending */
        struct ctdb_req_control *c;
        /* status sent in the reply; updated from the child's result byte */
        int status;
        /* pipe to the child: fd[0] is read by the parent, fd[1] is
           written by the child */
        int fd[2];
        /* pid of the forked child; the destructor sends it SIGKILL */
        pid_t child;
        /* read event registered on fd[0] */
        struct fd_event *fde;
};
4031
4032 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4033 {
4034         if (h == h->ctdb->reload_ips) {
4035                 h->ctdb->reload_ips = NULL;
4036         }
4037         if (h->c != NULL) {
4038                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4039                 h->c = NULL;
4040         }
4041         ctdb_kill(h->ctdb, h->child, SIGKILL);
4042         return 0;
4043 }
4044
4045 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4046                                 struct timed_event *te,
4047                                 struct timeval t, void *private_data)
4048 {
4049         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4050
4051         talloc_free(h);
4052 }       
4053
4054 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4055                              uint16_t flags, void *private_data)
4056 {
4057         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4058
4059         char res;
4060         int ret;
4061
4062         ret = read(h->fd[0], &res, 1);
4063         if (ret < 1 || res != 0) {
4064                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4065                 res = 1;
4066         }
4067         h->status = res;
4068
4069         talloc_free(h);
4070 }
4071
4072 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4073 {
4074         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4075         struct ctdb_all_public_ips *ips;
4076         struct ctdb_vnn *vnn;
4077         int i, ret;
4078
4079         /* read the ip allocation from the local node */
4080         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4081         if (ret != 0) {
4082                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4083                 talloc_free(mem_ctx);
4084                 return -1;
4085         }
4086
4087         /* re-read the public ips file */
4088         ctdb->vnn = NULL;
4089         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4090                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4091                 talloc_free(mem_ctx);
4092                 return -1;
4093         }               
4094
4095
4096         /* check the previous list of ips and scan for ips that have been
4097            dropped.
4098          */
4099         for (i = 0; i < ips->num; i++) {
4100                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4101                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4102                                 break;
4103                         }
4104                 }
4105
4106                 /* we need to delete this ip, no longer available on this node */
4107                 if (vnn == NULL) {
4108                         struct ctdb_control_ip_iface pub;
4109
4110                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4111                         pub.addr  = ips->ips[i].addr;
4112                         pub.mask  = 0;
4113                         pub.len   = 0;
4114
4115                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4116                         if (ret != 0) {
4117                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4118                                 return -1;
4119                         }
4120                 }
4121         }
4122
4123
4124         /* loop over all new ones and check the ones we need to add */
4125         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4126                 for (i = 0; i < ips->num; i++) {
4127                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4128                                 break;
4129                         }
4130                 }
4131                 if (i == ips->num) {
4132                         struct ctdb_control_ip_iface pub;
4133                         const char *ifaces = NULL;
4134                         int iface = 0;
4135
4136                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4137
4138                         pub.addr  = vnn->public_address;
4139                         pub.mask  = vnn->public_netmask_bits;
4140
4141
4142                         ifaces = vnn->ifaces[0];
4143                         iface = 1;
4144                         while (vnn->ifaces[iface] != NULL) {
4145                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4146                                 iface++;
4147                         }
4148                         pub.len   = strlen(ifaces)+1;
4149                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4150
4151                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4152                         if (ret != 0) {
4153                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4154                                 return -1;
4155                         }
4156                 }
4157         }
4158
4159         return 0;
4160 }
4161
4162 /* This control is sent to force the node to re-read the public addresses file
4163    and drop any addresses we should nnot longer host, and add new addresses
4164    that we are now able to host
4165 */
4166 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4167 {
4168         struct ctdb_reloadips_handle *h;
4169         pid_t parent = getpid();
4170
4171         if (ctdb->reload_ips != NULL) {
4172                 talloc_free(ctdb->reload_ips);
4173                 ctdb->reload_ips = NULL;
4174         }
4175
4176         h = talloc(ctdb, struct ctdb_reloadips_handle);
4177         CTDB_NO_MEMORY(ctdb, h);
4178         h->ctdb     = ctdb;
4179         h->c        = NULL;
4180         h->status   = -1;
4181         
4182         if (pipe(h->fd) == -1) {
4183                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4184                 talloc_free(h);
4185                 return -1;
4186         }
4187
4188         h->child = ctdb_fork(ctdb);
4189         if (h->child == (pid_t)-1) {
4190                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4191                 close(h->fd[0]);
4192                 close(h->fd[1]);
4193                 talloc_free(h);
4194                 return -1;
4195         }
4196
4197         /* child process */
4198         if (h->child == 0) {
4199                 signed char res = 0;
4200
4201                 close(h->fd[0]);
4202                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4203
4204                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4205                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4206                         res = -1;
4207                 } else {
4208                         res = ctdb_reloadips_child(ctdb);
4209                         if (res != 0) {
4210                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4211                         }
4212                 }
4213
4214                 write(h->fd[1], &res, 1);
4215                 /* make sure we die when our parent dies */
4216                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4217                         sleep(5);
4218                 }
4219                 _exit(0);
4220         }
4221
4222         h->c             = talloc_steal(h, c);
4223
4224         close(h->fd[1]);
4225         set_close_on_exec(h->fd[0]);
4226
4227         talloc_set_destructor(h, ctdb_reloadips_destructor);
4228
4229
4230         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4231                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4232                         (void *)h);
4233         tevent_fd_set_auto_close(h->fde);
4234
4235         event_add_timed(ctdb->ev, h,
4236                         timeval_current_ofs(120, 0),
4237                         ctdb_reloadips_timeout_event, h);
4238
4239         /* we reply later */
4240         *async_reply = true;
4241         return 0;
4242 }