recoverd: Move IP flags into ctdb_takeover.c
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout built from the takeover_timeout tunable.  Expands to an
 * expression that references a variable named "ctdb", so one must be
 * in scope wherever this is used.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* Gratuitous ARP schedule: CTDB_ARP_INTERVAL is the seconds part of
 * the delay between rounds, CTDB_ARP_REPEAT the number of rounds sent
 * before the announcement state is freed.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3

/* These flags are ONLY valid within IP allocation code and must be
 * cleared to avoid confusing other recovery daemon functions
 */
#define NODE_FLAGS_NOIPTAKEOVER         0x01000000 /* can not takeover additional IPs */
#define NODE_FLAGS_NOIPHOST             0x02000000 /* can not host IPs */
41
/*
  a local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
        /* linkage in the ctdb->ifaces list */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name; lookups compare it with strcmp() */
        const char *name;
        /* only interfaces with link_up set are picked to host an IP */
        bool link_up;
        /* number of vnns currently assigned to this interface */
        uint32_t references;
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = ctdb->done_startup;
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn,
122                                         TALLOC_CTX *mem_ctx)
123 {
124         struct ctdb_iface *i;
125
126         /* For each interface, check if there's an IP using it. */
127         for(i=ctdb->ifaces; i; i=i->next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         /* Caller will free mem_ctx when convenient. */
156                         talloc_steal(mem_ctx, i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         /* Verify that we dont have an entry for this ip yet */
168         for (i=ctdb->ifaces;i;i=i->next) {
169                 if (strcmp(i->name, iface) == 0) {
170                         return i;
171                 }
172         }
173
174         return NULL;
175 }
176
177 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178                                               struct ctdb_vnn *vnn)
179 {
180         int i;
181         struct ctdb_iface *cur = NULL;
182         struct ctdb_iface *best = NULL;
183
184         for (i=0; vnn->ifaces[i]; i++) {
185
186                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
187                 if (cur == NULL) {
188                         continue;
189                 }
190
191                 if (!cur->link_up) {
192                         continue;
193                 }
194
195                 if (best == NULL) {
196                         best = cur;
197                         continue;
198                 }
199
200                 if (cur->references < best->references) {
201                         best = cur;
202                         continue;
203                 }
204         }
205
206         return best;
207 }
208
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210                                      struct ctdb_vnn *vnn)
211 {
212         struct ctdb_iface *best = NULL;
213
214         if (vnn->iface) {
215                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216                                    "still assigned to iface '%s'\n",
217                                    ctdb_addr_to_str(&vnn->public_address),
218                                    ctdb_vnn_iface_string(vnn)));
219                 return 0;
220         }
221
222         best = ctdb_vnn_best_iface(ctdb, vnn);
223         if (best == NULL) {
224                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225                                   "cannot assign to iface any iface\n",
226                                   ctdb_addr_to_str(&vnn->public_address)));
227                 return -1;
228         }
229
230         vnn->iface = best;
231         best->references++;
232         vnn->pnn = ctdb->pnn;
233
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now assigned to iface '%s' refs[%d]\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            best->references));
239         return 0;
240 }
241
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243                                     struct ctdb_vnn *vnn)
244 {
245         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246                            "now unassigned (old iface '%s' refs[%d])\n",
247                            ctdb_addr_to_str(&vnn->public_address),
248                            ctdb_vnn_iface_string(vnn),
249                            vnn->iface?vnn->iface->references:0));
250         if (vnn->iface) {
251                 vnn->iface->references--;
252         }
253         vnn->iface = NULL;
254         if (vnn->pnn == ctdb->pnn) {
255                 vnn->pnn = -1;
256         }
257 }
258
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260                                struct ctdb_vnn *vnn)
261 {
262         int i;
263
264         if (vnn->iface && vnn->iface->link_up) {
265                 return true;
266         }
267
268         for (i=0; vnn->ifaces[i]; i++) {
269                 struct ctdb_iface *cur;
270
271                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
272                 if (cur == NULL) {
273                         continue;
274                 }
275
276                 if (cur->link_up) {
277                         return true;
278                 }
279         }
280
281         return false;
282 }
283
284 struct ctdb_takeover_arp {
285         struct ctdb_context *ctdb;
286         uint32_t count;
287         ctdb_sock_addr addr;
288         struct ctdb_tcp_array *tcparray;
289         struct ctdb_vnn *vnn;
290 };
291
292
293 /*
294   lists of tcp endpoints
295  */
296 struct ctdb_tcp_list {
297         struct ctdb_tcp_list *prev, *next;
298         struct ctdb_tcp_connection connection;
299 };
300
301 /*
302   list of clients to kill on IP release
303  */
304 struct ctdb_client_ip {
305         struct ctdb_client_ip *prev, *next;
306         struct ctdb_context *ctdb;
307         ctdb_sock_addr addr;
308         uint32_t client_id;
309 };
310
311
312 /*
313   send a gratuitous arp
314  */
315 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
316                                   struct timeval t, void *private_data)
317 {
318         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
319                                                         struct ctdb_takeover_arp);
320         int i, ret;
321         struct ctdb_tcp_array *tcparray;
322         const char *iface = ctdb_vnn_iface_string(arp->vnn);
323
324         ret = ctdb_sys_send_arp(&arp->addr, iface);
325         if (ret != 0) {
326                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
327                                   iface, strerror(errno)));
328         }
329
330         tcparray = arp->tcparray;
331         if (tcparray) {
332                 for (i=0;i<tcparray->num;i++) {
333                         struct ctdb_tcp_connection *tcon;
334
335                         tcon = &tcparray->connections[i];
336                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
337                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
338                                 ctdb_addr_to_str(&tcon->src_addr),
339                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
340                         ret = ctdb_sys_send_tcp(
341                                 &tcon->src_addr, 
342                                 &tcon->dst_addr,
343                                 0, 0, 0);
344                         if (ret != 0) {
345                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
346                                         ctdb_addr_to_str(&tcon->src_addr)));
347                         }
348                 }
349         }
350
351         arp->count++;
352
353         if (arp->count == CTDB_ARP_REPEAT) {
354                 talloc_free(arp);
355                 return;
356         }
357
358         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
359                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
360                         ctdb_control_send_arp, arp);
361 }
362
363 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
364                                        struct ctdb_vnn *vnn)
365 {
366         struct ctdb_takeover_arp *arp;
367         struct ctdb_tcp_array *tcparray;
368
369         if (!vnn->takeover_ctx) {
370                 vnn->takeover_ctx = talloc_new(vnn);
371                 if (!vnn->takeover_ctx) {
372                         return -1;
373                 }
374         }
375
376         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
377         if (!arp) {
378                 return -1;
379         }
380
381         arp->ctdb = ctdb;
382         arp->addr = vnn->public_address;
383         arp->vnn  = vnn;
384
385         tcparray = vnn->tcp_array;
386         if (tcparray) {
387                 /* add all of the known tcp connections for this IP to the
388                    list of tcp connections to send tickle acks for */
389                 arp->tcparray = talloc_steal(arp, tcparray);
390
391                 vnn->tcp_array = NULL;
392                 vnn->tcp_update_needed = true;
393         }
394
395         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
396                         timeval_zero(), ctdb_control_send_arp, arp);
397
398         return 0;
399 }
400
401 struct takeover_callback_state {
402         struct ctdb_req_control *c;
403         ctdb_sock_addr *addr;
404         struct ctdb_vnn *vnn;
405 };
406
/*
  state of a takeip event that is in progress
 */
struct ctdb_do_takeip_state {
        /* the control request to reply to once the event has finished */
        struct ctdb_req_control *c;
        /* the public address being taken over */
        struct ctdb_vnn *vnn;
};
411
412 /*
413   called when takeip event finishes
414  */
415 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
416                                     void *private_data)
417 {
418         struct ctdb_do_takeip_state *state =
419                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
420         int32_t ret;
421         TDB_DATA data;
422
423         if (status != 0) {
424                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
425         
426                 if (status == -ETIME) {
427                         ctdb_ban_self(ctdb);
428                 }
429                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
430                                  ctdb_addr_to_str(&state->vnn->public_address),
431                                  ctdb_vnn_iface_string(state->vnn)));
432                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
433
434                 node->flags |= NODE_FLAGS_UNHEALTHY;
435                 talloc_free(state);
436                 return;
437         }
438
439         if (ctdb->do_checkpublicip) {
440
441         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
442         if (ret != 0) {
443                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
444                 talloc_free(state);
445                 return;
446         }
447
448         }
449
450         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
451         data.dsize = strlen((char *)data.dptr) + 1;
452         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
453
454         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
455
456
457         /* the control succeeded */
458         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
459         talloc_free(state);
460         return;
461 }
462
463 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
464 {
465         state->vnn->update_in_flight = false;
466         return 0;
467 }
468
469 /*
470   take over an ip address
471  */
472 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
473                               struct ctdb_req_control *c,
474                               struct ctdb_vnn *vnn)
475 {
476         int ret;
477         struct ctdb_do_takeip_state *state;
478
479         if (vnn->update_in_flight) {
480                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
481                                     "update for this IP already in flight\n",
482                                     ctdb_addr_to_str(&vnn->public_address),
483                                     vnn->public_netmask_bits));
484                 return -1;
485         }
486
487         ret = ctdb_vnn_assign_iface(ctdb, vnn);
488         if (ret != 0) {
489                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
490                                  "assign a usable interface\n",
491                                  ctdb_addr_to_str(&vnn->public_address),
492                                  vnn->public_netmask_bits));
493                 return -1;
494         }
495
496         state = talloc(vnn, struct ctdb_do_takeip_state);
497         CTDB_NO_MEMORY(ctdb, state);
498
499         state->c = talloc_steal(ctdb, c);
500         state->vnn   = vnn;
501
502         vnn->update_in_flight = true;
503         talloc_set_destructor(state, ctdb_takeip_destructor);
504
505         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
506                             ctdb_addr_to_str(&vnn->public_address),
507                             vnn->public_netmask_bits,
508                             ctdb_vnn_iface_string(vnn)));
509
510         ret = ctdb_event_script_callback(ctdb,
511                                          state,
512                                          ctdb_do_takeip_callback,
513                                          state,
514                                          false,
515                                          CTDB_EVENT_TAKE_IP,
516                                          "%s %s %u",
517                                          ctdb_vnn_iface_string(vnn),
518                                          ctdb_addr_to_str(&vnn->public_address),
519                                          vnn->public_netmask_bits);
520
521         if (ret != 0) {
522                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
523                         ctdb_addr_to_str(&vnn->public_address),
524                         ctdb_vnn_iface_string(vnn)));
525                 talloc_free(state);
526                 return -1;
527         }
528
529         return 0;
530 }
531
/*
  state of an updateip event that is in progress
 */
struct ctdb_do_updateip_state {
        /* the control request to reply to once the event has finished */
        struct ctdb_req_control *c;
        /* the interface the address is being moved away from */
        struct ctdb_iface *old;
        /* the public address being moved */
        struct ctdb_vnn *vnn;
};
537
538 /*
539   called when updateip event finishes
540  */
541 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
542                                       void *private_data)
543 {
544         struct ctdb_do_updateip_state *state =
545                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
546         int32_t ret;
547
548         if (status != 0) {
549                 if (status == -ETIME) {
550                         ctdb_ban_self(ctdb);
551                 }
552                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
553                         ctdb_addr_to_str(&state->vnn->public_address),
554                         state->old->name,
555                         ctdb_vnn_iface_string(state->vnn)));
556
557                 /*
558                  * All we can do is reset the old interface
559                  * and let the next run fix it
560                  */
561                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
562                 state->vnn->iface = state->old;
563                 state->vnn->iface->references++;
564
565                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
566                 talloc_free(state);
567                 return;
568         }
569
570         if (ctdb->do_checkpublicip) {
571
572         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
573         if (ret != 0) {
574                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
575                 talloc_free(state);
576                 return;
577         }
578
579         }
580
581         /* the control succeeded */
582         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
583         talloc_free(state);
584         return;
585 }
586
587 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
588 {
589         state->vnn->update_in_flight = false;
590         return 0;
591 }
592
593 /*
594   update (move) an ip address
595  */
596 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
597                                 struct ctdb_req_control *c,
598                                 struct ctdb_vnn *vnn)
599 {
600         int ret;
601         struct ctdb_do_updateip_state *state;
602         struct ctdb_iface *old = vnn->iface;
603         const char *new_name;
604
605         if (vnn->update_in_flight) {
606                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
607                                     "update for this IP already in flight\n",
608                                     ctdb_addr_to_str(&vnn->public_address),
609                                     vnn->public_netmask_bits));
610                 return -1;
611         }
612
613         ctdb_vnn_unassign_iface(ctdb, vnn);
614         ret = ctdb_vnn_assign_iface(ctdb, vnn);
615         if (ret != 0) {
616                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
617                                  "assin a usable interface (old iface '%s')\n",
618                                  ctdb_addr_to_str(&vnn->public_address),
619                                  vnn->public_netmask_bits,
620                                  old->name));
621                 return -1;
622         }
623
624         new_name = ctdb_vnn_iface_string(vnn);
625         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
626                 /* A benign update from one interface onto itself.
627                  * no need to run the eventscripts in this case, just return
628                  * success.
629                  */
630                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
631                 return 0;
632         }
633
634         state = talloc(vnn, struct ctdb_do_updateip_state);
635         CTDB_NO_MEMORY(ctdb, state);
636
637         state->c = talloc_steal(ctdb, c);
638         state->old = old;
639         state->vnn = vnn;
640
641         vnn->update_in_flight = true;
642         talloc_set_destructor(state, ctdb_updateip_destructor);
643
644         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
645                             "interface %s to %s\n",
646                             ctdb_addr_to_str(&vnn->public_address),
647                             vnn->public_netmask_bits,
648                             old->name,
649                             new_name));
650
651         ret = ctdb_event_script_callback(ctdb,
652                                          state,
653                                          ctdb_do_updateip_callback,
654                                          state,
655                                          false,
656                                          CTDB_EVENT_UPDATE_IP,
657                                          "%s %s %s %u",
658                                          state->old->name,
659                                          new_name,
660                                          ctdb_addr_to_str(&vnn->public_address),
661                                          vnn->public_netmask_bits);
662         if (ret != 0) {
663                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
664                                  ctdb_addr_to_str(&vnn->public_address),
665                                  old->name, new_name));
666                 talloc_free(state);
667                 return -1;
668         }
669
670         return 0;
671 }
672
673 /*
674   Find the vnn of the node that has a public ip address
675   returns -1 if the address is not known as a public address
676  */
677 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
678 {
679         struct ctdb_vnn *vnn;
680
681         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
682                 if (ctdb_same_ip(&vnn->public_address, addr)) {
683                         return vnn;
684                 }
685         }
686
687         return NULL;
688 }
689
690 /*
691   take over an ip address
692  */
693 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
694                                  struct ctdb_req_control *c,
695                                  TDB_DATA indata,
696                                  bool *async_reply)
697 {
698         int ret;
699         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
700         struct ctdb_vnn *vnn;
701         bool have_ip = false;
702         bool do_updateip = false;
703         bool do_takeip = false;
704         struct ctdb_iface *best_iface = NULL;
705
706         if (pip->pnn != ctdb->pnn) {
707                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
708                                  "with pnn %d, but we're node %d\n",
709                                  ctdb_addr_to_str(&pip->addr),
710                                  pip->pnn, ctdb->pnn));
711                 return -1;
712         }
713
714         /* update out vnn list */
715         vnn = find_public_ip_vnn(ctdb, &pip->addr);
716         if (vnn == NULL) {
717                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
718                         ctdb_addr_to_str(&pip->addr)));
719                 return 0;
720         }
721
722         if (ctdb->do_checkpublicip) {
723                 have_ip = ctdb_sys_have_ip(&pip->addr);
724         }
725         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
726         if (best_iface == NULL) {
727                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
728                                  "a usable interface (old %s, have_ip %d)\n",
729                                  ctdb_addr_to_str(&vnn->public_address),
730                                  vnn->public_netmask_bits,
731                                  ctdb_vnn_iface_string(vnn),
732                                  have_ip));
733                 return -1;
734         }
735
736         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
737                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
738                 have_ip = false;
739         }
740
741
742         if (vnn->iface == NULL && have_ip) {
743                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
744                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
745                                  ctdb_addr_to_str(&vnn->public_address)));
746                 return 0;
747         }
748
749         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
750                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
751                                   "and we have it on iface[%s], but it was assigned to node %d"
752                                   "and we are node %d, banning ourself\n",
753                                  ctdb_addr_to_str(&vnn->public_address),
754                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
755                 ctdb_ban_self(ctdb);
756                 return -1;
757         }
758
759         if (vnn->pnn == -1 && have_ip) {
760                 vnn->pnn = ctdb->pnn;
761                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
762                                   "and we already have it on iface[%s], update local daemon\n",
763                                  ctdb_addr_to_str(&vnn->public_address),
764                                   ctdb_vnn_iface_string(vnn)));
765                 return 0;
766         }
767
768         if (vnn->iface) {
769                 if (vnn->iface != best_iface) {
770                         if (!vnn->iface->link_up) {
771                                 do_updateip = true;
772                         } else if (vnn->iface->references > (best_iface->references + 1)) {
773                                 /* only move when the rebalance gains something */
774                                         do_updateip = true;
775                         }
776                 }
777         }
778
779         if (!have_ip) {
780                 if (do_updateip) {
781                         ctdb_vnn_unassign_iface(ctdb, vnn);
782                         do_updateip = false;
783                 }
784                 do_takeip = true;
785         }
786
787         if (do_takeip) {
788                 ret = ctdb_do_takeip(ctdb, c, vnn);
789                 if (ret != 0) {
790                         return -1;
791                 }
792         } else if (do_updateip) {
793                 ret = ctdb_do_updateip(ctdb, c, vnn);
794                 if (ret != 0) {
795                         return -1;
796                 }
797         } else {
798                 /*
799                  * The interface is up and the kernel known the ip
800                  * => do nothing
801                  */
802                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits,
805                         ctdb_vnn_iface_string(vnn)));
806                 return 0;
807         }
808
809         /* tell ctdb_control.c that we will be replying asynchronously */
810         *async_reply = true;
811
812         return 0;
813 }
814
815 /*
816   takeover an ip address old v4 style
817  */
818 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
819                                 struct ctdb_req_control *c,
820                                 TDB_DATA indata, 
821                                 bool *async_reply)
822 {
823         TDB_DATA data;
824         
825         data.dsize = sizeof(struct ctdb_public_ip);
826         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
827         CTDB_NO_MEMORY(ctdb, data.dptr);
828         
829         memcpy(data.dptr, indata.dptr, indata.dsize);
830         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
831 }
832
833 /*
834   kill any clients that are registered with a IP that is being released
835  */
836 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
837 {
838         struct ctdb_client_ip *ip;
839
840         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
841                 ctdb_addr_to_str(addr)));
842
843         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
844                 ctdb_sock_addr tmp_addr;
845
846                 tmp_addr = ip->addr;
847                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
848                         ip->client_id,
849                         ctdb_addr_to_str(&ip->addr)));
850
851                 if (ctdb_same_ip(&tmp_addr, addr)) {
852                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
853                                                                      ip->client_id, 
854                                                                      struct ctdb_client);
855                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
856                                 ip->client_id,
857                                 ctdb_addr_to_str(&ip->addr),
858                                 client->pid));
859
860                         if (client->pid != 0) {
861                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
862                                         (unsigned)client->pid,
863                                         ctdb_addr_to_str(addr),
864                                         ip->client_id));
865                                 ctdb_kill(ctdb, client->pid, SIGKILL);
866                         }
867                 }
868         }
869 }
870
871 /*
872   called when releaseip event finishes
873  */
874 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
875                                 void *private_data)
876 {
877         struct takeover_callback_state *state = 
878                 talloc_get_type(private_data, struct takeover_callback_state);
879         TDB_DATA data;
880
881         if (status == -ETIME) {
882                 ctdb_ban_self(ctdb);
883         }
884
885         /* send a message to all clients of this node telling them
886            that the cluster has been reconfigured and they should
887            release any sockets on this IP */
888         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
889         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
890         data.dsize = strlen((char *)data.dptr)+1;
891
892         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
893
894         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
895
896         /* kill clients that have registered with this IP */
897         release_kill_clients(ctdb, state->addr);
898
899         ctdb_vnn_unassign_iface(ctdb, state->vnn);
900
901         /* the control succeeded */
902         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
903         talloc_free(state);
904 }
905
906 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
907 {
908         state->vnn->update_in_flight = false;
909         return 0;
910 }
911
912 /*
913   release an ip address
914  */
915 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
916                                 struct ctdb_req_control *c,
917                                 TDB_DATA indata, 
918                                 bool *async_reply)
919 {
920         int ret;
921         struct takeover_callback_state *state;
922         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
923         struct ctdb_vnn *vnn;
924         char *iface;
925
926         /* update our vnn list */
927         vnn = find_public_ip_vnn(ctdb, &pip->addr);
928         if (vnn == NULL) {
929                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
930                         ctdb_addr_to_str(&pip->addr)));
931                 return 0;
932         }
933         vnn->pnn = pip->pnn;
934
935         /* stop any previous arps */
936         talloc_free(vnn->takeover_ctx);
937         vnn->takeover_ctx = NULL;
938
939         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
940          * lazy multicast to drop an IP from any node that isn't the
941          * intended new node.  The following causes makes ctdbd ignore
942          * a release for any address it doesn't host.
943          */
944         if (ctdb->do_checkpublicip) {
945                 if (!ctdb_sys_have_ip(&pip->addr)) {
946                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
947                                 ctdb_addr_to_str(&pip->addr),
948                                 vnn->public_netmask_bits,
949                                 ctdb_vnn_iface_string(vnn)));
950                         ctdb_vnn_unassign_iface(ctdb, vnn);
951                         return 0;
952                 }
953         } else {
954                 if (vnn->iface == NULL) {
955                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
956                                            ctdb_addr_to_str(&pip->addr),
957                                            vnn->public_netmask_bits));
958                         return 0;
959                 }
960         }
961
962         /* There is a potential race between take_ip and us because we
963          * update the VNN via a callback that run when the
964          * eventscripts have been run.  Avoid the race by allowing one
965          * update to be in flight at a time.
966          */
967         if (vnn->update_in_flight) {
968                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
969                                     "update for this IP already in flight\n",
970                                     ctdb_addr_to_str(&vnn->public_address),
971                                     vnn->public_netmask_bits));
972                 return -1;
973         }
974
975         if (ctdb->do_checkpublicip) {
976                 iface = ctdb_sys_find_ifname(&pip->addr);
977                 if (iface == NULL) {
978                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
979                         return 0;
980                 }
981         } else {
982                 iface = strdup(ctdb_vnn_iface_string(vnn));
983         }
984
985         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
986                 ctdb_addr_to_str(&pip->addr),
987                 vnn->public_netmask_bits,
988                 iface,
989                 pip->pnn));
990
991         state = talloc(ctdb, struct takeover_callback_state);
992         CTDB_NO_MEMORY(ctdb, state);
993
994         state->c = talloc_steal(state, c);
995         state->addr = talloc(state, ctdb_sock_addr);       
996         CTDB_NO_MEMORY(ctdb, state->addr);
997         *state->addr = pip->addr;
998         state->vnn   = vnn;
999
1000         vnn->update_in_flight = true;
1001         talloc_set_destructor(state, ctdb_releaseip_destructor);
1002
1003         ret = ctdb_event_script_callback(ctdb, 
1004                                          state, release_ip_callback, state,
1005                                          false,
1006                                          CTDB_EVENT_RELEASE_IP,
1007                                          "%s %s %u",
1008                                          iface,
1009                                          ctdb_addr_to_str(&pip->addr),
1010                                          vnn->public_netmask_bits);
1011         free(iface);
1012         if (ret != 0) {
1013                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1014                         ctdb_addr_to_str(&pip->addr),
1015                         ctdb_vnn_iface_string(vnn)));
1016                 talloc_free(state);
1017                 return -1;
1018         }
1019
1020         /* tell the control that we will be reply asynchronously */
1021         *async_reply = true;
1022         return 0;
1023 }
1024
1025 /*
1026   release an ip address old v4 style
1027  */
1028 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1029                                 struct ctdb_req_control *c,
1030                                 TDB_DATA indata, 
1031                                 bool *async_reply)
1032 {
1033         TDB_DATA data;
1034         
1035         data.dsize = sizeof(struct ctdb_public_ip);
1036         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1037         CTDB_NO_MEMORY(ctdb, data.dptr);
1038         
1039         memcpy(data.dptr, indata.dptr, indata.dsize);
1040         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1041 }
1042
1043
1044 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1045                                    ctdb_sock_addr *addr,
1046                                    unsigned mask, const char *ifaces,
1047                                    bool check_address)
1048 {
1049         struct ctdb_vnn      *vnn;
1050         uint32_t num = 0;
1051         char *tmp;
1052         const char *iface;
1053         int i;
1054         int ret;
1055
1056         tmp = strdup(ifaces);
1057         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1058                 if (!ctdb_sys_check_iface_exists(iface)) {
1059                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1060                         free(tmp);
1061                         return -1;
1062                 }
1063         }
1064         free(tmp);
1065
1066         /* Verify that we dont have an entry for this ip yet */
1067         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1068                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1069                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1070                                 ctdb_addr_to_str(addr)));
1071                         return -1;
1072                 }               
1073         }
1074
1075         /* create a new vnn structure for this ip address */
1076         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1077         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1078         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1079         tmp = talloc_strdup(vnn, ifaces);
1080         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1081         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1082                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1083                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1084                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1085                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1086                 num++;
1087         }
1088         talloc_free(tmp);
1089         vnn->ifaces[num] = NULL;
1090         vnn->public_address      = *addr;
1091         vnn->public_netmask_bits = mask;
1092         vnn->pnn                 = -1;
1093         if (check_address) {
1094                 if (ctdb_sys_have_ip(addr)) {
1095                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1096                         vnn->pnn = ctdb->pnn;
1097                 }
1098         }
1099
1100         for (i=0; vnn->ifaces[i]; i++) {
1101                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1102                 if (ret != 0) {
1103                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1104                                            "for public_address[%s]\n",
1105                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1106                         talloc_free(vnn);
1107                         return -1;
1108                 }
1109         }
1110
1111         DLIST_ADD(ctdb->vnn, vnn);
1112
1113         return 0;
1114 }
1115
1116 /*
1117   setup the event script directory
1118 */
1119 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1120 {
1121         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1122         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1123         return 0;
1124 }
1125
1126 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1127                                   struct timeval t, void *private_data)
1128 {
1129         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1130                                                         struct ctdb_context);
1131         struct ctdb_vnn *vnn;
1132
1133         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1134                 int i;
1135
1136                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1137                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1138                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1139                                         vnn->ifaces[i],
1140                                         ctdb_addr_to_str(&vnn->public_address)));
1141                         }
1142                 }
1143         }
1144
1145         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1146                 timeval_current_ofs(30, 0), 
1147                 ctdb_check_interfaces_event, ctdb);
1148 }
1149
1150
1151 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1152 {
1153         if (ctdb->check_public_ifaces_ctx != NULL) {
1154                 talloc_free(ctdb->check_public_ifaces_ctx);
1155                 ctdb->check_public_ifaces_ctx = NULL;
1156         }
1157
1158         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1159         if (ctdb->check_public_ifaces_ctx == NULL) {
1160                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1161         }
1162
1163         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1164                 timeval_current_ofs(30, 0), 
1165                 ctdb_check_interfaces_event, ctdb);
1166
1167         return 0;
1168 }
1169
1170
1171 /*
1172   setup the public address lists from a file
1173 */
1174 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1175 {
1176         char **lines;
1177         int nlines;
1178         int i;
1179
1180         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1181         if (lines == NULL) {
1182                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1183                 return -1;
1184         }
1185         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1186                 nlines--;
1187         }
1188
1189         for (i=0;i<nlines;i++) {
1190                 unsigned mask;
1191                 ctdb_sock_addr addr;
1192                 const char *addrstr;
1193                 const char *ifaces;
1194                 char *tok, *line;
1195
1196                 line = lines[i];
1197                 while ((*line == ' ') || (*line == '\t')) {
1198                         line++;
1199                 }
1200                 if (*line == '#') {
1201                         continue;
1202                 }
1203                 if (strcmp(line, "") == 0) {
1204                         continue;
1205                 }
1206                 tok = strtok(line, " \t");
1207                 addrstr = tok;
1208                 tok = strtok(NULL, " \t");
1209                 if (tok == NULL) {
1210                         if (NULL == ctdb->default_public_interface) {
1211                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1212                                          i+1));
1213                                 talloc_free(lines);
1214                                 return -1;
1215                         }
1216                         ifaces = ctdb->default_public_interface;
1217                 } else {
1218                         ifaces = tok;
1219                 }
1220
1221                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1222                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1223                         talloc_free(lines);
1224                         return -1;
1225                 }
1226                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1227                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1228                         talloc_free(lines);
1229                         return -1;
1230                 }
1231         }
1232
1233
1234         talloc_free(lines);
1235         return 0;
1236 }
1237
1238 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1239                               const char *iface,
1240                               const char *ip)
1241 {
1242         struct ctdb_vnn *svnn;
1243         struct ctdb_iface *cur = NULL;
1244         bool ok;
1245         int ret;
1246
1247         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1248         CTDB_NO_MEMORY(ctdb, svnn);
1249
1250         svnn->ifaces = talloc_array(svnn, const char *, 2);
1251         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1252         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1253         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1254         svnn->ifaces[1] = NULL;
1255
1256         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1257         if (!ok) {
1258                 talloc_free(svnn);
1259                 return -1;
1260         }
1261
1262         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1263         if (ret != 0) {
1264                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1265                                    "for single_ip[%s]\n",
1266                                    svnn->ifaces[0],
1267                                    ctdb_addr_to_str(&svnn->public_address)));
1268                 talloc_free(svnn);
1269                 return -1;
1270         }
1271
1272         /* assume the single public ip interface is initially "good" */
1273         cur = ctdb_find_iface(ctdb, iface);
1274         if (cur == NULL) {
1275                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1276                 return -1;
1277         }
1278         cur->link_up = true;
1279
1280         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1281         if (ret != 0) {
1282                 talloc_free(svnn);
1283                 return -1;
1284         }
1285
1286         ctdb->single_ip_vnn = svnn;
1287         return 0;
1288 }
1289
1290 /* Given a physical node, return the number of
1291    public addresses that is currently assigned to this node.
1292 */
1293 static int node_ip_coverage(struct ctdb_context *ctdb, 
1294         int32_t pnn,
1295         struct ctdb_public_ip_list *ips)
1296 {
1297         int num=0;
1298
1299         for (;ips;ips=ips->next) {
1300                 if (ips->pnn == pnn) {
1301                         num++;
1302                 }
1303         }
1304         return num;
1305 }
1306
1307
1308 /* Can the given node host the given IP: is the public IP known to the
1309  * node and is NOIPHOST unset?
1310 */
1311 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1312                              struct ctdb_node_map *nodemap,
1313                              struct ctdb_public_ip_list *ip)
1314 {
1315         struct ctdb_all_public_ips *public_ips;
1316         int i;
1317
1318         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPHOST) {
1319                 return false;
1320         }
1321
1322         public_ips = ctdb->nodes[pnn]->available_public_ips;
1323
1324         if (public_ips == NULL) {
1325                 return false;
1326         }
1327
1328         for (i=0;i<public_ips->num;i++) {
1329                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1330                         /* yes, this node can serve this public ip */
1331                         return true;
1332                 }
1333         }
1334
1335         return false;
1336 }
1337
1338 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1339                                  struct ctdb_node_map *nodemap,
1340                                  struct ctdb_public_ip_list *ip)
1341 {
1342         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPTAKEOVER) {
1343                 return false;
1344         }
1345
1346         return can_node_host_ip(ctdb, pnn, nodemap, ip);
1347 }
1348
1349 /* search the node lists list for a node to takeover this ip.
1350    pick the node that currently are serving the least number of ips
1351    so that the ips get spread out evenly.
1352 */
1353 static int find_takeover_node(struct ctdb_context *ctdb, 
1354                 struct ctdb_node_map *nodemap,
1355                 struct ctdb_public_ip_list *ip,
1356                 struct ctdb_public_ip_list *all_ips)
1357 {
1358         int pnn, min=0, num;
1359         int i;
1360
1361         pnn    = -1;
1362         for (i=0;i<nodemap->num;i++) {
1363                 /* verify that this node can serve this ip */
1364                 if (!can_node_takeover_ip(ctdb, i, nodemap, ip)) {
1365                         /* no it couldnt   so skip to the next node */
1366                         continue;
1367                 }
1368
1369                 num = node_ip_coverage(ctdb, i, all_ips);
1370                 /* was this the first node we checked ? */
1371                 if (pnn == -1) {
1372                         pnn = i;
1373                         min  = num;
1374                 } else {
1375                         if (num < min) {
1376                                 pnn = i;
1377                                 min  = num;
1378                         }
1379                 }
1380         }       
1381         if (pnn == -1) {
1382                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1383                         ctdb_addr_to_str(&ip->addr)));
1384
1385                 return -1;
1386         }
1387
1388         ip->pnn = pnn;
1389         return 0;
1390 }
1391
1392 #define IP_KEYLEN       4
1393 static uint32_t *ip_key(ctdb_sock_addr *ip)
1394 {
1395         static uint32_t key[IP_KEYLEN];
1396
1397         bzero(key, sizeof(key));
1398
1399         switch (ip->sa.sa_family) {
1400         case AF_INET:
1401                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1402                 break;
1403         case AF_INET6: {
1404                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1405                 key[0]  = htonl(s6_a32[0]);
1406                 key[1]  = htonl(s6_a32[1]);
1407                 key[2]  = htonl(s6_a32[2]);
1408                 key[3]  = htonl(s6_a32[3]);
1409                 break;
1410         }
1411         default:
1412                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1413                 return key;
1414         }
1415
1416         return key;
1417 }
1418
1419 static void *add_ip_callback(void *parm, void *data)
1420 {
1421         struct ctdb_public_ip_list *this_ip = parm; 
1422         struct ctdb_public_ip_list *prev_ip = data; 
1423
1424         if (prev_ip == NULL) {
1425                 return parm;
1426         }
1427         if (this_ip->pnn == -1) {
1428                 this_ip->pnn = prev_ip->pnn;
1429         }
1430
1431         return parm;
1432 }
1433
1434 static int getips_count_callback(void *param, void *data)
1435 {
1436         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1437         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1438
1439         new_ip->next = *ip_list;
1440         *ip_list     = new_ip;
1441         return 0;
1442 }
1443
1444 static struct ctdb_public_ip_list *
1445 create_merged_ip_list(struct ctdb_context *ctdb)
1446 {
1447         int i, j;
1448         struct ctdb_public_ip_list *ip_list;
1449         struct ctdb_all_public_ips *public_ips;
1450
1451         if (ctdb->ip_tree != NULL) {
1452                 talloc_free(ctdb->ip_tree);
1453                 ctdb->ip_tree = NULL;
1454         }
1455         ctdb->ip_tree = trbt_create(ctdb, 0);
1456
1457         for (i=0;i<ctdb->num_nodes;i++) {
1458                 public_ips = ctdb->nodes[i]->known_public_ips;
1459
1460                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1461                         continue;
1462                 }
1463
1464                 /* there were no public ips for this node */
1465                 if (public_ips == NULL) {
1466                         continue;
1467                 }               
1468
1469                 for (j=0;j<public_ips->num;j++) {
1470                         struct ctdb_public_ip_list *tmp_ip; 
1471
1472                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1473                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1474                         /* Do not use information about IP addresses hosted
1475                          * on other nodes, it may not be accurate */
1476                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1477                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1478                         } else {
1479                                 tmp_ip->pnn = -1;
1480                         }
1481                         tmp_ip->addr = public_ips->ips[j].addr;
1482                         tmp_ip->next = NULL;
1483
1484                         trbt_insertarray32_callback(ctdb->ip_tree,
1485                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1486                                 add_ip_callback,
1487                                 tmp_ip);
1488                 }
1489         }
1490
1491         ip_list = NULL;
1492         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1493
1494         return ip_list;
1495 }
1496
1497 /* 
1498  * This is the length of the longtest common prefix between the IPs.
1499  * It is calculated by XOR-ing the 2 IPs together and counting the
1500  * number of leading zeroes.  The implementation means that all
1501  * addresses end up being 128 bits long.
1502  *
1503  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1504  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1505  * lots of nodes and IP addresses?
1506  */
1507 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1508 {
1509         uint32_t ip1_k[IP_KEYLEN];
1510         uint32_t *t;
1511         int i;
1512         uint32_t x;
1513
1514         uint32_t distance = 0;
1515
1516         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1517         t = ip_key(ip2);
1518         for (i=0; i<IP_KEYLEN; i++) {
1519                 x = ip1_k[i] ^ t[i];
1520                 if (x == 0) {
1521                         distance += 32;
1522                 } else {
1523                         /* Count number of leading zeroes. 
1524                          * FIXME? This could be optimised...
1525                          */
1526                         while ((x & (1 << 31)) == 0) {
1527                                 x <<= 1;
1528                                 distance += 1;
1529                         }
1530                 }
1531         }
1532
1533         return distance;
1534 }
1535
1536 /* Calculate the IP distance for the given IP relative to IPs on the
1537    given node.  The ips argument is generally the all_ips variable
1538    used in the main part of the algorithm.
1539  */
1540 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1541                                   struct ctdb_public_ip_list *ips,
1542                                   int pnn)
1543 {
1544         struct ctdb_public_ip_list *t;
1545         uint32_t d;
1546
1547         uint32_t sum = 0;
1548
1549         for (t=ips; t != NULL; t=t->next) {
1550                 if (t->pnn != pnn) {
1551                         continue;
1552                 }
1553
1554                 /* Optimisation: We never calculate the distance
1555                  * between an address and itself.  This allows us to
1556                  * calculate the effect of removing an address from a
1557                  * node by simply calculating the distance between
1558                  * that address and all of the exitsing addresses.
1559                  * Moreover, we assume that we're only ever dealing
1560                  * with addresses from all_ips so we can identify an
1561                  * address via a pointer rather than doing a more
1562                  * expensive address comparison. */
1563                 if (&(t->addr) == ip) {
1564                         continue;
1565                 }
1566
1567                 d = ip_distance(ip, &(t->addr));
1568                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1569         }
1570
1571         return sum;
1572 }
1573
1574 /* Return the LCP2 imbalance metric for addresses currently assigned
1575    to the given node.
1576  */
1577 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1578 {
1579         struct ctdb_public_ip_list *t;
1580
1581         uint32_t imbalance = 0;
1582
1583         for (t=all_ips; t!=NULL; t=t->next) {
1584                 if (t->pnn != pnn) {
1585                         continue;
1586                 }
1587                 /* Pass the rest of the IPs rather than the whole
1588                    all_ips input list.
1589                 */
1590                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1591         }
1592
1593         return imbalance;
1594 }
1595
1596 /* Allocate any unassigned IPs just by looping through the IPs and
1597  * finding the best node for each.
1598  */
1599 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1600                                       struct ctdb_node_map *nodemap,
1601                                       struct ctdb_public_ip_list *all_ips)
1602 {
1603         struct ctdb_public_ip_list *tmp_ip;
1604
1605         /* loop over all ip's and find a physical node to cover for 
1606            each unassigned ip.
1607         */
1608         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1609                 if (tmp_ip->pnn == -1) {
1610                         if (find_takeover_node(ctdb, nodemap, tmp_ip, all_ips)) {
1611                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1612                                         ctdb_addr_to_str(&tmp_ip->addr)));
1613                         }
1614                 }
1615         }
1616 }
1617
1618 /* Basic non-deterministic rebalancing algorithm.
1619  */
1620 static void basic_failback(struct ctdb_context *ctdb,
1621                            struct ctdb_node_map *nodemap,
1622                            struct ctdb_public_ip_list *all_ips,
1623                            int num_ips)
1624 {
1625         int i;
1626         int maxnode, maxnum, minnode, minnum, num, retries;
1627         struct ctdb_public_ip_list *tmp_ip;
1628
1629         retries = 0;
1630
1631 try_again:
1632         maxnum=0;
1633         minnum=0;
1634
1635         /* for each ip address, loop over all nodes that can serve
1636            this ip and make sure that the difference between the node
1637            serving the most and the node serving the least ip's are
1638            not greater than 1.
1639         */
1640         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1641                 if (tmp_ip->pnn == -1) {
1642                         continue;
1643                 }
1644
1645                 /* Get the highest and lowest number of ips's served by any 
1646                    valid node which can serve this ip.
1647                 */
1648                 maxnode = -1;
1649                 minnode = -1;
1650                 for (i=0;i<nodemap->num;i++) {
1651                         /* only check nodes that can actually serve this ip */
1652                         if (!can_node_takeover_ip(ctdb, i, nodemap, tmp_ip)) {
1653                                 /* no it couldnt   so skip to the next node */
1654                                 continue;
1655                         }
1656
1657                         num = node_ip_coverage(ctdb, i, all_ips);
1658                         if (maxnode == -1) {
1659                                 maxnode = i;
1660                                 maxnum  = num;
1661                         } else {
1662                                 if (num > maxnum) {
1663                                         maxnode = i;
1664                                         maxnum  = num;
1665                                 }
1666                         }
1667                         if (minnode == -1) {
1668                                 minnode = i;
1669                                 minnum  = num;
1670                         } else {
1671                                 if (num < minnum) {
1672                                         minnode = i;
1673                                         minnum  = num;
1674                                 }
1675                         }
1676                 }
1677                 if (maxnode == -1) {
1678                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1679                                 ctdb_addr_to_str(&tmp_ip->addr)));
1680
1681                         continue;
1682                 }
1683
1684                 /* if the spread between the smallest and largest coverage by
1685                    a node is >=2 we steal one of the ips from the node with
1686                    most coverage to even things out a bit.
1687                    try to do this a limited number of times since we dont
1688                    want to spend too much time balancing the ip coverage.
1689                 */
1690                 if ( (maxnum > minnum+1)
1691                      && (retries < (num_ips + 5)) ){
1692                         struct ctdb_public_ip_list *tmp;
1693
1694                         /* Reassign one of maxnode's VNNs */
1695                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1696                                 if (tmp->pnn == maxnode) {
1697                                         (void)find_takeover_node(ctdb, nodemap, tmp, all_ips);
1698                                         retries++;
1699                                         goto try_again;;
1700                                 }
1701                         }
1702                 }
1703         }
1704 }
1705
/* Singly-linked list entry recording the PNN of a node that has been
 * flagged for a forced rebalance.
 */
struct ctdb_rebalancenodes {
        struct ctdb_rebalancenodes *next;
        uint32_t pnn;
};
/* Head of the pending forced-rebalance list.  Entries are pushed by
 * lcp2_forcerebalance() and drained (and freed) by lcp2_init().
 */
static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1711
1712
1713 /* set this flag to force the node to be rebalanced even if it just didnt
1714    become healthy again.
1715 */
1716 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1717 {
1718         struct ctdb_rebalancenodes *rebalance;
1719
1720         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1721                 if (rebalance->pnn == pnn) {
1722                         return;
1723                 }
1724         }
1725
1726         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1727         rebalance->pnn = pnn;
1728         rebalance->next = force_rebalance_list;
1729         force_rebalance_list = rebalance;
1730 }
1731
1732 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1733  * that we can unit test it.
1734  */
1735 static void lcp2_init(struct ctdb_context * tmp_ctx,
1736                struct ctdb_node_map * nodemap,
1737                struct ctdb_public_ip_list *all_ips,
1738                uint32_t **lcp2_imbalances,
1739                bool **rebalance_candidates)
1740 {
1741         int i;
1742         struct ctdb_public_ip_list *tmp_ip;
1743
1744         *rebalance_candidates = talloc_array(tmp_ctx, bool, nodemap->num);
1745         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1746         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1747         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1748
1749         for (i=0;i<nodemap->num;i++) {
1750                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1751                 /* First step: assume all nodes are candidates */
1752                 (*rebalance_candidates)[i] = true;
1753         }
1754
1755         /* 2nd step: if a node has IPs assigned then it must have been
1756          * healthy before, so we remove it from consideration.  This
1757          * is overkill but is all we have because we don't maintain
1758          * state between takeover runs.  An alternative would be to
1759          * keep state and invalidate it every time the recovery master
1760          * changes.
1761          */
1762         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1763                 if (tmp_ip->pnn != -1) {
1764                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1765                 }
1766         }
1767
1768         /* 3rd step: if a node is forced to re-balance then
1769            we allow failback onto the node */
1770         while (force_rebalance_list != NULL) {
1771                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1772
1773                 if (force_rebalance_list->pnn <= nodemap->num) {
1774                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1775                 }
1776
1777                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1778                 talloc_free(force_rebalance_list);
1779                 force_rebalance_list = next;
1780         }
1781 }
1782
1783 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1784  * the IP/node combination that will cost the least.
1785  */
1786 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1787                                      struct ctdb_node_map *nodemap,
1788                                      struct ctdb_public_ip_list *all_ips,
1789                                      uint32_t *lcp2_imbalances)
1790 {
1791         struct ctdb_public_ip_list *tmp_ip;
1792         int dstnode;
1793
1794         int minnode;
1795         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1796         struct ctdb_public_ip_list *minip;
1797
1798         bool should_loop = true;
1799         bool have_unassigned = true;
1800
1801         while (have_unassigned && should_loop) {
1802                 should_loop = false;
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1806
1807                 minnode = -1;
1808                 mindsum = 0;
1809                 minip = NULL;
1810
1811                 /* loop over each unassigned ip. */
1812                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1813                         if (tmp_ip->pnn != -1) {
1814                                 continue;
1815                         }
1816
1817                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1818                                 /* only check nodes that can actually takeover this ip */
1819                                 if (!can_node_takeover_ip(ctdb, dstnode,
1820                                                           nodemap, tmp_ip)) {
1821                                         /* no it couldnt   so skip to the next node */
1822                                         continue;
1823                                 }
1824
1825                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1826                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1827                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1828                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1829                                                    dstnode,
1830                                                    dstimbl - lcp2_imbalances[dstnode]));
1831
1832
1833                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1834                                         minnode = dstnode;
1835                                         minimbl = dstimbl;
1836                                         mindsum = dstdsum;
1837                                         minip = tmp_ip;
1838                                         should_loop = true;
1839                                 }
1840                         }
1841                 }
1842
1843                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1844
1845                 /* If we found one then assign it to the given node. */
1846                 if (minnode != -1) {
1847                         minip->pnn = minnode;
1848                         lcp2_imbalances[minnode] = minimbl;
1849                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1850                                           ctdb_addr_to_str(&(minip->addr)),
1851                                           minnode,
1852                                           mindsum));
1853                 }
1854
1855                 /* There might be a better way but at least this is clear. */
1856                 have_unassigned = false;
1857                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1858                         if (tmp_ip->pnn == -1) {
1859                                 have_unassigned = true;
1860                         }
1861                 }
1862         }
1863
1864         /* We know if we have an unassigned addresses so we might as
1865          * well optimise.
1866          */
1867         if (have_unassigned) {
1868                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1869                         if (tmp_ip->pnn == -1) {
1870                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1871                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1872                         }
1873                 }
1874         }
1875 }
1876
1877 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1878  * to move IPs from, determines the best IP/destination node
1879  * combination to move from the source node.
1880  */
1881 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1882                                     struct ctdb_node_map *nodemap,
1883                                     struct ctdb_public_ip_list *all_ips,
1884                                     int srcnode,
1885                                     uint32_t candimbl,
1886                                     uint32_t *lcp2_imbalances,
1887                                     bool *rebalance_candidates)
1888 {
1889         int dstnode, mindstnode;
1890         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1891         uint32_t minsrcimbl, mindstimbl;
1892         struct ctdb_public_ip_list *minip;
1893         struct ctdb_public_ip_list *tmp_ip;
1894
1895         /* Find an IP and destination node that best reduces imbalance. */
1896         minip = NULL;
1897         minsrcimbl = 0;
1898         mindstnode = -1;
1899         mindstimbl = 0;
1900
1901         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1902         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1903
1904         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1905                 /* Only consider addresses on srcnode. */
1906                 if (tmp_ip->pnn != srcnode) {
1907                         continue;
1908                 }
1909
1910                 /* What is this IP address costing the source node? */
1911                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1912                 srcimbl = candimbl - srcdsum;
1913
1914                 /* Consider this IP address would cost each potential
1915                  * destination node.  Destination nodes are limited to
1916                  * those that are newly healthy, since we don't want
1917                  * to do gratuitous failover of IPs just to make minor
1918                  * balance improvements.
1919                  */
1920                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1921                         if (!rebalance_candidates[dstnode]) {
1922                                 continue;
1923                         }
1924
1925                         /* only check nodes that can actually takeover this ip */
1926                         if (!can_node_takeover_ip(ctdb, dstnode,
1927                                                   nodemap, tmp_ip)) {
1928                                 /* no it couldnt   so skip to the next node */
1929                                 continue;
1930                         }
1931
1932                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1933                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1934                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1935                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1936                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1937                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1938
1939                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1940                             ((mindstnode == -1) ||                              \
1941                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1942
1943                                 minip = tmp_ip;
1944                                 minsrcimbl = srcimbl;
1945                                 mindstnode = dstnode;
1946                                 mindstimbl = dstimbl;
1947                         }
1948                 }
1949         }
1950         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1951
1952         if (mindstnode != -1) {
1953                 /* We found a move that makes things better... */
1954                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1955                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1956                                   ctdb_addr_to_str(&(minip->addr)),
1957                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1958
1959
1960                 lcp2_imbalances[srcnode] = srcimbl;
1961                 lcp2_imbalances[mindstnode] = mindstimbl;
1962                 minip->pnn = mindstnode;
1963
1964                 return true;
1965         }
1966
1967         return false;
1968         
1969 }
1970
/* Pairs a node number with its LCP2 imbalance so the two can be
 * sorted together.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator over struct lcp2_imbalance_pnn that orders
 * entries by DESCENDING imbalance: returns -1 when a's imbalance is
 * larger than b's, 1 when it is smaller and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1989
1990 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1991  * node with the highest LCP2 imbalance, and then determines the best
1992  * IP/destination node combination to move from the source node.
1993  */
1994 static void lcp2_failback(struct ctdb_context *ctdb,
1995                           struct ctdb_node_map *nodemap,
1996                           struct ctdb_public_ip_list *all_ips,
1997                           uint32_t *lcp2_imbalances,
1998                           bool *rebalance_candidates)
1999 {
2000         int i, num_rebalance_candidates;
2001         struct lcp2_imbalance_pnn * lips;
2002         bool again;
2003
2004 try_again:
2005
2006         /* It is only worth continuing if we have suitable target
2007          * nodes to transfer IPs to.  This check is much cheaper than
2008          * continuing on...
2009          */
2010         num_rebalance_candidates = 0;
2011         for (i = 0; i < nodemap->num; i++) {
2012                 if (rebalance_candidates[i]) {
2013                         num_rebalance_candidates++;
2014                 }
2015         }
2016         if (num_rebalance_candidates == 0) {
2017                 return;
2018         }
2019
2020         /* Put the imbalances and nodes into an array, sort them and
2021          * iterate through candidates.  Usually the 1st one will be
2022          * used, so this doesn't cost much...
2023          */
2024         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2025         for (i = 0; i < nodemap->num; i++) {
2026                 lips[i].imbalance = lcp2_imbalances[i];
2027                 lips[i].pnn = i;
2028         }
2029         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2030               lcp2_cmp_imbalance_pnn);
2031
2032         again = false;
2033         for (i = 0; i < nodemap->num; i++) {
2034                 /* This means that all nodes had 0 or 1 addresses, so
2035                  * can't be imbalanced.
2036                  */
2037                 if (lips[i].imbalance == 0) {
2038                         break;
2039                 }
2040
2041                 if (lcp2_failback_candidate(ctdb,
2042                                             nodemap,
2043                                             all_ips,
2044                                             lips[i].pnn,
2045                                             lips[i].imbalance,
2046                                             lcp2_imbalances,
2047                                             rebalance_candidates)) {
2048                         again = true;
2049                         break;
2050                 }
2051         }
2052
2053         talloc_free(lips);
2054         if (again) {
2055                 goto try_again;
2056         }
2057 }
2058
2059 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2060                                     struct ctdb_node_map *nodemap,
2061                                     struct ctdb_public_ip_list *all_ips)
2062 {
2063         struct ctdb_public_ip_list *tmp_ip;
2064
2065         /* verify that the assigned nodes can serve that public ip
2066            and set it to -1 if not
2067         */
2068         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2069                 if (tmp_ip->pnn == -1) {
2070                         continue;
2071                 }
2072                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2073                                       nodemap, tmp_ip) != 0) {
2074                         /* this node can not serve this ip. */
2075                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2076                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2077                                            tmp_ip->pnn));
2078                         tmp_ip->pnn = -1;
2079                 }
2080         }
2081 }
2082
2083 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2084                                        struct ctdb_node_map *nodemap,
2085                                        struct ctdb_public_ip_list *all_ips)
2086 {
2087         struct ctdb_public_ip_list *tmp_ip;
2088         int i;
2089
2090         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2091        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2092         *  always be allocated the same way for a specific set of
2093         *  available/unavailable nodes.
2094         */
2095
2096         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2097                 tmp_ip->pnn = i%nodemap->num;
2098         }
2099
2100         /* IP failback doesn't make sense with deterministic
2101          * IPs, since the modulo step above implicitly fails
2102          * back IPs to their "home" node.
2103          */
2104         if (1 == ctdb->tunable.no_ip_failback) {
2105                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2106         }
2107
2108         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2109
2110         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2111
2112         /* No failback here! */
2113 }
2114
2115 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2116                                           struct ctdb_node_map *nodemap,
2117                                           struct ctdb_public_ip_list *all_ips)
2118 {
2119         /* This should be pushed down into basic_failback. */
2120         struct ctdb_public_ip_list *tmp_ip;
2121         int num_ips = 0;
2122         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2123                 num_ips++;
2124         }
2125
2126         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2127
2128         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2129
2130         /* If we don't want IPs to fail back then don't rebalance IPs. */
2131         if (1 == ctdb->tunable.no_ip_failback) {
2132                 return;
2133         }
2134
2135         /* Now, try to make sure the ip adresses are evenly distributed
2136            across the nodes.
2137         */
2138         basic_failback(ctdb, nodemap, all_ips, num_ips);
2139 }
2140
2141 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2142                           struct ctdb_node_map *nodemap,
2143                           struct ctdb_public_ip_list *all_ips)
2144 {
2145         uint32_t *lcp2_imbalances;
2146         bool *rebalance_candidates;
2147
2148         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2149
2150         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2151
2152         lcp2_init(tmp_ctx, nodemap, all_ips,
2153                   &lcp2_imbalances, &rebalance_candidates);
2154
2155         lcp2_allocate_unassigned(ctdb, nodemap, all_ips, lcp2_imbalances);
2156
2157         /* If we don't want IPs to fail back then don't rebalance IPs. */
2158         if (1 == ctdb->tunable.no_ip_failback) {
2159                 goto finished;
2160         }
2161
2162         /* Now, try to make sure the ip adresses are evenly distributed
2163            across the nodes.
2164         */
2165         lcp2_failback(ctdb, nodemap, all_ips,
2166                       lcp2_imbalances, rebalance_candidates);
2167
2168 finished:
2169         talloc_free(tmp_ctx);
2170 }
2171
2172 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2173 {
2174         int i, num_healthy;
2175
2176         /* Count how many completely healthy nodes we have */
2177         num_healthy = 0;
2178         for (i=0;i<nodemap->num;i++) {
2179                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2180                         num_healthy++;
2181                 }
2182         }
2183
2184         return num_healthy == 0;
2185 }
2186
2187 /* The calculation part of the IP allocation algorithm. */
2188 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2189                                    struct ctdb_node_map *nodemap,
2190                                    struct ctdb_public_ip_list **all_ips_p)
2191 {
2192         /* since nodes only know about those public addresses that
2193            can be served by that particular node, no single node has
2194            a full list of all public addresses that exist in the cluster.
2195            Walk over all node structures and create a merged list of
2196            all public addresses that exist in the cluster.
2197
2198            keep the tree of ips around as ctdb->ip_tree
2199         */
2200         *all_ips_p = create_merged_ip_list(ctdb);
2201
2202         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2203                 ip_alloc_lcp2(ctdb, nodemap, *all_ips_p);
2204         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2205                 ip_alloc_deterministic_ips(ctdb, nodemap, *all_ips_p);
2206         } else {
2207                 ip_alloc_nondeterministic_ips(ctdb, nodemap, *all_ips_p);
2208         }
2209
2210         /* at this point ->pnn is the node which will own each IP
2211            or -1 if there is no node that can cover this ip
2212         */
2213
2214         return;
2215 }
2216
/* State shared with get_tunable_callback() while a tunable is being
 * collected from the nodes.
 */
struct get_tunable_callback_data {
        /* name of the tunable being fetched; used in log messages */
        const char *tunable;
        /* talloc'd result array indexed by pnn */
        uint32_t *out;
};
2221
2222 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2223                                  int32_t res, TDB_DATA outdata,
2224                                  void *callback)
2225 {
2226         struct get_tunable_callback_data *cd =
2227                 (struct get_tunable_callback_data *)callback;
2228         int size;
2229
2230         if (res != 0) {
2231                 DEBUG(DEBUG_ERR,
2232                       ("Failure to read \"%s\" tunable from remote node %d\n",
2233                        cd->tunable, pnn));
2234                 return;
2235         }
2236
2237         if (outdata.dsize != sizeof(uint32_t)) {
2238                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2239                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2240                                  (int)outdata.dsize));
2241                 return;
2242         }
2243
2244         size = talloc_get_size(cd->out) / sizeof(uint32_t);
2245         if (pnn >= size) {
2246                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2247                                  cd->tunable, pnn, size));
2248                 return;
2249         }
2250
2251                 
2252         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2253 }
2254
2255 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2256                                         TALLOC_CTX *tmp_ctx,
2257                                         struct ctdb_node_map *nodemap,
2258                                         const char *tunable)
2259 {
2260         TDB_DATA data;
2261         struct ctdb_control_get_tunable *t;
2262         uint32_t *nodes;
2263         uint32_t *tvals;
2264         struct get_tunable_callback_data callback_data;
2265
2266         tvals = talloc_zero_array(tmp_ctx, uint32_t, nodemap->num);
2267         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2268         callback_data.out = tvals;
2269         callback_data.tunable = tunable;
2270
2271         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2272         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2273         t = (struct ctdb_control_get_tunable *)data.dptr;
2274         t->length = strlen(tunable)+1;
2275         memcpy(t->name, tunable, t->length);
2276         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2277         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2278                                       nodes, 0, TAKEOVER_TIMEOUT(),
2279                                       false, data,
2280                                       get_tunable_callback, NULL,
2281                                       &callback_data) != 0) {
2282                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get %s tunable failed\n", tunable));
2283         }
2284         talloc_free(nodes);
2285         talloc_free(data.dptr);
2286
2287         return tvals;
2288 }
2289
2290 static void clear_ipflags(struct ctdb_node_map *nodemap)
2291 {
2292         int i;
2293
2294         for (i=0;i<nodemap->num;i++) {
2295                 nodemap->nodes[i].flags &=
2296                         ~(NODE_FLAGS_NOIPTAKEOVER|NODE_FLAGS_NOIPHOST);
2297         }
2298 }
2299
2300
2301 /* Set internal flags for IP allocation:
2302  *   Clear ip flags
2303  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2304  *   Set NOIPHOST ip flag for each INACTIVE node
2305  *   if all nodes are disabled:
2306  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2307  *   else
2308  *     Set NOIPHOST ip flags for disabled nodes
2309  */
2310 static void set_ipflags_internal(struct ctdb_node_map *nodemap,
2311                                  uint32_t *tval_noiptakeover,
2312                                  uint32_t *tval_noiphostonalldisabled)
2313 {
2314         int i;
2315
2316         clear_ipflags(nodemap);
2317
2318         for (i=0;i<nodemap->num;i++) {
2319                 /* Can not take IPs on node with NoIPTakeover set */
2320                 if (tval_noiptakeover[i] != 0) {
2321                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPTAKEOVER;
2322                 }
2323
2324                 /* Can not host IPs on INACTIVE node */
2325                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2326                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2327                 }
2328         }
2329
2330         if (all_nodes_are_disabled(nodemap)) {
2331                 /* If all nodes are disabled, can not host IPs on node
2332                  * with NoIPHostOnAllDisabled set
2333                  */
2334                 for (i=0;i<nodemap->num;i++) {
2335                         if (tval_noiphostonalldisabled[i] != 0) {
2336                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2337                         }
2338                 }
2339         } else {
2340                 /* If some nodes are not disabled, then can not host
2341                  * IPs on DISABLED node
2342                  */
2343                 for (i=0;i<nodemap->num;i++) {
2344                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2345                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2346                         }
2347                 }
2348         }
2349 }
2350
2351 static bool set_ipflags(struct ctdb_context *ctdb,
2352                         TALLOC_CTX *tmp_ctx,
2353                         struct ctdb_node_map *nodemap)
2354 {
2355         uint32_t *tval_noiptakeover;
2356         uint32_t *tval_noiphostonalldisabled;
2357
2358         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2359                                                    "NoIPTakeover");
2360         if (tval_noiptakeover == NULL) {
2361                 return false;
2362         }
2363
2364         tval_noiphostonalldisabled =
2365                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2366                                        "NoIPHostOnAllDisabled");
2367         if (tval_noiphostonalldisabled == NULL) {
2368                 return false;
2369         }
2370
2371         set_ipflags_internal(nodemap,
2372                              tval_noiptakeover, tval_noiphostonalldisabled);
2373
2374         talloc_free(tval_noiptakeover);
2375         talloc_free(tval_noiphostonalldisabled);
2376
2377         return true;
2378 }
2379
2380 /*
2381   make any IP alias changes for public addresses that are necessary 
2382  */
2383 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2384                       client_async_callback fail_callback, void *callback_data)
2385 {
2386         int i;
2387         struct ctdb_public_ip ip;
2388         struct ctdb_public_ipv4 ipv4;
2389         uint32_t *nodes;
2390         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2391         TDB_DATA data;
2392         struct timeval timeout;
2393         struct client_async_data *async_data;
2394         struct ctdb_client_control_state *state;
2395         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2396         uint32_t disable_timeout;
2397
2398         /*
2399          * ip failover is completely disabled, just send out the 
2400          * ipreallocated event.
2401          */
2402         if (ctdb->tunable.disable_ip_failover != 0) {
2403                 goto ipreallocated;
2404         }
2405
2406         if (!set_ipflags(ctdb, tmp_ctx, nodemap)) {
2407                 DEBUG(DEBUG_ERR,("Failed to set IP flags from tunables\n"));
2408                 return -1;
2409         }
2410
2411         ZERO_STRUCT(ip);
2412
2413         /* Do the IP reassignment calculations */
2414         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2415
2416         /* The IP flags need to be cleared because they should never
2417          * be seen outside the IP allocation code.
2418          */
2419         clear_ipflags(nodemap);
2420
2421         /* The recovery daemon does regular sanity checks of the IPs.
2422          * However, sometimes it is overzealous and thinks changes are
2423          * required when they're already underway.  This stops the
2424          * checks for a while before we start moving IPs.
2425          */
2426         disable_timeout = ctdb->tunable.takeover_timeout;
2427         data.dptr  = (uint8_t*)&disable_timeout;
2428         data.dsize = sizeof(disable_timeout);
2429         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2430                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2431                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2432         }
2433
2434         /* now tell all nodes to delete any alias that they should not
2435            have.  This will be a NOOP on nodes that don't currently
2436            hold the given alias */
2437         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2438         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2439
2440         async_data->fail_callback = fail_callback;
2441         async_data->callback_data = callback_data;
2442
2443         for (i=0;i<nodemap->num;i++) {
2444                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2445                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2446                         continue;
2447                 }
2448
2449                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2450                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2451                                 /* This node should be serving this
2452                                    vnn so dont tell it to release the ip
2453                                 */
2454                                 continue;
2455                         }
2456                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2457                                 ipv4.pnn = tmp_ip->pnn;
2458                                 ipv4.sin = tmp_ip->addr.ip;
2459
2460                                 timeout = TAKEOVER_TIMEOUT();
2461                                 data.dsize = sizeof(ipv4);
2462                                 data.dptr  = (uint8_t *)&ipv4;
2463                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2464                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2465                                                 data, async_data,
2466                                                 &timeout, NULL);
2467                         } else {
2468                                 ip.pnn  = tmp_ip->pnn;
2469                                 ip.addr = tmp_ip->addr;
2470
2471                                 timeout = TAKEOVER_TIMEOUT();
2472                                 data.dsize = sizeof(ip);
2473                                 data.dptr  = (uint8_t *)&ip;
2474                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2475                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2476                                                 data, async_data,
2477                                                 &timeout, NULL);
2478                         }
2479
2480                         if (state == NULL) {
2481                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2482                                 talloc_free(tmp_ctx);
2483                                 return -1;
2484                         }
2485                 
2486                         ctdb_client_async_add(async_data, state);
2487                 }
2488         }
2489         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2490                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2491                 talloc_free(tmp_ctx);
2492                 return -1;
2493         }
2494         talloc_free(async_data);
2495
2496
2497         /* tell all nodes to get their own IPs */
2498         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2499         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2500
2501         async_data->fail_callback = fail_callback;
2502         async_data->callback_data = callback_data;
2503
2504         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2505                 if (tmp_ip->pnn == -1) {
2506                         /* this IP won't be taken over */
2507                         continue;
2508                 }
2509
2510                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2511                         ipv4.pnn = tmp_ip->pnn;
2512                         ipv4.sin = tmp_ip->addr.ip;
2513
2514                         timeout = TAKEOVER_TIMEOUT();
2515                         data.dsize = sizeof(ipv4);
2516                         data.dptr  = (uint8_t *)&ipv4;
2517                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2518                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2519                                         data, async_data,
2520                                         &timeout, NULL);
2521                 } else {
2522                         ip.pnn  = tmp_ip->pnn;
2523                         ip.addr = tmp_ip->addr;
2524
2525                         timeout = TAKEOVER_TIMEOUT();
2526                         data.dsize = sizeof(ip);
2527                         data.dptr  = (uint8_t *)&ip;
2528                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2529                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2530                                         data, async_data,
2531                                         &timeout, NULL);
2532                 }
2533                 if (state == NULL) {
2534                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2535                         talloc_free(tmp_ctx);
2536                         return -1;
2537                 }
2538                 
2539                 ctdb_client_async_add(async_data, state);
2540         }
2541         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2542                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2543                 talloc_free(tmp_ctx);
2544                 return -1;
2545         }
2546
2547 ipreallocated:
2548         /* 
2549          * Tell all nodes to run eventscripts to process the
2550          * "ipreallocated" event.  This can do a lot of things,
2551          * including restarting services to reconfigure them if public
2552          * IPs have moved.  Once upon a time this event only used to
2553          * update natwg.
2554          */
2555         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2556         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2557                                       nodes, 0, TAKEOVER_TIMEOUT(),
2558                                       false, tdb_null,
2559                                       NULL, fail_callback,
2560                                       callback_data) != 0) {
2561                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2562         }
2563
2564         talloc_free(tmp_ctx);
2565         return 0;
2566 }
2567
2568
2569 /*
2570   destroy a ctdb_client_ip structure
2571  */
2572 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2573 {
2574         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2575                 ctdb_addr_to_str(&ip->addr),
2576                 ntohs(ip->addr.ip.sin_port),
2577                 ip->client_id));
2578
2579         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2580         return 0;
2581 }
2582
2583 /*
2584   called by a client to inform us of a TCP connection that it is managing
2585   that should tickled with an ACK when IP takeover is done
2586   we handle both the old ipv4 style of packets as well as the new ipv4/6
2587   pdus.
2588  */
2589 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2590                                 TDB_DATA indata)
2591 {
2592         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2593         struct ctdb_control_tcp *old_addr = NULL;
2594         struct ctdb_control_tcp_addr new_addr;
2595         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2596         struct ctdb_tcp_list *tcp;
2597         struct ctdb_tcp_connection t;
2598         int ret;
2599         TDB_DATA data;
2600         struct ctdb_client_ip *ip;
2601         struct ctdb_vnn *vnn;
2602         ctdb_sock_addr addr;
2603
2604         switch (indata.dsize) {
2605         case sizeof(struct ctdb_control_tcp):
2606                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2607                 ZERO_STRUCT(new_addr);
2608                 tcp_sock = &new_addr;
2609                 tcp_sock->src.ip  = old_addr->src;
2610                 tcp_sock->dest.ip = old_addr->dest;
2611                 break;
2612         case sizeof(struct ctdb_control_tcp_addr):
2613                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2614                 break;
2615         default:
2616                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2617                                  "to ctdb_control_tcp_client. size was %d but "
2618                                  "only allowed sizes are %lu and %lu\n",
2619                                  (int)indata.dsize,
2620                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2621                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2622                 return -1;
2623         }
2624
2625         addr = tcp_sock->src;
2626         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2627         addr = tcp_sock->dest;
2628         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2629
2630         ZERO_STRUCT(addr);
2631         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2632         vnn = find_public_ip_vnn(ctdb, &addr);
2633         if (vnn == NULL) {
2634                 switch (addr.sa.sa_family) {
2635                 case AF_INET:
2636                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2637                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2638                                         ctdb_addr_to_str(&addr)));
2639                         }
2640                         break;
2641                 case AF_INET6:
2642                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2643                                 ctdb_addr_to_str(&addr)));
2644                         break;
2645                 default:
2646                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2647                 }
2648
2649                 return 0;
2650         }
2651
2652         if (vnn->pnn != ctdb->pnn) {
2653                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2654                         ctdb_addr_to_str(&addr),
2655                         client_id, client->pid));
2656                 /* failing this call will tell smbd to die */
2657                 return -1;
2658         }
2659
2660         ip = talloc(client, struct ctdb_client_ip);
2661         CTDB_NO_MEMORY(ctdb, ip);
2662
2663         ip->ctdb      = ctdb;
2664         ip->addr      = addr;
2665         ip->client_id = client_id;
2666         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2667         DLIST_ADD(ctdb->client_ip_list, ip);
2668
2669         tcp = talloc(client, struct ctdb_tcp_list);
2670         CTDB_NO_MEMORY(ctdb, tcp);
2671
2672         tcp->connection.src_addr = tcp_sock->src;
2673         tcp->connection.dst_addr = tcp_sock->dest;
2674
2675         DLIST_ADD(client->tcp_list, tcp);
2676
2677         t.src_addr = tcp_sock->src;
2678         t.dst_addr = tcp_sock->dest;
2679
2680         data.dptr = (uint8_t *)&t;
2681         data.dsize = sizeof(t);
2682
2683         switch (addr.sa.sa_family) {
2684         case AF_INET:
2685                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2686                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2687                         ctdb_addr_to_str(&tcp_sock->src),
2688                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2689                 break;
2690         case AF_INET6:
2691                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2692                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2693                         ctdb_addr_to_str(&tcp_sock->src),
2694                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2695                 break;
2696         default:
2697                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2698         }
2699
2700
2701         /* tell all nodes about this tcp connection */
2702         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2703                                        CTDB_CONTROL_TCP_ADD,
2704                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2705         if (ret != 0) {
2706                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2707                 return -1;
2708         }
2709
2710         return 0;
2711 }
2712
2713 /*
2714   find a tcp address on a list
2715  */
2716 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2717                                            struct ctdb_tcp_connection *tcp)
2718 {
2719         int i;
2720
2721         if (array == NULL) {
2722                 return NULL;
2723         }
2724
2725         for (i=0;i<array->num;i++) {
2726                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2727                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2728                         return &array->connections[i];
2729                 }
2730         }
2731         return NULL;
2732 }
2733
2734
2735
2736 /*
2737   called by a daemon to inform us of a TCP connection that one of its
2738   clients managing that should tickled with an ACK when IP takeover is
2739   done
2740  */
2741 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2742 {
2743         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2744         struct ctdb_tcp_array *tcparray;
2745         struct ctdb_tcp_connection tcp;
2746         struct ctdb_vnn *vnn;
2747
2748         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2749         if (vnn == NULL) {
2750                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2751                         ctdb_addr_to_str(&p->dst_addr)));
2752
2753                 return -1;
2754         }
2755
2756
2757         tcparray = vnn->tcp_array;
2758
2759         /* If this is the first tickle */
2760         if (tcparray == NULL) {
2761                 tcparray = talloc_size(ctdb->nodes, 
2762                         offsetof(struct ctdb_tcp_array, connections) +
2763                         sizeof(struct ctdb_tcp_connection) * 1);
2764                 CTDB_NO_MEMORY(ctdb, tcparray);
2765                 vnn->tcp_array = tcparray;
2766
2767                 tcparray->num = 0;
2768                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2769                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2770
2771                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2772                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2773                 tcparray->num++;
2774
2775                 if (tcp_update_needed) {
2776                         vnn->tcp_update_needed = true;
2777                 }
2778                 return 0;
2779         }
2780
2781
2782         /* Do we already have this tickle ?*/
2783         tcp.src_addr = p->src_addr;
2784         tcp.dst_addr = p->dst_addr;
2785         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2786                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2787                         ctdb_addr_to_str(&tcp.dst_addr),
2788                         ntohs(tcp.dst_addr.ip.sin_port),
2789                         vnn->pnn));
2790                 return 0;
2791         }
2792
2793         /* A new tickle, we must add it to the array */
2794         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2795                                         struct ctdb_tcp_connection,
2796                                         tcparray->num+1);
2797         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2798
2799         vnn->tcp_array = tcparray;
2800         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2801         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2802         tcparray->num++;
2803                                 
2804         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2805                 ctdb_addr_to_str(&tcp.dst_addr),
2806                 ntohs(tcp.dst_addr.ip.sin_port),
2807                 vnn->pnn));
2808
2809         if (tcp_update_needed) {
2810                 vnn->tcp_update_needed = true;
2811         }
2812
2813         return 0;
2814 }
2815
2816
2817 /*
2818   called by a daemon to inform us of a TCP connection that one of its
2819   clients managing that should tickled with an ACK when IP takeover is
2820   done
2821  */
2822 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2823 {
2824         struct ctdb_tcp_connection *tcpp;
2825         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2826
2827         if (vnn == NULL) {
2828                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2829                         ctdb_addr_to_str(&conn->dst_addr)));
2830                 return;
2831         }
2832
2833         /* if the array is empty we cant remove it
2834            and we dont need to do anything
2835          */
2836         if (vnn->tcp_array == NULL) {
2837                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2838                         ctdb_addr_to_str(&conn->dst_addr),
2839                         ntohs(conn->dst_addr.ip.sin_port)));
2840                 return;
2841         }
2842
2843
2844         /* See if we know this connection
2845            if we dont know this connection  then we dont need to do anything
2846          */
2847         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2848         if (tcpp == NULL) {
2849                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2850                         ctdb_addr_to_str(&conn->dst_addr),
2851                         ntohs(conn->dst_addr.ip.sin_port)));
2852                 return;
2853         }
2854
2855
2856         /* We need to remove this entry from the array.
2857            Instead of allocating a new array and copying data to it
2858            we cheat and just copy the last entry in the existing array
2859            to the entry that is to be removed and just shring the 
2860            ->num field
2861          */
2862         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2863         vnn->tcp_array->num--;
2864
2865         /* If we deleted the last entry we also need to remove the entire array
2866          */
2867         if (vnn->tcp_array->num == 0) {
2868                 talloc_free(vnn->tcp_array);
2869                 vnn->tcp_array = NULL;
2870         }               
2871
2872         vnn->tcp_update_needed = true;
2873
2874         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2875                 ctdb_addr_to_str(&conn->src_addr),
2876                 ntohs(conn->src_addr.ip.sin_port)));
2877 }
2878
2879
2880 /*
2881   called by a daemon to inform us of a TCP connection that one of its
2882   clients used are no longer needed in the tickle database
2883  */
2884 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2885 {
2886         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2887
2888         ctdb_remove_tcp_connection(ctdb, conn);
2889
2890         return 0;
2891 }
2892
2893
/*
  called when a daemon restarts - the intention is to send all tickles
  for all public addresses we are serving immediately to the new node.

  Currently a stub: nothing is sent and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        return 0;
}
2903
2904
2905 /*
2906   called when a client structure goes away - hook to remove
2907   elements from the tcp_list in all daemons
2908  */
2909 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2910 {
2911         while (client->tcp_list) {
2912                 struct ctdb_tcp_list *tcp = client->tcp_list;
2913                 DLIST_REMOVE(client->tcp_list, tcp);
2914                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2915         }
2916 }
2917
2918
2919 /*
2920   release all IPs on shutdown
2921  */
2922 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2923 {
2924         struct ctdb_vnn *vnn;
2925
2926         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2927                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2928                         ctdb_vnn_unassign_iface(ctdb, vnn);
2929                         continue;
2930                 }
2931                 if (!vnn->iface) {
2932                         continue;
2933                 }
2934                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2935                                   ctdb_vnn_iface_string(vnn),
2936                                   ctdb_addr_to_str(&vnn->public_address),
2937                                   vnn->public_netmask_bits);
2938                 release_kill_clients(ctdb, &vnn->public_address);
2939                 ctdb_vnn_unassign_iface(ctdb, vnn);
2940         }
2941 }
2942
2943
2944 /*
2945   get list of public IPs
2946  */
2947 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2948                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2949 {
2950         int i, num, len;
2951         struct ctdb_all_public_ips *ips;
2952         struct ctdb_vnn *vnn;
2953         bool only_available = false;
2954
2955         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2956                 only_available = true;
2957         }
2958
2959         /* count how many public ip structures we have */
2960         num = 0;
2961         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2962                 num++;
2963         }
2964
2965         len = offsetof(struct ctdb_all_public_ips, ips) + 
2966                 num*sizeof(struct ctdb_public_ip);
2967         ips = talloc_zero_size(outdata, len);
2968         CTDB_NO_MEMORY(ctdb, ips);
2969
2970         i = 0;
2971         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2972                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2973                         continue;
2974                 }
2975                 ips->ips[i].pnn  = vnn->pnn;
2976                 ips->ips[i].addr = vnn->public_address;
2977                 i++;
2978         }
2979         ips->num = i;
2980         len = offsetof(struct ctdb_all_public_ips, ips) +
2981                 i*sizeof(struct ctdb_public_ip);
2982
2983         outdata->dsize = len;
2984         outdata->dptr  = (uint8_t *)ips;
2985
2986         return 0;
2987 }
2988
2989
2990 /*
2991   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2992  */
2993 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2994                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2995 {
2996         int i, num, len;
2997         struct ctdb_all_public_ipsv4 *ips;
2998         struct ctdb_vnn *vnn;
2999
3000         /* count how many public ip structures we have */
3001         num = 0;
3002         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3003                 if (vnn->public_address.sa.sa_family != AF_INET) {
3004                         continue;
3005                 }
3006                 num++;
3007         }
3008
3009         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3010                 num*sizeof(struct ctdb_public_ipv4);
3011         ips = talloc_zero_size(outdata, len);
3012         CTDB_NO_MEMORY(ctdb, ips);
3013
3014         outdata->dsize = len;
3015         outdata->dptr  = (uint8_t *)ips;
3016
3017         ips->num = num;
3018         i = 0;
3019         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3020                 if (vnn->public_address.sa.sa_family != AF_INET) {
3021                         continue;
3022                 }
3023                 ips->ips[i].pnn = vnn->pnn;
3024                 ips->ips[i].sin = vnn->public_address.ip;
3025                 i++;
3026         }
3027
3028         return 0;
3029 }
3030
3031 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3032                                         struct ctdb_req_control *c,
3033                                         TDB_DATA indata,
3034                                         TDB_DATA *outdata)
3035 {
3036         int i, num, len;
3037         ctdb_sock_addr *addr;
3038         struct ctdb_control_public_ip_info *info;
3039         struct ctdb_vnn *vnn;
3040
3041         addr = (ctdb_sock_addr *)indata.dptr;
3042
3043         vnn = find_public_ip_vnn(ctdb, addr);
3044         if (vnn == NULL) {
3045                 /* if it is not a public ip   it could be our 'single ip' */
3046                 if (ctdb->single_ip_vnn) {
3047                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3048                                 vnn = ctdb->single_ip_vnn;
3049                         }
3050                 }
3051         }
3052         if (vnn == NULL) {
3053                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3054                                  "'%s'not a public address\n",
3055                                  ctdb_addr_to_str(addr)));
3056                 return -1;
3057         }
3058
3059         /* count how many public ip structures we have */
3060         num = 0;
3061         for (;vnn->ifaces[num];) {
3062                 num++;
3063         }
3064
3065         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3066                 num*sizeof(struct ctdb_control_iface_info);
3067         info = talloc_zero_size(outdata, len);
3068         CTDB_NO_MEMORY(ctdb, info);
3069
3070         info->ip.addr = vnn->public_address;
3071         info->ip.pnn = vnn->pnn;
3072         info->active_idx = 0xFFFFFFFF;
3073
3074         for (i=0; vnn->ifaces[i]; i++) {
3075                 struct ctdb_iface *cur;
3076
3077                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3078                 if (cur == NULL) {
3079                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3080                                            vnn->ifaces[i]));
3081                         return -1;
3082                 }
3083                 if (vnn->iface == cur) {
3084                         info->active_idx = i;
3085                 }
3086                 strcpy(info->ifaces[i].name, cur->name);
3087                 info->ifaces[i].link_state = cur->link_up;
3088                 info->ifaces[i].references = cur->references;
3089         }
3090         info->num = i;
3091         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3092                 i*sizeof(struct ctdb_control_iface_info);
3093
3094         outdata->dsize = len;
3095         outdata->dptr  = (uint8_t *)info;
3096
3097         return 0;
3098 }
3099
3100 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3101                                 struct ctdb_req_control *c,
3102                                 TDB_DATA *outdata)
3103 {
3104         int i, num, len;
3105         struct ctdb_control_get_ifaces *ifaces;
3106         struct ctdb_iface *cur;
3107
3108         /* count how many public ip structures we have */
3109         num = 0;
3110         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3111                 num++;
3112         }
3113
3114         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3115                 num*sizeof(struct ctdb_control_iface_info);
3116         ifaces = talloc_zero_size(outdata, len);
3117         CTDB_NO_MEMORY(ctdb, ifaces);
3118
3119         i = 0;
3120         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3121                 strcpy(ifaces->ifaces[i].name, cur->name);
3122                 ifaces->ifaces[i].link_state = cur->link_up;
3123                 ifaces->ifaces[i].references = cur->references;
3124                 i++;
3125         }
3126         ifaces->num = i;
3127         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3128                 i*sizeof(struct ctdb_control_iface_info);
3129
3130         outdata->dsize = len;
3131         outdata->dptr  = (uint8_t *)ifaces;
3132
3133         return 0;
3134 }
3135
3136 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3137                                     struct ctdb_req_control *c,
3138                                     TDB_DATA indata)
3139 {
3140         struct ctdb_control_iface_info *info;
3141         struct ctdb_iface *iface;
3142         bool link_up = false;
3143
3144         info = (struct ctdb_control_iface_info *)indata.dptr;
3145
3146         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3147                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3148                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3149                                   len, len, info->name));
3150                 return -1;
3151         }
3152
3153         switch (info->link_state) {
3154         case 0:
3155                 link_up = false;
3156                 break;
3157         case 1:
3158                 link_up = true;
3159                 break;
3160         default:
3161                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3162                                   (unsigned int)info->link_state));
3163                 return -1;
3164         }
3165
3166         if (info->references != 0) {
3167                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3168                                   (unsigned int)info->references));
3169                 return -1;
3170         }
3171
3172         iface = ctdb_find_iface(ctdb, info->name);
3173         if (iface == NULL) {
3174                 return -1;
3175         }
3176
3177         if (link_up == iface->link_up) {
3178                 return 0;
3179         }
3180
3181         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3182               ("iface[%s] has changed it's link status %s => %s\n",
3183                iface->name,
3184                iface->link_up?"up":"down",
3185                link_up?"up":"down"));
3186
3187         iface->link_up = link_up;
3188         return 0;
3189 }
3190
3191
3192 /* 
3193    structure containing the listening socket and the list of tcp connections
3194    that the ctdb daemon is to kill
3195 */
3196 struct ctdb_kill_tcp {
3197         struct ctdb_vnn *vnn;
3198         struct ctdb_context *ctdb;
3199         int capture_fd;
3200         struct fd_event *fde;
3201         trbt_tree_t *connections;
3202         void *private_data;
3203 };
3204
3205 /*
3206   a tcp connection that is to be killed
3207  */
3208 struct ctdb_killtcp_con {
3209         ctdb_sock_addr src_addr;
3210         ctdb_sock_addr dst_addr;
3211         int count;
3212         struct ctdb_kill_tcp *killtcp;
3213 };
3214
3215 /* this function is used to create a key to represent this socketpair
3216    in the killtcp tree.
3217    this key is used to insert and lookup matching socketpairs that are
3218    to be tickled and RST
3219 */
3220 #define KILLTCP_KEYLEN  10
3221 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3222 {
3223         static uint32_t key[KILLTCP_KEYLEN];
3224
3225         bzero(key, sizeof(key));
3226
3227         if (src->sa.sa_family != dst->sa.sa_family) {
3228                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3229                 return key;
3230         }
3231         
3232         switch (src->sa.sa_family) {
3233         case AF_INET:
3234                 key[0]  = dst->ip.sin_addr.s_addr;
3235                 key[1]  = src->ip.sin_addr.s_addr;
3236                 key[2]  = dst->ip.sin_port;
3237                 key[3]  = src->ip.sin_port;
3238                 break;
3239         case AF_INET6: {
3240                 uint32_t *dst6_addr32 =
3241                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3242                 uint32_t *src6_addr32 =
3243                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3244                 key[0]  = dst6_addr32[3];
3245                 key[1]  = src6_addr32[3];
3246                 key[2]  = dst6_addr32[2];
3247                 key[3]  = src6_addr32[2];
3248                 key[4]  = dst6_addr32[1];
3249                 key[5]  = src6_addr32[1];
3250                 key[6]  = dst6_addr32[0];
3251                 key[7]  = src6_addr32[0];
3252                 key[8]  = dst->ip6.sin6_port;
3253                 key[9]  = src->ip6.sin6_port;
3254                 break;
3255         }
3256         default:
3257                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3258                 return key;
3259         }
3260
3261         return key;
3262 }
3263
3264 /*
3265   called when we get a read event on the raw socket
3266  */
3267 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3268                                 uint16_t flags, void *private_data)
3269 {
3270         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3271         struct ctdb_killtcp_con *con;
3272         ctdb_sock_addr src, dst;
3273         uint32_t ack_seq, seq;
3274
3275         if (!(flags & EVENT_FD_READ)) {
3276                 return;
3277         }
3278
3279         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3280                                 killtcp->private_data,
3281                                 &src, &dst,
3282                                 &ack_seq, &seq) != 0) {
3283                 /* probably a non-tcp ACK packet */
3284                 return;
3285         }
3286
3287         /* check if we have this guy in our list of connections
3288            to kill
3289         */
3290         con = trbt_lookuparray32(killtcp->connections, 
3291                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3292         if (con == NULL) {
3293                 /* no this was some other packet we can just ignore */
3294                 return;
3295         }
3296
3297         /* This one has been tickled !
3298            now reset him and remove him from the list.
3299          */
3300         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3301                 ntohs(con->dst_addr.ip.sin_port),
3302                 ctdb_addr_to_str(&con->src_addr),
3303                 ntohs(con->src_addr.ip.sin_port)));
3304
3305         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3306         talloc_free(con);
3307 }
3308
3309
3310 /* when traversing the list of all tcp connections to send tickle acks to
3311    (so that we can capture the ack coming back and kill the connection
3312     by a RST)
3313    this callback is called for each connection we are currently trying to kill
3314 */
3315 static int tickle_connection_traverse(void *param, void *data)
3316 {
3317         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3318
3319         /* have tried too many times, just give up */
3320         if (con->count >= 5) {
3321                 /* can't delete in traverse: reparent to delete_cons */
3322                 talloc_steal(param, con);
3323                 return 0;
3324         }
3325
3326         /* othervise, try tickling it again */
3327         con->count++;
3328         ctdb_sys_send_tcp(
3329                 (ctdb_sock_addr *)&con->dst_addr,
3330                 (ctdb_sock_addr *)&con->src_addr,
3331                 0, 0, 0);
3332         return 0;
3333 }
3334
3335
3336 /* 
3337    called every second until all sentenced connections have been reset
3338  */
3339 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3340                                               struct timeval t, void *private_data)
3341 {
3342         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3343         void *delete_cons = talloc_new(NULL);
3344
3345         /* loop over all connections sending tickle ACKs */
3346         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3347
3348         /* now we've finished traverse, it's safe to do deletion. */
3349         talloc_free(delete_cons);
3350
3351         /* If there are no more connections to kill we can remove the
3352            entire killtcp structure
3353          */
3354         if ( (killtcp->connections == NULL) || 
3355              (killtcp->connections->root == NULL) ) {
3356                 talloc_free(killtcp);
3357                 return;
3358         }
3359
3360         /* try tickling them again in a seconds time
3361          */
3362         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3363                         ctdb_tickle_sentenced_connections, killtcp);
3364 }
3365
3366 /*
3367   destroy the killtcp structure
3368  */
3369 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3370 {
3371         struct ctdb_vnn *tmpvnn;
3372
3373         /* verify that this vnn is still active */
3374         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3375                 if (tmpvnn == killtcp->vnn) {
3376                         break;
3377                 }
3378         }
3379
3380         if (tmpvnn == NULL) {
3381                 return 0;
3382         }
3383
3384         if (killtcp->vnn->killtcp != killtcp) {
3385                 return 0;
3386         }
3387
3388         killtcp->vnn->killtcp = NULL;
3389
3390         return 0;
3391 }
3392
3393
/* Insert callback for the killtcp tree: unconditionally let the new
   connection structure (parm) take the place of whatever was stored
   under the same key (data).

   The old entry is deliberately not freed here.  Per the original
   author's note it is talloc_stolen by the same tree node and goes away
   when the new data is deleted -- NOTE(review): confirm against the
   rb_tree implementation.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	(void)data;
	return replacement;
}
3405
3406 /*
3407   add a tcp socket to the list of connections we want to RST
3408  */
3409 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3410                                        ctdb_sock_addr *s,
3411                                        ctdb_sock_addr *d)
3412 {
3413         ctdb_sock_addr src, dst;
3414         struct ctdb_kill_tcp *killtcp;
3415         struct ctdb_killtcp_con *con;
3416         struct ctdb_vnn *vnn;
3417
3418         ctdb_canonicalize_ip(s, &src);
3419         ctdb_canonicalize_ip(d, &dst);
3420
3421         vnn = find_public_ip_vnn(ctdb, &dst);
3422         if (vnn == NULL) {
3423                 vnn = find_public_ip_vnn(ctdb, &src);
3424         }
3425         if (vnn == NULL) {
3426                 /* if it is not a public ip   it could be our 'single ip' */
3427                 if (ctdb->single_ip_vnn) {
3428                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3429                                 vnn = ctdb->single_ip_vnn;
3430                         }
3431                 }
3432         }
3433         if (vnn == NULL) {
3434                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3435                 return -1;
3436         }
3437
3438         killtcp = vnn->killtcp;
3439         
3440         /* If this is the first connection to kill we must allocate
3441            a new structure
3442          */
3443         if (killtcp == NULL) {
3444                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3445                 CTDB_NO_MEMORY(ctdb, killtcp);
3446
3447                 killtcp->vnn         = vnn;
3448                 killtcp->ctdb        = ctdb;
3449                 killtcp->capture_fd  = -1;
3450                 killtcp->connections = trbt_create(killtcp, 0);
3451
3452                 vnn->killtcp         = killtcp;
3453                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3454         }
3455
3456
3457
3458         /* create a structure that describes this connection we want to
3459            RST and store it in killtcp->connections
3460         */
3461         con = talloc(killtcp, struct ctdb_killtcp_con);
3462         CTDB_NO_MEMORY(ctdb, con);
3463         con->src_addr = src;
3464         con->dst_addr = dst;
3465         con->count    = 0;
3466         con->killtcp  = killtcp;
3467
3468
3469         trbt_insertarray32_callback(killtcp->connections,
3470                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3471                         add_killtcp_callback, con);
3472
3473         /* 
3474            If we dont have a socket to listen on yet we must create it
3475          */
3476         if (killtcp->capture_fd == -1) {
3477                 const char *iface = ctdb_vnn_iface_string(vnn);
3478                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3479                 if (killtcp->capture_fd == -1) {
3480                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3481                                           "socket on iface '%s' for killtcp (%s)\n",
3482                                           iface, strerror(errno)));
3483                         goto failed;
3484                 }
3485         }
3486
3487
3488         if (killtcp->fde == NULL) {
3489                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3490                                             EVENT_FD_READ,
3491                                             capture_tcp_handler, killtcp);
3492                 tevent_fd_set_auto_close(killtcp->fde);
3493
3494                 /* We also need to set up some events to tickle all these connections
3495                    until they are all reset
3496                 */
3497                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3498                                 ctdb_tickle_sentenced_connections, killtcp);
3499         }
3500
3501         /* tickle him once now */
3502         ctdb_sys_send_tcp(
3503                 &con->dst_addr,
3504                 &con->src_addr,
3505                 0, 0, 0);
3506
3507         return 0;
3508
3509 failed:
3510         talloc_free(vnn->killtcp);
3511         vnn->killtcp = NULL;
3512         return -1;
3513 }
3514
3515 /*
3516   kill a TCP connection.
3517  */
3518 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3519 {
3520         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3521
3522         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3523 }
3524
3525 /*
3526   called by a daemon to inform us of the entire list of TCP tickles for
3527   a particular public address.
3528   this control should only be sent by the node that is currently serving
3529   that public address.
3530  */
3531 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3532 {
3533         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3534         struct ctdb_tcp_array *tcparray;
3535         struct ctdb_vnn *vnn;
3536
3537         /* We must at least have tickles.num or else we cant verify the size
3538            of the received data blob
3539          */
3540         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3541                                         tickles.connections)) {
3542                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3543                 return -1;
3544         }
3545
3546         /* verify that the size of data matches what we expect */
3547         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3548                                 tickles.connections)
3549                          + sizeof(struct ctdb_tcp_connection)
3550                                  * list->tickles.num) {
3551                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3552                 return -1;
3553         }       
3554
3555         vnn = find_public_ip_vnn(ctdb, &list->addr);
3556         if (vnn == NULL) {
3557                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3558                         ctdb_addr_to_str(&list->addr)));
3559
3560                 return 1;
3561         }
3562
3563         /* remove any old ticklelist we might have */
3564         talloc_free(vnn->tcp_array);
3565         vnn->tcp_array = NULL;
3566
3567         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3568         CTDB_NO_MEMORY(ctdb, tcparray);
3569
3570         tcparray->num = list->tickles.num;
3571
3572         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3573         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3574
3575         memcpy(tcparray->connections, &list->tickles.connections[0], 
3576                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3577
3578         /* We now have a new fresh tickle list array for this vnn */
3579         vnn->tcp_array = talloc_steal(vnn, tcparray);
3580         
3581         return 0;
3582 }
3583
3584 /*
3585   called to return the full list of tickles for the puclic address associated 
3586   with the provided vnn
3587  */
3588 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3589 {
3590         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3591         struct ctdb_control_tcp_tickle_list *list;
3592         struct ctdb_tcp_array *tcparray;
3593         int num;
3594         struct ctdb_vnn *vnn;
3595
3596         vnn = find_public_ip_vnn(ctdb, addr);
3597         if (vnn == NULL) {
3598                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3599                         ctdb_addr_to_str(addr)));
3600
3601                 return 1;
3602         }
3603
3604         tcparray = vnn->tcp_array;
3605         if (tcparray) {
3606                 num = tcparray->num;
3607         } else {
3608                 num = 0;
3609         }
3610
3611         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3612                                 tickles.connections)
3613                         + sizeof(struct ctdb_tcp_connection) * num;
3614
3615         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3616         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3617         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3618
3619         list->addr = *addr;
3620         list->tickles.num = num;
3621         if (num) {
3622                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3623                         sizeof(struct ctdb_tcp_connection) * num);
3624         }
3625
3626         return 0;
3627 }
3628
3629
3630 /*
3631   set the list of all tcp tickles for a public address
3632  */
3633 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3634                               struct timeval timeout, uint32_t destnode, 
3635                               ctdb_sock_addr *addr,
3636                               struct ctdb_tcp_array *tcparray)
3637 {
3638         int ret, num;
3639         TDB_DATA data;
3640         struct ctdb_control_tcp_tickle_list *list;
3641
3642         if (tcparray) {
3643                 num = tcparray->num;
3644         } else {
3645                 num = 0;
3646         }
3647
3648         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3649                                 tickles.connections) +
3650                         sizeof(struct ctdb_tcp_connection) * num;
3651         data.dptr = talloc_size(ctdb, data.dsize);
3652         CTDB_NO_MEMORY(ctdb, data.dptr);
3653
3654         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3655         list->addr = *addr;
3656         list->tickles.num = num;
3657         if (tcparray) {
3658                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3659         }
3660
3661         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3662                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3663                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3664         if (ret != 0) {
3665                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3666                 return -1;
3667         }
3668
3669         talloc_free(data.dptr);
3670
3671         return ret;
3672 }
3673
3674
3675 /*
3676   perform tickle updates if required
3677  */
3678 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3679                                 struct timed_event *te, 
3680                                 struct timeval t, void *private_data)
3681 {
3682         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3683         int ret;
3684         struct ctdb_vnn *vnn;
3685
3686         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3687                 /* we only send out updates for public addresses that 
3688                    we have taken over
3689                  */
3690                 if (ctdb->pnn != vnn->pnn) {
3691                         continue;
3692                 }
3693                 /* We only send out the updates if we need to */
3694                 if (!vnn->tcp_update_needed) {
3695                         continue;
3696                 }
3697                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3698                                 TAKEOVER_TIMEOUT(),
3699                                 CTDB_BROADCAST_CONNECTED,
3700                                 &vnn->public_address,
3701                                 vnn->tcp_array);
3702                 if (ret != 0) {
3703                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3704                                 ctdb_addr_to_str(&vnn->public_address)));
3705                 }
3706         }
3707
3708         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3709                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3710                              ctdb_update_tcp_tickles, ctdb);
3711 }               
3712         
3713
3714 /*
3715   start periodic update of tcp tickles
3716  */
3717 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3718 {
3719         ctdb->tickle_update_context = talloc_new(ctdb);
3720
3721         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3722                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3723                              ctdb_update_tcp_tickles, ctdb);
3724 }
3725
3726
3727
3728
3729 struct control_gratious_arp {
3730         struct ctdb_context *ctdb;
3731         ctdb_sock_addr addr;
3732         const char *iface;
3733         int count;
3734 };
3735
3736 /*
3737   send a control_gratuitous arp
3738  */
3739 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3740                                   struct timeval t, void *private_data)
3741 {
3742         int ret;
3743         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3744                                                         struct control_gratious_arp);
3745
3746         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3747         if (ret != 0) {
3748                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3749                                  arp->iface, strerror(errno)));
3750         }
3751
3752
3753         arp->count++;
3754         if (arp->count == CTDB_ARP_REPEAT) {
3755                 talloc_free(arp);
3756                 return;
3757         }
3758
3759         event_add_timed(arp->ctdb->ev, arp, 
3760                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3761                         send_gratious_arp, arp);
3762 }
3763
3764
3765 /*
3766   send a gratious arp 
3767  */
3768 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3769 {
3770         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3771         struct control_gratious_arp *arp;
3772
3773         /* verify the size of indata */
3774         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3775                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3776                                  (unsigned)indata.dsize, 
3777                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3778                 return -1;
3779         }
3780         if (indata.dsize != 
3781                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3782                 + gratious_arp->len ) ){
3783
3784                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3785                         "but should be %u bytes\n", 
3786                          (unsigned)indata.dsize, 
3787                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3788                 return -1;
3789         }
3790
3791
3792         arp = talloc(ctdb, struct control_gratious_arp);
3793         CTDB_NO_MEMORY(ctdb, arp);
3794
3795         arp->ctdb  = ctdb;
3796         arp->addr   = gratious_arp->addr;
3797         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3798         CTDB_NO_MEMORY(ctdb, arp->iface);
3799         arp->count = 0;
3800         
3801         event_add_timed(arp->ctdb->ev, arp, 
3802                         timeval_zero(), send_gratious_arp, arp);
3803
3804         return 0;
3805 }
3806
3807 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3808 {
3809         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3810         int ret;
3811
3812         /* verify the size of indata */
3813         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3814                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3815                 return -1;
3816         }
3817         if (indata.dsize != 
3818                 ( offsetof(struct ctdb_control_ip_iface, iface)
3819                 + pub->len ) ){
3820
3821                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3822                         "but should be %u bytes\n", 
3823                          (unsigned)indata.dsize, 
3824                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3825                 return -1;
3826         }
3827
3828         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3829
3830         if (ret != 0) {
3831                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3832                 return -1;
3833         }
3834
3835         return 0;
3836 }
3837
/*
  called when the releaseip event finishes for del_public_address;
  private_data is the talloc context that is freed here, whatever the
  event status was.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *doomed_ctx = private_data;

	(void)ctdb;
	(void)status;
	talloc_free(doomed_ctx);
}
3846
3847 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3848 {
3849         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3850         struct ctdb_vnn *vnn;
3851         int ret;
3852
3853         /* verify the size of indata */
3854         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3855                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3856                 return -1;
3857         }
3858         if (indata.dsize != 
3859                 ( offsetof(struct ctdb_control_ip_iface, iface)
3860                 + pub->len ) ){
3861
3862                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3863                         "but should be %u bytes\n", 
3864                          (unsigned)indata.dsize, 
3865                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3866                 return -1;
3867         }
3868
3869         /* walk over all public addresses until we find a match */
3870         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3871                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3872                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3873
3874                         DLIST_REMOVE(ctdb->vnn, vnn);
3875                         talloc_steal(mem_ctx, vnn);
3876                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3877                         if (vnn->pnn != ctdb->pnn) {
3878                                 if (vnn->iface != NULL) {
3879                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3880                                 }
3881                                 talloc_free(mem_ctx);
3882                                 return 0;
3883                         }
3884                         vnn->pnn = -1;
3885
3886                         ret = ctdb_event_script_callback(ctdb, 
3887                                          mem_ctx, delete_ip_callback, mem_ctx,
3888                                          false,
3889                                          CTDB_EVENT_RELEASE_IP,
3890                                          "%s %s %u",
3891                                          ctdb_vnn_iface_string(vnn),
3892                                          ctdb_addr_to_str(&vnn->public_address),
3893                                          vnn->public_netmask_bits);
3894                         if (vnn->iface != NULL) {
3895                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3896                         }
3897                         if (ret != 0) {
3898                                 return -1;
3899                         }
3900                         return 0;
3901                 }
3902         }
3903
3904         return -1;
3905 }
3906
3907
/* state carried from ctdb_control_ipreallocated() to its completion
   callback */
struct ipreallocated_callback_state {
	/* the control request that gets the asynchronous reply */
	struct ctdb_req_control *c;
};
3911
3912 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3913                                         int status, void *p)
3914 {
3915         struct ipreallocated_callback_state *state =
3916                 talloc_get_type(p, struct ipreallocated_callback_state);
3917
3918         if (status != 0) {
3919                 DEBUG(DEBUG_ERR,
3920                       (" \"ipreallocated\" event script failed (status %d)\n",
3921                        status));
3922                 if (status == -ETIME) {
3923                         ctdb_ban_self(ctdb);
3924                 }
3925         }
3926
3927         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3928         talloc_free(state);
3929 }
3930
3931 /* A control to run the ipreallocated event */
3932 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3933                                    struct ctdb_req_control *c,
3934                                    bool *async_reply)
3935 {
3936         int ret;
3937         struct ipreallocated_callback_state *state;
3938
3939         state = talloc(ctdb, struct ipreallocated_callback_state);
3940         CTDB_NO_MEMORY(ctdb, state);
3941
3942         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3943
3944         ret = ctdb_event_script_callback(ctdb, state,
3945                                          ctdb_ipreallocated_callback, state,
3946                                          false, CTDB_EVENT_IPREALLOCATED,
3947                                          "%s", "");
3948
3949         if (ret != 0) {
3950                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3951                 talloc_free(state);
3952                 return -1;
3953         }
3954
3955         /* tell the control that we will be reply asynchronously */
3956         state->c    = talloc_steal(state, c);
3957         *async_reply = true;
3958
3959         return 0;
3960 }
3961
3962
3963 /* This function is called from the recovery daemon to verify that a remote
3964    node has the expected ip allocation.
3965    This is verified against ctdb->ip_tree
3966 */
3967 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3968 {
3969         struct ctdb_public_ip_list *tmp_ip; 
3970         int i;
3971
3972         if (ctdb->ip_tree == NULL) {
3973                 /* dont know the expected allocation yet, assume remote node
3974                    is correct. */
3975                 return 0;
3976         }
3977
3978         if (ips == NULL) {
3979                 return 0;
3980         }
3981
3982         for (i=0; i<ips->num; i++) {
3983                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3984                 if (tmp_ip == NULL) {
3985                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3986                         return -1;
3987                 }
3988
3989                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3990                         continue;
3991                 }
3992
3993                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3994                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3995                         return -1;
3996                 }
3997         }
3998
3999         return 0;
4000 }
4001
4002 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4003 {
4004         struct ctdb_public_ip_list *tmp_ip; 
4005
4006         if (ctdb->ip_tree == NULL) {
4007                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4008                 return -1;
4009         }
4010
4011         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4012         if (tmp_ip == NULL) {
4013                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4014                 return -1;
4015         }
4016
4017         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4018         tmp_ip->pnn = ip->pnn;
4019
4020         return 0;
4021 }
4022
4023
/*
  State of one in-flight "reload public ips" request.
 */
struct ctdb_reloadips_handle {
        /* daemon context the request belongs to */
        struct ctdb_context *ctdb;
        /* pending control request; answered from the destructor */
        struct ctdb_req_control *c;
        /* status sent in the reply: -1 until the child has reported,
           then 0 for success or 1 for failure */
        int status;
        /* pipe from the child (writes fd[1]) to the parent (reads fd[0]) */
        int fd[2];
        /* pid of the forked child doing the reload */
        pid_t child;
        /* read event on fd[0] */
        struct fd_event *fde;
};
4032
4033 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4034 {
4035         if (h == h->ctdb->reload_ips) {
4036                 h->ctdb->reload_ips = NULL;
4037         }
4038         if (h->c != NULL) {
4039                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4040                 h->c = NULL;
4041         }
4042         ctdb_kill(h->ctdb, h->child, SIGKILL);
4043         return 0;
4044 }
4045
4046 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4047                                 struct timed_event *te,
4048                                 struct timeval t, void *private_data)
4049 {
4050         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4051
4052         talloc_free(h);
4053 }       
4054
4055 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4056                              uint16_t flags, void *private_data)
4057 {
4058         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4059
4060         char res;
4061         int ret;
4062
4063         ret = read(h->fd[0], &res, 1);
4064         if (ret < 1 || res != 0) {
4065                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4066                 res = 1;
4067         }
4068         h->status = res;
4069
4070         talloc_free(h);
4071 }
4072
4073 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4074 {
4075         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4076         struct ctdb_all_public_ips *ips;
4077         struct ctdb_vnn *vnn;
4078         int i, ret;
4079
4080         /* read the ip allocation from the local node */
4081         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4082         if (ret != 0) {
4083                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4084                 talloc_free(mem_ctx);
4085                 return -1;
4086         }
4087
4088         /* re-read the public ips file */
4089         ctdb->vnn = NULL;
4090         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4091                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4092                 talloc_free(mem_ctx);
4093                 return -1;
4094         }               
4095
4096
4097         /* check the previous list of ips and scan for ips that have been
4098            dropped.
4099          */
4100         for (i = 0; i < ips->num; i++) {
4101                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4102                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4103                                 break;
4104                         }
4105                 }
4106
4107                 /* we need to delete this ip, no longer available on this node */
4108                 if (vnn == NULL) {
4109                         struct ctdb_control_ip_iface pub;
4110
4111                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4112                         pub.addr  = ips->ips[i].addr;
4113                         pub.mask  = 0;
4114                         pub.len   = 0;
4115
4116                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4117                         if (ret != 0) {
4118                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4119                                 return -1;
4120                         }
4121                 }
4122         }
4123
4124
4125         /* loop over all new ones and check the ones we need to add */
4126         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4127                 for (i = 0; i < ips->num; i++) {
4128                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4129                                 break;
4130                         }
4131                 }
4132                 if (i == ips->num) {
4133                         struct ctdb_control_ip_iface pub;
4134                         const char *ifaces = NULL;
4135                         int iface = 0;
4136
4137                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4138
4139                         pub.addr  = vnn->public_address;
4140                         pub.mask  = vnn->public_netmask_bits;
4141
4142
4143                         ifaces = vnn->ifaces[0];
4144                         iface = 1;
4145                         while (vnn->ifaces[iface] != NULL) {
4146                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4147                                 iface++;
4148                         }
4149                         pub.len   = strlen(ifaces)+1;
4150                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4151
4152                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4153                         if (ret != 0) {
4154                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4155                                 return -1;
4156                         }
4157                 }
4158         }
4159
4160         return 0;
4161 }
4162
4163 /* This control is sent to force the node to re-read the public addresses file
4164    and drop any addresses we should nnot longer host, and add new addresses
4165    that we are now able to host
4166 */
4167 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4168 {
4169         struct ctdb_reloadips_handle *h;
4170         pid_t parent = getpid();
4171
4172         if (ctdb->reload_ips != NULL) {
4173                 talloc_free(ctdb->reload_ips);
4174                 ctdb->reload_ips = NULL;
4175         }
4176
4177         h = talloc(ctdb, struct ctdb_reloadips_handle);
4178         CTDB_NO_MEMORY(ctdb, h);
4179         h->ctdb     = ctdb;
4180         h->c        = NULL;
4181         h->status   = -1;
4182         
4183         if (pipe(h->fd) == -1) {
4184                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4185                 talloc_free(h);
4186                 return -1;
4187         }
4188
4189         h->child = ctdb_fork(ctdb);
4190         if (h->child == (pid_t)-1) {
4191                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4192                 close(h->fd[0]);
4193                 close(h->fd[1]);
4194                 talloc_free(h);
4195                 return -1;
4196         }
4197
4198         /* child process */
4199         if (h->child == 0) {
4200                 signed char res = 0;
4201
4202                 close(h->fd[0]);
4203                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4204
4205                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4206                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4207                         res = -1;
4208                 } else {
4209                         res = ctdb_reloadips_child(ctdb);
4210                         if (res != 0) {
4211                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4212                         }
4213                 }
4214
4215                 write(h->fd[1], &res, 1);
4216                 /* make sure we die when our parent dies */
4217                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4218                         sleep(5);
4219                 }
4220                 _exit(0);
4221         }
4222
4223         h->c             = talloc_steal(h, c);
4224
4225         close(h->fd[1]);
4226         set_close_on_exec(h->fd[0]);
4227
4228         talloc_set_destructor(h, ctdb_reloadips_destructor);
4229
4230
4231         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4232                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4233                         (void *)h);
4234         tevent_fd_set_auto_close(h->fde);
4235
4236         event_add_timed(ctdb->ev, h,
4237                         timeval_current_ofs(120, 0),
4238                         ctdb_reloadips_timeout_event, h);
4239
4240         /* we reply later */
4241         *async_reply = true;
4242         return 0;
4243 }