recoverd: ctdb_takeover_run() uses CTDB_CONTROL_IPREALLOCATED
[kai/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value built from the takeover_timeout tunable.  The expansion
 * refers to a variable called "ctdb", so one must be in scope wherever
 * the macro is used.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* First argument passed to timeval_current_ofs() when re-arming the
 * gratuitous ARP timer (presumably whole seconds - confirm). */
#define CTDB_ARP_INTERVAL 1
/* The ARP/tickle announcement is stopped once it has run this many times. */
#define CTDB_ARP_REPEAT   3
35
/*
  one local network interface that public addresses can be hosted on
 */
struct ctdb_iface {
	/* linkage for the list headed by ctdb->ifaces */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name; lookups compare it with strcmp() */
	const char *name;
	/* only interfaces with link_up set are picked for new addresses */
	bool link_up;
	/* how many vnns currently have this interface assigned */
	uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
278 struct ctdb_takeover_arp {
279         struct ctdb_context *ctdb;
280         uint32_t count;
281         ctdb_sock_addr addr;
282         struct ctdb_tcp_array *tcparray;
283         struct ctdb_vnn *vnn;
284 };
285
286
287 /*
288   lists of tcp endpoints
289  */
290 struct ctdb_tcp_list {
291         struct ctdb_tcp_list *prev, *next;
292         struct ctdb_tcp_connection connection;
293 };
294
295 /*
296   list of clients to kill on IP release
297  */
298 struct ctdb_client_ip {
299         struct ctdb_client_ip *prev, *next;
300         struct ctdb_context *ctdb;
301         ctdb_sock_addr addr;
302         uint32_t client_id;
303 };
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* control request to reply to when the event script finishes */
	struct ctdb_req_control *c;
	/* vnn whose address is being taken over */
	struct ctdb_vnn *vnn;
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* control request to reply to when the event script finishes */
	struct ctdb_req_control *c;
	/* interface the address was on before the move; restored if the
	 * event script fails */
	struct ctdb_iface *old;
	/* vnn whose address is being moved */
	struct ctdb_vnn *vnn;
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Check if this is a public ip known to the node, i.e. can that
1303    node takeover this ip ?
1304 */
1305 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                 struct ctdb_public_ip_list *ip)
1307 {
1308         struct ctdb_all_public_ips *public_ips;
1309         int i;
1310
1311         public_ips = ctdb->nodes[pnn]->available_public_ips;
1312
1313         if (public_ips == NULL) {
1314                 return -1;
1315         }
1316
1317         for (i=0;i<public_ips->num;i++) {
1318                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1319                         /* yes, this node can serve this public ip */
1320                         return 0;
1321                 }
1322         }
1323
1324         return -1;
1325 }
1326
1327
1328 /* search the node lists list for a node to takeover this ip.
1329    pick the node that currently are serving the least number of ips
1330    so that the ips get spread out evenly.
1331 */
1332 static int find_takeover_node(struct ctdb_context *ctdb, 
1333                 struct ctdb_node_map *nodemap, uint32_t mask, 
1334                 struct ctdb_public_ip_list *ip,
1335                 struct ctdb_public_ip_list *all_ips)
1336 {
1337         int pnn, min=0, num;
1338         int i;
1339
1340         pnn    = -1;
1341         for (i=0;i<nodemap->num;i++) {
1342                 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1343                         /* This node is not allowed to takeover any addresses
1344                         */
1345                         continue;
1346                 }
1347
1348                 if (nodemap->nodes[i].flags & mask) {
1349                         /* This node is not healty and can not be used to serve
1350                            a public address 
1351                         */
1352                         continue;
1353                 }
1354
1355                 /* verify that this node can serve this ip */
1356                 if (can_node_serve_ip(ctdb, i, ip)) {
1357                         /* no it couldnt   so skip to the next node */
1358                         continue;
1359                 }
1360
1361                 num = node_ip_coverage(ctdb, i, all_ips);
1362                 /* was this the first node we checked ? */
1363                 if (pnn == -1) {
1364                         pnn = i;
1365                         min  = num;
1366                 } else {
1367                         if (num < min) {
1368                                 pnn = i;
1369                                 min  = num;
1370                         }
1371                 }
1372         }       
1373         if (pnn == -1) {
1374                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1375                         ctdb_addr_to_str(&ip->addr)));
1376
1377                 return -1;
1378         }
1379
1380         ip->pnn = pnn;
1381         return 0;
1382 }
1383
1384 #define IP_KEYLEN       4
1385 static uint32_t *ip_key(ctdb_sock_addr *ip)
1386 {
1387         static uint32_t key[IP_KEYLEN];
1388
1389         bzero(key, sizeof(key));
1390
1391         switch (ip->sa.sa_family) {
1392         case AF_INET:
1393                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1394                 break;
1395         case AF_INET6: {
1396                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1397                 key[0]  = htonl(s6_a32[0]);
1398                 key[1]  = htonl(s6_a32[1]);
1399                 key[2]  = htonl(s6_a32[2]);
1400                 key[3]  = htonl(s6_a32[3]);
1401                 break;
1402         }
1403         default:
1404                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1405                 return key;
1406         }
1407
1408         return key;
1409 }
1410
1411 static void *add_ip_callback(void *parm, void *data)
1412 {
1413         struct ctdb_public_ip_list *this_ip = parm; 
1414         struct ctdb_public_ip_list *prev_ip = data; 
1415
1416         if (prev_ip == NULL) {
1417                 return parm;
1418         }
1419         if (this_ip->pnn == -1) {
1420                 this_ip->pnn = prev_ip->pnn;
1421         }
1422
1423         return parm;
1424 }
1425
1426 static int getips_count_callback(void *param, void *data)
1427 {
1428         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1429         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1430
1431         new_ip->next = *ip_list;
1432         *ip_list     = new_ip;
1433         return 0;
1434 }
1435
1436 static struct ctdb_public_ip_list *
1437 create_merged_ip_list(struct ctdb_context *ctdb)
1438 {
1439         int i, j;
1440         struct ctdb_public_ip_list *ip_list;
1441         struct ctdb_all_public_ips *public_ips;
1442
1443         if (ctdb->ip_tree != NULL) {
1444                 talloc_free(ctdb->ip_tree);
1445                 ctdb->ip_tree = NULL;
1446         }
1447         ctdb->ip_tree = trbt_create(ctdb, 0);
1448
1449         for (i=0;i<ctdb->num_nodes;i++) {
1450                 public_ips = ctdb->nodes[i]->known_public_ips;
1451
1452                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1453                         continue;
1454                 }
1455
1456                 /* there were no public ips for this node */
1457                 if (public_ips == NULL) {
1458                         continue;
1459                 }               
1460
1461                 for (j=0;j<public_ips->num;j++) {
1462                         struct ctdb_public_ip_list *tmp_ip; 
1463
1464                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1465                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1466                         /* Do not use information about IP addresses hosted
1467                          * on other nodes, it may not be accurate */
1468                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1469                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1470                         } else {
1471                                 tmp_ip->pnn = -1;
1472                         }
1473                         tmp_ip->addr = public_ips->ips[j].addr;
1474                         tmp_ip->next = NULL;
1475
1476                         trbt_insertarray32_callback(ctdb->ip_tree,
1477                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1478                                 add_ip_callback,
1479                                 tmp_ip);
1480                 }
1481         }
1482
1483         ip_list = NULL;
1484         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1485
1486         return ip_list;
1487 }
1488
1489 /* 
1490  * This is the length of the longtest common prefix between the IPs.
1491  * It is calculated by XOR-ing the 2 IPs together and counting the
1492  * number of leading zeroes.  The implementation means that all
1493  * addresses end up being 128 bits long.
1494  *
1495  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1496  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1497  * lots of nodes and IP addresses?
1498  */
1499 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1500 {
1501         uint32_t ip1_k[IP_KEYLEN];
1502         uint32_t *t;
1503         int i;
1504         uint32_t x;
1505
1506         uint32_t distance = 0;
1507
1508         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1509         t = ip_key(ip2);
1510         for (i=0; i<IP_KEYLEN; i++) {
1511                 x = ip1_k[i] ^ t[i];
1512                 if (x == 0) {
1513                         distance += 32;
1514                 } else {
1515                         /* Count number of leading zeroes. 
1516                          * FIXME? This could be optimised...
1517                          */
1518                         while ((x & (1 << 31)) == 0) {
1519                                 x <<= 1;
1520                                 distance += 1;
1521                         }
1522                 }
1523         }
1524
1525         return distance;
1526 }
1527
1528 /* Calculate the IP distance for the given IP relative to IPs on the
1529    given node.  The ips argument is generally the all_ips variable
1530    used in the main part of the algorithm.
1531  */
1532 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1533                                   struct ctdb_public_ip_list *ips,
1534                                   int pnn)
1535 {
1536         struct ctdb_public_ip_list *t;
1537         uint32_t d;
1538
1539         uint32_t sum = 0;
1540
1541         for (t=ips; t != NULL; t=t->next) {
1542                 if (t->pnn != pnn) {
1543                         continue;
1544                 }
1545
1546                 /* Optimisation: We never calculate the distance
1547                  * between an address and itself.  This allows us to
1548                  * calculate the effect of removing an address from a
1549                  * node by simply calculating the distance between
1550                  * that address and all of the exitsing addresses.
1551                  * Moreover, we assume that we're only ever dealing
1552                  * with addresses from all_ips so we can identify an
1553                  * address via a pointer rather than doing a more
1554                  * expensive address comparison. */
1555                 if (&(t->addr) == ip) {
1556                         continue;
1557                 }
1558
1559                 d = ip_distance(ip, &(t->addr));
1560                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1561         }
1562
1563         return sum;
1564 }
1565
1566 /* Return the LCP2 imbalance metric for addresses currently assigned
1567    to the given node.
1568  */
1569 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1570 {
1571         struct ctdb_public_ip_list *t;
1572
1573         uint32_t imbalance = 0;
1574
1575         for (t=all_ips; t!=NULL; t=t->next) {
1576                 if (t->pnn != pnn) {
1577                         continue;
1578                 }
1579                 /* Pass the rest of the IPs rather than the whole
1580                    all_ips input list.
1581                 */
1582                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1583         }
1584
1585         return imbalance;
1586 }
1587
1588 /* Allocate any unassigned IPs just by looping through the IPs and
1589  * finding the best node for each.
1590  */
1591 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1592                                       struct ctdb_node_map *nodemap,
1593                                       uint32_t mask,
1594                                       struct ctdb_public_ip_list *all_ips)
1595 {
1596         struct ctdb_public_ip_list *tmp_ip;
1597
1598         /* loop over all ip's and find a physical node to cover for 
1599            each unassigned ip.
1600         */
1601         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1602                 if (tmp_ip->pnn == -1) {
1603                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1604                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1605                                         ctdb_addr_to_str(&tmp_ip->addr)));
1606                         }
1607                 }
1608         }
1609 }
1610
1611 /* Basic non-deterministic rebalancing algorithm.
1612  */
1613 static void basic_failback(struct ctdb_context *ctdb,
1614                            struct ctdb_node_map *nodemap,
1615                            uint32_t mask,
1616                            struct ctdb_public_ip_list *all_ips,
1617                            int num_ips)
1618 {
1619         int i;
1620         int maxnode, maxnum, minnode, minnum, num, retries;
1621         struct ctdb_public_ip_list *tmp_ip;
1622
1623         retries = 0;
1624
1625 try_again:
1626         maxnum=0;
1627         minnum=0;
1628
1629         /* for each ip address, loop over all nodes that can serve
1630            this ip and make sure that the difference between the node
1631            serving the most and the node serving the least ip's are
1632            not greater than 1.
1633         */
1634         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1635                 if (tmp_ip->pnn == -1) {
1636                         continue;
1637                 }
1638
1639                 /* Get the highest and lowest number of ips's served by any 
1640                    valid node which can serve this ip.
1641                 */
1642                 maxnode = -1;
1643                 minnode = -1;
1644                 for (i=0;i<nodemap->num;i++) {
1645                         if (nodemap->nodes[i].flags & mask) {
1646                                 continue;
1647                         }
1648
1649                         /* Only check nodes that are allowed to takeover an ip */
1650                         if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1651                                 continue;
1652                         }
1653
1654                         /* only check nodes that can actually serve this ip */
1655                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1656                                 /* no it couldnt   so skip to the next node */
1657                                 continue;
1658                         }
1659
1660                         num = node_ip_coverage(ctdb, i, all_ips);
1661                         if (maxnode == -1) {
1662                                 maxnode = i;
1663                                 maxnum  = num;
1664                         } else {
1665                                 if (num > maxnum) {
1666                                         maxnode = i;
1667                                         maxnum  = num;
1668                                 }
1669                         }
1670                         if (minnode == -1) {
1671                                 minnode = i;
1672                                 minnum  = num;
1673                         } else {
1674                                 if (num < minnum) {
1675                                         minnode = i;
1676                                         minnum  = num;
1677                                 }
1678                         }
1679                 }
1680                 if (maxnode == -1) {
1681                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1682                                 ctdb_addr_to_str(&tmp_ip->addr)));
1683
1684                         continue;
1685                 }
1686
1687                 /* if the spread between the smallest and largest coverage by
1688                    a node is >=2 we steal one of the ips from the node with
1689                    most coverage to even things out a bit.
1690                    try to do this a limited number of times since we dont
1691                    want to spend too much time balancing the ip coverage.
1692                 */
1693                 if ( (maxnum > minnum+1)
1694                      && (retries < (num_ips + 5)) ){
1695                         struct ctdb_public_ip_list *tmp;
1696
1697                         /* Reassign one of maxnode's VNNs */
1698                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1699                                 if (tmp->pnn == maxnode) {
1700                                         (void)find_takeover_node(ctdb, nodemap, mask, tmp, all_ips);
1701                                         retries++;
1702                                         goto try_again;;
1703                                 }
1704                         }
1705                 }
1706         }
1707 }
1708
1709 struct ctdb_rebalancenodes {
1710         struct ctdb_rebalancenodes *next;
1711         uint32_t pnn;
1712 };
1713 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1714
1715
1716 /* set this flag to force the node to be rebalanced even if it just didnt
1717    become healthy again.
1718 */
1719 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1720 {
1721         struct ctdb_rebalancenodes *rebalance;
1722
1723         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1724                 if (rebalance->pnn == pnn) {
1725                         return;
1726                 }
1727         }
1728
1729         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1730         rebalance->pnn = pnn;
1731         rebalance->next = force_rebalance_list;
1732         force_rebalance_list = rebalance;
1733 }
1734
1735 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1736  * that we can unit test it.
1737  */
1738 static void lcp2_init(struct ctdb_context * tmp_ctx,
1739                struct ctdb_node_map * nodemap,
1740                uint32_t mask,
1741                struct ctdb_public_ip_list *all_ips,
1742                uint32_t **lcp2_imbalances,
1743                bool **newly_healthy)
1744 {
1745         int i;
1746         struct ctdb_public_ip_list *tmp_ip;
1747
1748         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1750         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1751         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752
1753         for (i=0;i<nodemap->num;i++) {
1754                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1755                 /* First step: is the node "healthy"? */
1756                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1757         }
1758
1759         /* 2nd step: if a ndoe has IPs assigned then it must have been
1760          * healthy before, so we remove it from consideration... */
1761         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1762                 if (tmp_ip->pnn != -1) {
1763                         (*newly_healthy)[tmp_ip->pnn] = false;
1764                 }
1765         }
1766
1767         /* 3rd step: if a node is forced to re-balance then
1768            we allow failback onto the node */
1769         while (force_rebalance_list != NULL) {
1770                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1771
1772                 if (force_rebalance_list->pnn <= nodemap->num) {
1773                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1774                 }
1775
1776                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1777                 talloc_free(force_rebalance_list);
1778                 force_rebalance_list = next;
1779         }
1780 }
1781
1782 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1783  * the IP/node combination that will cost the least.
1784  */
1785 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1786                               struct ctdb_node_map *nodemap,
1787                               uint32_t mask,
1788                               struct ctdb_public_ip_list *all_ips,
1789                               uint32_t *lcp2_imbalances)
1790 {
1791         struct ctdb_public_ip_list *tmp_ip;
1792         int dstnode;
1793
1794         int minnode;
1795         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1796         struct ctdb_public_ip_list *minip;
1797
1798         bool should_loop = true;
1799         bool have_unassigned = true;
1800
1801         while (have_unassigned && should_loop) {
1802                 should_loop = false;
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1806
1807                 minnode = -1;
1808                 mindsum = 0;
1809                 minip = NULL;
1810
1811                 /* loop over each unassigned ip. */
1812                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1813                         if (tmp_ip->pnn != -1) {
1814                                 continue;
1815                         }
1816
1817                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1818                                 /* Only check nodes that are allowed to takeover an ip */
1819                                 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1820                                         continue;
1821                                 }
1822
1823                                 /* only check nodes that can actually serve this ip */
1824                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1825                                         /* no it couldnt   so skip to the next node */
1826                                         continue;
1827                                 }
1828                                 if (nodemap->nodes[dstnode].flags & mask) {
1829                                         continue;
1830                                 }
1831
1832                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1833                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1834                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1835                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1836                                                    dstnode,
1837                                                    dstimbl - lcp2_imbalances[dstnode]));
1838
1839
1840                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1841                                         minnode = dstnode;
1842                                         minimbl = dstimbl;
1843                                         mindsum = dstdsum;
1844                                         minip = tmp_ip;
1845                                         should_loop = true;
1846                                 }
1847                         }
1848                 }
1849
1850                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1851
1852                 /* If we found one then assign it to the given node. */
1853                 if (minnode != -1) {
1854                         minip->pnn = minnode;
1855                         lcp2_imbalances[minnode] = minimbl;
1856                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1857                                           ctdb_addr_to_str(&(minip->addr)),
1858                                           minnode,
1859                                           mindsum));
1860                 }
1861
1862                 /* There might be a better way but at least this is clear. */
1863                 have_unassigned = false;
1864                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1865                         if (tmp_ip->pnn == -1) {
1866                                 have_unassigned = true;
1867                         }
1868                 }
1869         }
1870
1871         /* We know if we have an unassigned addresses so we might as
1872          * well optimise.
1873          */
1874         if (have_unassigned) {
1875                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876                         if (tmp_ip->pnn == -1) {
1877                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1878                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1879                         }
1880                 }
1881         }
1882 }
1883
1884 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1885  * to move IPs from, determines the best IP/destination node
1886  * combination to move from the source node.
1887  */
1888 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1889                                     struct ctdb_node_map *nodemap,
1890                                     struct ctdb_public_ip_list *all_ips,
1891                                     int srcnode,
1892                                     uint32_t candimbl,
1893                                     uint32_t *lcp2_imbalances,
1894                                     bool *newly_healthy)
1895 {
1896         int dstnode, mindstnode;
1897         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1898         uint32_t minsrcimbl, mindstimbl;
1899         struct ctdb_public_ip_list *minip;
1900         struct ctdb_public_ip_list *tmp_ip;
1901
1902         /* Find an IP and destination node that best reduces imbalance. */
1903         minip = NULL;
1904         minsrcimbl = 0;
1905         mindstnode = -1;
1906         mindstimbl = 0;
1907
1908         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1909         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1910
1911         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1912                 /* Only consider addresses on srcnode. */
1913                 if (tmp_ip->pnn != srcnode) {
1914                         continue;
1915                 }
1916
1917                 /* What is this IP address costing the source node? */
1918                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1919                 srcimbl = candimbl - srcdsum;
1920
1921                 /* Consider this IP address would cost each potential
1922                  * destination node.  Destination nodes are limited to
1923                  * those that are newly healthy, since we don't want
1924                  * to do gratuitous failover of IPs just to make minor
1925                  * balance improvements.
1926                  */
1927                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1928                         if (! newly_healthy[dstnode]) {
1929                                 continue;
1930                         }
1931
1932                         /* Only check nodes that are allowed to takeover an ip */
1933                         if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1934                                 continue;
1935                         }
1936
1937                         /* only check nodes that can actually serve this ip */
1938                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1939                                 /* no it couldnt   so skip to the next node */
1940                                 continue;
1941                         }
1942
1943                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1944                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1945                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1946                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1947                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1948                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1949
1950                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1951                             ((mindstnode == -1) ||                              \
1952                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1953
1954                                 minip = tmp_ip;
1955                                 minsrcimbl = srcimbl;
1956                                 mindstnode = dstnode;
1957                                 mindstimbl = dstimbl;
1958                         }
1959                 }
1960         }
1961         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1962
1963         if (mindstnode != -1) {
1964                 /* We found a move that makes things better... */
1965                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1966                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1967                                   ctdb_addr_to_str(&(minip->addr)),
1968                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1969
1970
1971                 lcp2_imbalances[srcnode] = srcimbl;
1972                 lcp2_imbalances[mindstnode] = mindstimbl;
1973                 minip->pnn = mindstnode;
1974
1975                 return true;
1976         }
1977
1978         return false;
1979         
1980 }
1981
/* Pairs a node number with its imbalance so the two can be sorted
 * together.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator over struct lcp2_imbalance_pnn that orders
 * entries by DESCENDING imbalance: returns -1 when a's imbalance is
 * larger, 1 when it is smaller and 0 when both are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        /* Each comparison yields 0 or 1, so the difference is the
         * required -1 / 0 / 1 without any risk of unsigned wraparound.
         */
        return (lhs->imbalance < rhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
2000
2001 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2002  * node with the highest LCP2 imbalance, and then determines the best
2003  * IP/destination node combination to move from the source node.
2004  */
2005 static void lcp2_failback(struct ctdb_context *ctdb,
2006                           struct ctdb_node_map *nodemap,
2007                           uint32_t mask,
2008                           struct ctdb_public_ip_list *all_ips,
2009                           uint32_t *lcp2_imbalances,
2010                           bool *newly_healthy)
2011 {
2012         int i, num_newly_healthy;
2013         struct lcp2_imbalance_pnn * lips;
2014         bool again;
2015
2016 try_again:
2017
2018         /* It is only worth continuing if we have suitable target
2019          * nodes to transfer IPs to.  This check is much cheaper than
2020          * continuing on...
2021          */
2022         num_newly_healthy = 0;
2023         for (i = 0; i < nodemap->num; i++) {
2024                 if (newly_healthy[i]) {
2025                         num_newly_healthy++;
2026                 }
2027         }
2028         if (num_newly_healthy == 0) {
2029                 return;
2030         }
2031
2032         /* Put the imbalances and nodes into an array, sort them and
2033          * iterate through candidates.  Usually the 1st one will be
2034          * used, so this doesn't cost much...
2035          */
2036         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2037         for (i = 0; i < nodemap->num; i++) {
2038                 lips[i].imbalance = lcp2_imbalances[i];
2039                 lips[i].pnn = i;
2040         }
2041         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2042               lcp2_cmp_imbalance_pnn);
2043
2044         again = false;
2045         for (i = 0; i < nodemap->num; i++) {
2046                 /* This means that all nodes had 0 or 1 addresses, so
2047                  * can't be imbalanced.
2048                  */
2049                 if (lips[i].imbalance == 0) {
2050                         break;
2051                 }
2052
2053                 if (lcp2_failback_candidate(ctdb,
2054                                             nodemap,
2055                                             all_ips,
2056                                             lips[i].pnn,
2057                                             lips[i].imbalance,
2058                                             lcp2_imbalances,
2059                                             newly_healthy)) {
2060                         again = true;
2061                         break;
2062                 }
2063         }
2064
2065         talloc_free(lips);
2066         if (again) {
2067                 goto try_again;
2068         }
2069 }
2070
2071 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2072                                     struct ctdb_node_map *nodemap,
2073                                     struct ctdb_public_ip_list *all_ips,
2074                                     uint32_t mask)
2075 {
2076         struct ctdb_public_ip_list *tmp_ip;
2077
2078         /* mark all public addresses with a masked node as being served by
2079            node -1
2080         */
2081         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2082                 if (tmp_ip->pnn == -1) {
2083                         continue;
2084                 }
2085                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
2086                         tmp_ip->pnn = -1;
2087                 }
2088         }
2089
2090         /* verify that the assigned nodes can serve that public ip
2091            and set it to -1 if not
2092         */
2093         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2094                 if (tmp_ip->pnn == -1) {
2095                         continue;
2096                 }
2097                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2098                         /* this node can not serve this ip. */
2099                         tmp_ip->pnn = -1;
2100                 }
2101         }
2102 }
2103
2104 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2105                                        struct ctdb_node_map *nodemap,
2106                                        struct ctdb_public_ip_list *all_ips,
2107                                        uint32_t mask)
2108 {
2109         struct ctdb_public_ip_list *tmp_ip;
2110         int i;
2111
2112         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2113        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2114         *  always be allocated the same way for a specific set of
2115         *  available/unavailable nodes.
2116         */
2117
2118         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2119                 tmp_ip->pnn = i%nodemap->num;
2120         }
2121
2122         /* IP failback doesn't make sense with deterministic
2123          * IPs, since the modulo step above implicitly fails
2124          * back IPs to their "home" node.
2125          */
2126         if (1 == ctdb->tunable.no_ip_failback) {
2127                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2128         }
2129
2130         unassign_unsuitable_ips(ctdb, nodemap, all_ips, mask);
2131
2132         basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2133
2134         /* No failback here! */
2135 }
2136
2137 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2138                                           struct ctdb_node_map *nodemap,
2139                                           struct ctdb_public_ip_list *all_ips,
2140                                           uint32_t mask)
2141 {
2142         /* This should be pushed down into basic_failback. */
2143         struct ctdb_public_ip_list *tmp_ip;
2144         int num_ips = 0;
2145         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2146                 num_ips++;
2147         }
2148
2149         unassign_unsuitable_ips(ctdb, nodemap, all_ips, mask);
2150
2151         basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2152
2153         /* If we don't want IPs to fail back then don't rebalance IPs. */
2154         if (1 == ctdb->tunable.no_ip_failback) {
2155                 return;
2156         }
2157
2158         /* Now, try to make sure the ip adresses are evenly distributed
2159            across the nodes.
2160         */
2161         basic_failback(ctdb, nodemap, mask, all_ips, num_ips);
2162 }
2163
2164 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2165                           struct ctdb_node_map *nodemap,
2166                           struct ctdb_public_ip_list *all_ips,
2167                           uint32_t mask)
2168 {
2169         uint32_t *lcp2_imbalances;
2170         bool *newly_healthy;
2171
2172         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2173
2174         unassign_unsuitable_ips(ctdb, nodemap, all_ips, mask);
2175
2176         lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2177
2178         lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2179
2180         /* If we don't want IPs to fail back then don't rebalance IPs. */
2181         if (1 == ctdb->tunable.no_ip_failback) {
2182                 goto finished;
2183         }
2184
2185         /* Now, try to make sure the ip adresses are evenly distributed
2186            across the nodes.
2187         */
2188         lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy);
2189
2190 finished:
2191         talloc_free(tmp_ctx);
2192 }
2193
2194 /* The calculation part of the IP allocation algorithm. */
2195 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2196                                    struct ctdb_node_map *nodemap,
2197                                    struct ctdb_public_ip_list **all_ips_p)
2198 {
2199         int i, num_healthy;
2200         uint32_t mask;
2201
2202         /* Count how many completely healthy nodes we have */
2203         num_healthy = 0;
2204         for (i=0;i<nodemap->num;i++) {
2205                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2206                         num_healthy++;
2207                 }
2208         }
2209
2210         /* If we have healthy nodes then we will only consider them
2211            for serving public addresses
2212         */
2213         mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
2214         if ((num_healthy == 0) &&
2215             (ctdb->tunable.no_ip_takeover_on_disabled == 0)) {
2216                 /* We didnt have any completely healthy nodes so
2217                    use "disabled" nodes as a fallback
2218                 */
2219                 mask = NODE_FLAGS_INACTIVE;
2220         }
2221
2222         /* since nodes only know about those public addresses that
2223            can be served by that particular node, no single node has
2224            a full list of all public addresses that exist in the cluster.
2225            Walk over all node structures and create a merged list of
2226            all public addresses that exist in the cluster.
2227
2228            keep the tree of ips around as ctdb->ip_tree
2229         */
2230         *all_ips_p = create_merged_ip_list(ctdb);
2231
2232         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2233                 ip_alloc_lcp2(ctdb, nodemap, *all_ips_p, mask);
2234         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2235                 ip_alloc_deterministic_ips(ctdb, nodemap, *all_ips_p, mask);
2236         } else {
2237                 ip_alloc_nondeterministic_ips(ctdb, nodemap, *all_ips_p, mask);
2238         }
2239
2240         /* at this point ->pnn is the node which will own each IP
2241            or -1 if there is no node that can cover this ip
2242         */
2243
2244         return;
2245 }
2246
2247 static void noiptakeover_cb(struct ctdb_context *ctdb, uint32_t pnn, int32_t res, TDB_DATA outdata, void *callback)
2248 {
2249         struct ctdb_node_map *nodemap = (struct ctdb_node_map *)callback;
2250
2251         if (res != 0) {
2252                 DEBUG(DEBUG_ERR,("Failure to read NoIPTakeover tunable from remote node %d\n", pnn));
2253                 return;
2254         }
2255
2256         if (outdata.dsize != sizeof(uint32_t)) {
2257                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading NoIPTakeover tunable from node %d. Expected %d bytes but received %d bytes\n", pnn, (int)sizeof(uint32_t), (int)outdata.dsize));
2258                 return;
2259         }
2260
2261         if (pnn >= nodemap->num) {
2262                 DEBUG(DEBUG_ERR,("Got NoIPTakeover reply from node %d but nodemap only has %d entries\n", pnn, nodemap->num));
2263                 return;
2264         }
2265
2266         if (*(uint32_t *)outdata.dptr != 0) {
2267                 nodemap->nodes[pnn].flags |= NODE_FLAGS_NOIPTAKEOVER;
2268         }
2269 }
2270
2271 /*
2272   make any IP alias changes for public addresses that are necessary 
2273  */
2274 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2275                       client_async_callback fail_callback, void *callback_data)
2276 {
2277         int i;
2278         struct ctdb_public_ip ip;
2279         struct ctdb_public_ipv4 ipv4;
2280         struct ctdb_control_get_tunable *t;
2281         uint32_t *nodes;
2282         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2283         TDB_DATA data;
2284         struct timeval timeout;
2285         struct client_async_data *async_data;
2286         struct ctdb_client_control_state *state;
2287         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2288         uint32_t disable_timeout;
2289
2290         /*
2291          * ip failover is completely disabled, just send out the 
2292          * ipreallocated event.
2293          */
2294         if (ctdb->tunable.disable_ip_failover != 0) {
2295                 goto ipreallocated;
2296         }
2297
2298
2299         /* assume all nodes do support failback */
2300         for (i=0;i<nodemap->num;i++) {
2301                 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
2302         }
2303         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen("NoIPTakeover") + 1;
2304         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2305         t = (struct ctdb_control_get_tunable *)data.dptr;
2306         t->length = strlen("NoIPTakeover")+1;
2307         memcpy(t->name, "NoIPTakeover", t->length);
2308         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2309         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2310                                       nodes, 0, TAKEOVER_TIMEOUT(),
2311                                       false, data,
2312                                       noiptakeover_cb, NULL,
2313                                       nodemap) != 0) {
2314                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get noiptakeover tunable failed\n"));
2315         }
2316         talloc_free(nodes);
2317         talloc_free(data.dptr);
2318
2319
2320         ZERO_STRUCT(ip);
2321
2322         /* Do the IP reassignment calculations */
2323         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2324
2325         /* The recovery daemon does regular sanity checks of the IPs.
2326          * However, sometimes it is overzealous and thinks changes are
2327          * required when they're already underway.  This stops the
2328          * checks for a while before we start moving IPs.
2329          */
2330         disable_timeout = ctdb->tunable.takeover_timeout;
2331         data.dptr  = (uint8_t*)&disable_timeout;
2332         data.dsize = sizeof(disable_timeout);
2333         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2334                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2335                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2336         }
2337
2338         /* now tell all nodes to delete any alias that they should not
2339            have.  This will be a NOOP on nodes that don't currently
2340            hold the given alias */
2341         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2342         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2343
2344         async_data->fail_callback = fail_callback;
2345         async_data->callback_data = callback_data;
2346
2347         for (i=0;i<nodemap->num;i++) {
2348                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2349                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2350                         continue;
2351                 }
2352
2353                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2354                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2355                                 /* This node should be serving this
2356                                    vnn so dont tell it to release the ip
2357                                 */
2358                                 continue;
2359                         }
2360                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2361                                 ipv4.pnn = tmp_ip->pnn;
2362                                 ipv4.sin = tmp_ip->addr.ip;
2363
2364                                 timeout = TAKEOVER_TIMEOUT();
2365                                 data.dsize = sizeof(ipv4);
2366                                 data.dptr  = (uint8_t *)&ipv4;
2367                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2368                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2369                                                 data, async_data,
2370                                                 &timeout, NULL);
2371                         } else {
2372                                 ip.pnn  = tmp_ip->pnn;
2373                                 ip.addr = tmp_ip->addr;
2374
2375                                 timeout = TAKEOVER_TIMEOUT();
2376                                 data.dsize = sizeof(ip);
2377                                 data.dptr  = (uint8_t *)&ip;
2378                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2379                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2380                                                 data, async_data,
2381                                                 &timeout, NULL);
2382                         }
2383
2384                         if (state == NULL) {
2385                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2386                                 talloc_free(tmp_ctx);
2387                                 return -1;
2388                         }
2389                 
2390                         ctdb_client_async_add(async_data, state);
2391                 }
2392         }
2393         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2394                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2395                 talloc_free(tmp_ctx);
2396                 return -1;
2397         }
2398         talloc_free(async_data);
2399
2400
2401         /* tell all nodes to get their own IPs */
2402         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2403         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2404
2405         async_data->fail_callback = fail_callback;
2406         async_data->callback_data = callback_data;
2407
2408         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2409                 if (tmp_ip->pnn == -1) {
2410                         /* this IP won't be taken over */
2411                         continue;
2412                 }
2413
2414                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2415                         ipv4.pnn = tmp_ip->pnn;
2416                         ipv4.sin = tmp_ip->addr.ip;
2417
2418                         timeout = TAKEOVER_TIMEOUT();
2419                         data.dsize = sizeof(ipv4);
2420                         data.dptr  = (uint8_t *)&ipv4;
2421                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2422                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2423                                         data, async_data,
2424                                         &timeout, NULL);
2425                 } else {
2426                         ip.pnn  = tmp_ip->pnn;
2427                         ip.addr = tmp_ip->addr;
2428
2429                         timeout = TAKEOVER_TIMEOUT();
2430                         data.dsize = sizeof(ip);
2431                         data.dptr  = (uint8_t *)&ip;
2432                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2433                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2434                                         data, async_data,
2435                                         &timeout, NULL);
2436                 }
2437                 if (state == NULL) {
2438                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2439                         talloc_free(tmp_ctx);
2440                         return -1;
2441                 }
2442                 
2443                 ctdb_client_async_add(async_data, state);
2444         }
2445         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2446                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2447                 talloc_free(tmp_ctx);
2448                 return -1;
2449         }
2450
2451 ipreallocated:
2452         /* 
2453          * Tell all nodes to run eventscripts to process the
2454          * "ipreallocated" event.  This can do a lot of things,
2455          * including restarting services to reconfigure them if public
2456          * IPs have moved.  Once upon a time this event only used to
2457          * update natwg.
2458          */
2459         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2460         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2461                                       nodes, 0, TAKEOVER_TIMEOUT(),
2462                                       false, tdb_null,
2463                                       NULL, fail_callback,
2464                                       callback_data) != 0) {
2465                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2466         }
2467
2468         talloc_free(tmp_ctx);
2469         return 0;
2470 }
2471
2472
2473 /*
2474   destroy a ctdb_client_ip structure
2475  */
2476 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2477 {
2478         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2479                 ctdb_addr_to_str(&ip->addr),
2480                 ntohs(ip->addr.ip.sin_port),
2481                 ip->client_id));
2482
2483         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2484         return 0;
2485 }
2486
2487 /*
2488   called by a client to inform us of a TCP connection that it is managing
2489   that should tickled with an ACK when IP takeover is done
2490   we handle both the old ipv4 style of packets as well as the new ipv4/6
2491   pdus.
2492  */
2493 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2494                                 TDB_DATA indata)
2495 {
2496         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2497         struct ctdb_control_tcp *old_addr = NULL;
2498         struct ctdb_control_tcp_addr new_addr;
2499         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2500         struct ctdb_tcp_list *tcp;
2501         struct ctdb_tcp_connection t;
2502         int ret;
2503         TDB_DATA data;
2504         struct ctdb_client_ip *ip;
2505         struct ctdb_vnn *vnn;
2506         ctdb_sock_addr addr;
2507
2508         switch (indata.dsize) {
2509         case sizeof(struct ctdb_control_tcp):
2510                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2511                 ZERO_STRUCT(new_addr);
2512                 tcp_sock = &new_addr;
2513                 tcp_sock->src.ip  = old_addr->src;
2514                 tcp_sock->dest.ip = old_addr->dest;
2515                 break;
2516         case sizeof(struct ctdb_control_tcp_addr):
2517                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2518                 break;
2519         default:
2520                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2521                                  "to ctdb_control_tcp_client. size was %d but "
2522                                  "only allowed sizes are %lu and %lu\n",
2523                                  (int)indata.dsize,
2524                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2525                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2526                 return -1;
2527         }
2528
2529         addr = tcp_sock->src;
2530         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2531         addr = tcp_sock->dest;
2532         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2533
2534         ZERO_STRUCT(addr);
2535         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2536         vnn = find_public_ip_vnn(ctdb, &addr);
2537         if (vnn == NULL) {
2538                 switch (addr.sa.sa_family) {
2539                 case AF_INET:
2540                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2541                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2542                                         ctdb_addr_to_str(&addr)));
2543                         }
2544                         break;
2545                 case AF_INET6:
2546                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2547                                 ctdb_addr_to_str(&addr)));
2548                         break;
2549                 default:
2550                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2551                 }
2552
2553                 return 0;
2554         }
2555
2556         if (vnn->pnn != ctdb->pnn) {
2557                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2558                         ctdb_addr_to_str(&addr),
2559                         client_id, client->pid));
2560                 /* failing this call will tell smbd to die */
2561                 return -1;
2562         }
2563
2564         ip = talloc(client, struct ctdb_client_ip);
2565         CTDB_NO_MEMORY(ctdb, ip);
2566
2567         ip->ctdb      = ctdb;
2568         ip->addr      = addr;
2569         ip->client_id = client_id;
2570         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2571         DLIST_ADD(ctdb->client_ip_list, ip);
2572
2573         tcp = talloc(client, struct ctdb_tcp_list);
2574         CTDB_NO_MEMORY(ctdb, tcp);
2575
2576         tcp->connection.src_addr = tcp_sock->src;
2577         tcp->connection.dst_addr = tcp_sock->dest;
2578
2579         DLIST_ADD(client->tcp_list, tcp);
2580
2581         t.src_addr = tcp_sock->src;
2582         t.dst_addr = tcp_sock->dest;
2583
2584         data.dptr = (uint8_t *)&t;
2585         data.dsize = sizeof(t);
2586
2587         switch (addr.sa.sa_family) {
2588         case AF_INET:
2589                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2590                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2591                         ctdb_addr_to_str(&tcp_sock->src),
2592                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2593                 break;
2594         case AF_INET6:
2595                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2596                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2597                         ctdb_addr_to_str(&tcp_sock->src),
2598                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2599                 break;
2600         default:
2601                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2602         }
2603
2604
2605         /* tell all nodes about this tcp connection */
2606         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2607                                        CTDB_CONTROL_TCP_ADD,
2608                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2609         if (ret != 0) {
2610                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2611                 return -1;
2612         }
2613
2614         return 0;
2615 }
2616
2617 /*
2618   find a tcp address on a list
2619  */
2620 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2621                                            struct ctdb_tcp_connection *tcp)
2622 {
2623         int i;
2624
2625         if (array == NULL) {
2626                 return NULL;
2627         }
2628
2629         for (i=0;i<array->num;i++) {
2630                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2631                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2632                         return &array->connections[i];
2633                 }
2634         }
2635         return NULL;
2636 }
2637
2638
2639
2640 /*
2641   called by a daemon to inform us of a TCP connection that one of its
2642   clients managing that should tickled with an ACK when IP takeover is
2643   done
2644  */
2645 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2646 {
2647         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2648         struct ctdb_tcp_array *tcparray;
2649         struct ctdb_tcp_connection tcp;
2650         struct ctdb_vnn *vnn;
2651
2652         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2653         if (vnn == NULL) {
2654                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2655                         ctdb_addr_to_str(&p->dst_addr)));
2656
2657                 return -1;
2658         }
2659
2660
2661         tcparray = vnn->tcp_array;
2662
2663         /* If this is the first tickle */
2664         if (tcparray == NULL) {
2665                 tcparray = talloc_size(ctdb->nodes, 
2666                         offsetof(struct ctdb_tcp_array, connections) +
2667                         sizeof(struct ctdb_tcp_connection) * 1);
2668                 CTDB_NO_MEMORY(ctdb, tcparray);
2669                 vnn->tcp_array = tcparray;
2670
2671                 tcparray->num = 0;
2672                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2673                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2674
2675                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2676                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2677                 tcparray->num++;
2678
2679                 if (tcp_update_needed) {
2680                         vnn->tcp_update_needed = true;
2681                 }
2682                 return 0;
2683         }
2684
2685
2686         /* Do we already have this tickle ?*/
2687         tcp.src_addr = p->src_addr;
2688         tcp.dst_addr = p->dst_addr;
2689         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2690                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2691                         ctdb_addr_to_str(&tcp.dst_addr),
2692                         ntohs(tcp.dst_addr.ip.sin_port),
2693                         vnn->pnn));
2694                 return 0;
2695         }
2696
2697         /* A new tickle, we must add it to the array */
2698         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2699                                         struct ctdb_tcp_connection,
2700                                         tcparray->num+1);
2701         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2702
2703         vnn->tcp_array = tcparray;
2704         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2705         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2706         tcparray->num++;
2707                                 
2708         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2709                 ctdb_addr_to_str(&tcp.dst_addr),
2710                 ntohs(tcp.dst_addr.ip.sin_port),
2711                 vnn->pnn));
2712
2713         if (tcp_update_needed) {
2714                 vnn->tcp_update_needed = true;
2715         }
2716
2717         return 0;
2718 }
2719
2720
2721 /*
2722   called by a daemon to inform us of a TCP connection that one of its
2723   clients managing that should tickled with an ACK when IP takeover is
2724   done
2725  */
2726 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2727 {
2728         struct ctdb_tcp_connection *tcpp;
2729         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2730
2731         if (vnn == NULL) {
2732                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2733                         ctdb_addr_to_str(&conn->dst_addr)));
2734                 return;
2735         }
2736
2737         /* if the array is empty we cant remove it
2738            and we dont need to do anything
2739          */
2740         if (vnn->tcp_array == NULL) {
2741                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2742                         ctdb_addr_to_str(&conn->dst_addr),
2743                         ntohs(conn->dst_addr.ip.sin_port)));
2744                 return;
2745         }
2746
2747
2748         /* See if we know this connection
2749            if we dont know this connection  then we dont need to do anything
2750          */
2751         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2752         if (tcpp == NULL) {
2753                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2754                         ctdb_addr_to_str(&conn->dst_addr),
2755                         ntohs(conn->dst_addr.ip.sin_port)));
2756                 return;
2757         }
2758
2759
2760         /* We need to remove this entry from the array.
2761            Instead of allocating a new array and copying data to it
2762            we cheat and just copy the last entry in the existing array
2763            to the entry that is to be removed and just shring the 
2764            ->num field
2765          */
2766         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2767         vnn->tcp_array->num--;
2768
2769         /* If we deleted the last entry we also need to remove the entire array
2770          */
2771         if (vnn->tcp_array->num == 0) {
2772                 talloc_free(vnn->tcp_array);
2773                 vnn->tcp_array = NULL;
2774         }               
2775
2776         vnn->tcp_update_needed = true;
2777
2778         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2779                 ctdb_addr_to_str(&conn->src_addr),
2780                 ntohs(conn->src_addr.ip.sin_port)));
2781 }
2782
2783
2784 /*
2785   called by a daemon to inform us of a TCP connection that one of its
2786   clients used are no longer needed in the tickle database
2787  */
2788 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2789 {
2790         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2791
2792         ctdb_remove_tcp_connection(ctdb, conn);
2793
2794         return 0;
2795 }
2796
2797
/*
  called when a daemon restarts - intended to send all tickles for all
  public addresses we are serving immediately to the new node.

  Not implemented yet: nothing is sent and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2807
2808
2809 /*
2810   called when a client structure goes away - hook to remove
2811   elements from the tcp_list in all daemons
2812  */
2813 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2814 {
2815         while (client->tcp_list) {
2816                 struct ctdb_tcp_list *tcp = client->tcp_list;
2817                 DLIST_REMOVE(client->tcp_list, tcp);
2818                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2819         }
2820 }
2821
2822
2823 /*
2824   release all IPs on shutdown
2825  */
2826 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2827 {
2828         struct ctdb_vnn *vnn;
2829
2830         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2831                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2832                         ctdb_vnn_unassign_iface(ctdb, vnn);
2833                         continue;
2834                 }
2835                 if (!vnn->iface) {
2836                         continue;
2837                 }
2838                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2839                                   ctdb_vnn_iface_string(vnn),
2840                                   ctdb_addr_to_str(&vnn->public_address),
2841                                   vnn->public_netmask_bits);
2842                 release_kill_clients(ctdb, &vnn->public_address);
2843                 ctdb_vnn_unassign_iface(ctdb, vnn);
2844         }
2845 }
2846
2847
2848 /*
2849   get list of public IPs
2850  */
2851 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2852                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2853 {
2854         int i, num, len;
2855         struct ctdb_all_public_ips *ips;
2856         struct ctdb_vnn *vnn;
2857         bool only_available = false;
2858
2859         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2860                 only_available = true;
2861         }
2862
2863         /* count how many public ip structures we have */
2864         num = 0;
2865         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2866                 num++;
2867         }
2868
2869         len = offsetof(struct ctdb_all_public_ips, ips) + 
2870                 num*sizeof(struct ctdb_public_ip);
2871         ips = talloc_zero_size(outdata, len);
2872         CTDB_NO_MEMORY(ctdb, ips);
2873
2874         i = 0;
2875         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2876                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2877                         continue;
2878                 }
2879                 ips->ips[i].pnn  = vnn->pnn;
2880                 ips->ips[i].addr = vnn->public_address;
2881                 i++;
2882         }
2883         ips->num = i;
2884         len = offsetof(struct ctdb_all_public_ips, ips) +
2885                 i*sizeof(struct ctdb_public_ip);
2886
2887         outdata->dsize = len;
2888         outdata->dptr  = (uint8_t *)ips;
2889
2890         return 0;
2891 }
2892
2893
2894 /*
2895   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2896  */
2897 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2898                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2899 {
2900         int i, num, len;
2901         struct ctdb_all_public_ipsv4 *ips;
2902         struct ctdb_vnn *vnn;
2903
2904         /* count how many public ip structures we have */
2905         num = 0;
2906         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2907                 if (vnn->public_address.sa.sa_family != AF_INET) {
2908                         continue;
2909                 }
2910                 num++;
2911         }
2912
2913         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2914                 num*sizeof(struct ctdb_public_ipv4);
2915         ips = talloc_zero_size(outdata, len);
2916         CTDB_NO_MEMORY(ctdb, ips);
2917
2918         outdata->dsize = len;
2919         outdata->dptr  = (uint8_t *)ips;
2920
2921         ips->num = num;
2922         i = 0;
2923         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2924                 if (vnn->public_address.sa.sa_family != AF_INET) {
2925                         continue;
2926                 }
2927                 ips->ips[i].pnn = vnn->pnn;
2928                 ips->ips[i].sin = vnn->public_address.ip;
2929                 i++;
2930         }
2931
2932         return 0;
2933 }
2934
2935 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2936                                         struct ctdb_req_control *c,
2937                                         TDB_DATA indata,
2938                                         TDB_DATA *outdata)
2939 {
2940         int i, num, len;
2941         ctdb_sock_addr *addr;
2942         struct ctdb_control_public_ip_info *info;
2943         struct ctdb_vnn *vnn;
2944
2945         addr = (ctdb_sock_addr *)indata.dptr;
2946
2947         vnn = find_public_ip_vnn(ctdb, addr);
2948         if (vnn == NULL) {
2949                 /* if it is not a public ip   it could be our 'single ip' */
2950                 if (ctdb->single_ip_vnn) {
2951                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2952                                 vnn = ctdb->single_ip_vnn;
2953                         }
2954                 }
2955         }
2956         if (vnn == NULL) {
2957                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2958                                  "'%s'not a public address\n",
2959                                  ctdb_addr_to_str(addr)));
2960                 return -1;
2961         }
2962
2963         /* count how many public ip structures we have */
2964         num = 0;
2965         for (;vnn->ifaces[num];) {
2966                 num++;
2967         }
2968
2969         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2970                 num*sizeof(struct ctdb_control_iface_info);
2971         info = talloc_zero_size(outdata, len);
2972         CTDB_NO_MEMORY(ctdb, info);
2973
2974         info->ip.addr = vnn->public_address;
2975         info->ip.pnn = vnn->pnn;
2976         info->active_idx = 0xFFFFFFFF;
2977
2978         for (i=0; vnn->ifaces[i]; i++) {
2979                 struct ctdb_iface *cur;
2980
2981                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2982                 if (cur == NULL) {
2983                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2984                                            vnn->ifaces[i]));
2985                         return -1;
2986                 }
2987                 if (vnn->iface == cur) {
2988                         info->active_idx = i;
2989                 }
2990                 strcpy(info->ifaces[i].name, cur->name);
2991                 info->ifaces[i].link_state = cur->link_up;
2992                 info->ifaces[i].references = cur->references;
2993         }
2994         info->num = i;
2995         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2996                 i*sizeof(struct ctdb_control_iface_info);
2997
2998         outdata->dsize = len;
2999         outdata->dptr  = (uint8_t *)info;
3000
3001         return 0;
3002 }
3003
3004 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3005                                 struct ctdb_req_control *c,
3006                                 TDB_DATA *outdata)
3007 {
3008         int i, num, len;
3009         struct ctdb_control_get_ifaces *ifaces;
3010         struct ctdb_iface *cur;
3011
3012         /* count how many public ip structures we have */
3013         num = 0;
3014         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3015                 num++;
3016         }
3017
3018         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3019                 num*sizeof(struct ctdb_control_iface_info);
3020         ifaces = talloc_zero_size(outdata, len);
3021         CTDB_NO_MEMORY(ctdb, ifaces);
3022
3023         i = 0;
3024         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3025                 strcpy(ifaces->ifaces[i].name, cur->name);
3026                 ifaces->ifaces[i].link_state = cur->link_up;
3027                 ifaces->ifaces[i].references = cur->references;
3028                 i++;
3029         }
3030         ifaces->num = i;
3031         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3032                 i*sizeof(struct ctdb_control_iface_info);
3033
3034         outdata->dsize = len;
3035         outdata->dptr  = (uint8_t *)ifaces;
3036
3037         return 0;
3038 }
3039
3040 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3041                                     struct ctdb_req_control *c,
3042                                     TDB_DATA indata)
3043 {
3044         struct ctdb_control_iface_info *info;
3045         struct ctdb_iface *iface;
3046         bool link_up = false;
3047
3048         info = (struct ctdb_control_iface_info *)indata.dptr;
3049
3050         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3051                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3052                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3053                                   len, len, info->name));
3054                 return -1;
3055         }
3056
3057         switch (info->link_state) {
3058         case 0:
3059                 link_up = false;
3060                 break;
3061         case 1:
3062                 link_up = true;
3063                 break;
3064         default:
3065                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3066                                   (unsigned int)info->link_state));
3067                 return -1;
3068         }
3069
3070         if (info->references != 0) {
3071                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3072                                   (unsigned int)info->references));
3073                 return -1;
3074         }
3075
3076         iface = ctdb_find_iface(ctdb, info->name);
3077         if (iface == NULL) {
3078                 return -1;
3079         }
3080
3081         if (link_up == iface->link_up) {
3082                 return 0;
3083         }
3084
3085         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3086               ("iface[%s] has changed it's link status %s => %s\n",
3087                iface->name,
3088                iface->link_up?"up":"down",
3089                link_up?"up":"down"));
3090
3091         iface->link_up = link_up;
3092         return 0;
3093 }
3094
3095
3096 /* 
3097    structure containing the listening socket and the list of tcp connections
3098    that the ctdb daemon is to kill
3099 */
3100 struct ctdb_kill_tcp {
3101         struct ctdb_vnn *vnn;
3102         struct ctdb_context *ctdb;
3103         int capture_fd;
3104         struct fd_event *fde;
3105         trbt_tree_t *connections;
3106         void *private_data;
3107 };
3108
3109 /*
3110   a tcp connection that is to be killed
3111  */
3112 struct ctdb_killtcp_con {
3113         ctdb_sock_addr src_addr;
3114         ctdb_sock_addr dst_addr;
3115         int count;
3116         struct ctdb_kill_tcp *killtcp;
3117 };
3118
3119 /* this function is used to create a key to represent this socketpair
3120    in the killtcp tree.
3121    this key is used to insert and lookup matching socketpairs that are
3122    to be tickled and RST
3123 */
3124 #define KILLTCP_KEYLEN  10
3125 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3126 {
3127         static uint32_t key[KILLTCP_KEYLEN];
3128
3129         bzero(key, sizeof(key));
3130
3131         if (src->sa.sa_family != dst->sa.sa_family) {
3132                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3133                 return key;
3134         }
3135         
3136         switch (src->sa.sa_family) {
3137         case AF_INET:
3138                 key[0]  = dst->ip.sin_addr.s_addr;
3139                 key[1]  = src->ip.sin_addr.s_addr;
3140                 key[2]  = dst->ip.sin_port;
3141                 key[3]  = src->ip.sin_port;
3142                 break;
3143         case AF_INET6: {
3144                 uint32_t *dst6_addr32 =
3145                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3146                 uint32_t *src6_addr32 =
3147                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3148                 key[0]  = dst6_addr32[3];
3149                 key[1]  = src6_addr32[3];
3150                 key[2]  = dst6_addr32[2];
3151                 key[3]  = src6_addr32[2];
3152                 key[4]  = dst6_addr32[1];
3153                 key[5]  = src6_addr32[1];
3154                 key[6]  = dst6_addr32[0];
3155                 key[7]  = src6_addr32[0];
3156                 key[8]  = dst->ip6.sin6_port;
3157                 key[9]  = src->ip6.sin6_port;
3158                 break;
3159         }
3160         default:
3161                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3162                 return key;
3163         }
3164
3165         return key;
3166 }
3167
3168 /*
3169   called when we get a read event on the raw socket
3170  */
3171 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3172                                 uint16_t flags, void *private_data)
3173 {
3174         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3175         struct ctdb_killtcp_con *con;
3176         ctdb_sock_addr src, dst;
3177         uint32_t ack_seq, seq;
3178
3179         if (!(flags & EVENT_FD_READ)) {
3180                 return;
3181         }
3182
3183         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3184                                 killtcp->private_data,
3185                                 &src, &dst,
3186                                 &ack_seq, &seq) != 0) {
3187                 /* probably a non-tcp ACK packet */
3188                 return;
3189         }
3190
3191         /* check if we have this guy in our list of connections
3192            to kill
3193         */
3194         con = trbt_lookuparray32(killtcp->connections, 
3195                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3196         if (con == NULL) {
3197                 /* no this was some other packet we can just ignore */
3198                 return;
3199         }
3200
3201         /* This one has been tickled !
3202            now reset him and remove him from the list.
3203          */
3204         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3205                 ntohs(con->dst_addr.ip.sin_port),
3206                 ctdb_addr_to_str(&con->src_addr),
3207                 ntohs(con->src_addr.ip.sin_port)));
3208
3209         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3210         talloc_free(con);
3211 }
3212
3213
3214 /* when traversing the list of all tcp connections to send tickle acks to
3215    (so that we can capture the ack coming back and kill the connection
3216     by a RST)
3217    this callback is called for each connection we are currently trying to kill
3218 */
3219 static int tickle_connection_traverse(void *param, void *data)
3220 {
3221         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3222
3223         /* have tried too many times, just give up */
3224         if (con->count >= 5) {
3225                 /* can't delete in traverse: reparent to delete_cons */
3226                 talloc_steal(param, con);
3227                 return 0;
3228         }
3229
3230         /* othervise, try tickling it again */
3231         con->count++;
3232         ctdb_sys_send_tcp(
3233                 (ctdb_sock_addr *)&con->dst_addr,
3234                 (ctdb_sock_addr *)&con->src_addr,
3235                 0, 0, 0);
3236         return 0;
3237 }
3238
3239
3240 /* 
3241    called every second until all sentenced connections have been reset
3242  */
3243 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3244                                               struct timeval t, void *private_data)
3245 {
3246         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3247         void *delete_cons = talloc_new(NULL);
3248
3249         /* loop over all connections sending tickle ACKs */
3250         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3251
3252         /* now we've finished traverse, it's safe to do deletion. */
3253         talloc_free(delete_cons);
3254
3255         /* If there are no more connections to kill we can remove the
3256            entire killtcp structure
3257          */
3258         if ( (killtcp->connections == NULL) || 
3259              (killtcp->connections->root == NULL) ) {
3260                 talloc_free(killtcp);
3261                 return;
3262         }
3263
3264         /* try tickling them again in a seconds time
3265          */
3266         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3267                         ctdb_tickle_sentenced_connections, killtcp);
3268 }
3269
3270 /*
3271   destroy the killtcp structure
3272  */
3273 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3274 {
3275         struct ctdb_vnn *tmpvnn;
3276
3277         /* verify that this vnn is still active */
3278         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3279                 if (tmpvnn == killtcp->vnn) {
3280                         break;
3281                 }
3282         }
3283
3284         if (tmpvnn == NULL) {
3285                 return 0;
3286         }
3287
3288         if (killtcp->vnn->killtcp != killtcp) {
3289                 return 0;
3290         }
3291
3292         killtcp->vnn->killtcp = NULL;
3293
3294         return 0;
3295 }
3296
3297
/* insert callback for the killtcp tree.

   Nothing fancy: whatever is stored already (data) is ignored and the
   new connection structure (parm) is returned unconditionally as the
   value to keep.  The old structure, if there was one, is deliberately
   not freed here.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;
        return replacement;
}
3309
3310 /*
3311   add a tcp socket to the list of connections we want to RST
3312  */
3313 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3314                                        ctdb_sock_addr *s,
3315                                        ctdb_sock_addr *d)
3316 {
3317         ctdb_sock_addr src, dst;
3318         struct ctdb_kill_tcp *killtcp;
3319         struct ctdb_killtcp_con *con;
3320         struct ctdb_vnn *vnn;
3321
3322         ctdb_canonicalize_ip(s, &src);
3323         ctdb_canonicalize_ip(d, &dst);
3324
3325         vnn = find_public_ip_vnn(ctdb, &dst);
3326         if (vnn == NULL) {
3327                 vnn = find_public_ip_vnn(ctdb, &src);
3328         }
3329         if (vnn == NULL) {
3330                 /* if it is not a public ip   it could be our 'single ip' */
3331                 if (ctdb->single_ip_vnn) {
3332                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3333                                 vnn = ctdb->single_ip_vnn;
3334                         }
3335                 }
3336         }
3337         if (vnn == NULL) {
3338                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3339                 return -1;
3340         }
3341
3342         killtcp = vnn->killtcp;
3343         
3344         /* If this is the first connection to kill we must allocate
3345            a new structure
3346          */
3347         if (killtcp == NULL) {
3348                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3349                 CTDB_NO_MEMORY(ctdb, killtcp);
3350
3351                 killtcp->vnn         = vnn;
3352                 killtcp->ctdb        = ctdb;
3353                 killtcp->capture_fd  = -1;
3354                 killtcp->connections = trbt_create(killtcp, 0);
3355
3356                 vnn->killtcp         = killtcp;
3357                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3358         }
3359
3360
3361
3362         /* create a structure that describes this connection we want to
3363            RST and store it in killtcp->connections
3364         */
3365         con = talloc(killtcp, struct ctdb_killtcp_con);
3366         CTDB_NO_MEMORY(ctdb, con);
3367         con->src_addr = src;
3368         con->dst_addr = dst;
3369         con->count    = 0;
3370         con->killtcp  = killtcp;
3371
3372
3373         trbt_insertarray32_callback(killtcp->connections,
3374                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3375                         add_killtcp_callback, con);
3376
3377         /* 
3378            If we dont have a socket to listen on yet we must create it
3379          */
3380         if (killtcp->capture_fd == -1) {
3381                 const char *iface = ctdb_vnn_iface_string(vnn);
3382                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3383                 if (killtcp->capture_fd == -1) {
3384                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3385                                           "socket on iface '%s' for killtcp (%s)\n",
3386                                           iface, strerror(errno)));
3387                         goto failed;
3388                 }
3389         }
3390
3391
3392         if (killtcp->fde == NULL) {
3393                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3394                                             EVENT_FD_READ,
3395                                             capture_tcp_handler, killtcp);
3396                 tevent_fd_set_auto_close(killtcp->fde);
3397
3398                 /* We also need to set up some events to tickle all these connections
3399                    until they are all reset
3400                 */
3401                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3402                                 ctdb_tickle_sentenced_connections, killtcp);
3403         }
3404
3405         /* tickle him once now */
3406         ctdb_sys_send_tcp(
3407                 &con->dst_addr,
3408                 &con->src_addr,
3409                 0, 0, 0);
3410
3411         return 0;
3412
3413 failed:
3414         talloc_free(vnn->killtcp);
3415         vnn->killtcp = NULL;
3416         return -1;
3417 }
3418
3419 /*
3420   kill a TCP connection.
3421  */
3422 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3423 {
3424         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3425
3426         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3427 }
3428
3429 /*
3430   called by a daemon to inform us of the entire list of TCP tickles for
3431   a particular public address.
3432   this control should only be sent by the node that is currently serving
3433   that public address.
3434  */
3435 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3436 {
3437         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3438         struct ctdb_tcp_array *tcparray;
3439         struct ctdb_vnn *vnn;
3440
3441         /* We must at least have tickles.num or else we cant verify the size
3442            of the received data blob
3443          */
3444         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3445                                         tickles.connections)) {
3446                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3447                 return -1;
3448         }
3449
3450         /* verify that the size of data matches what we expect */
3451         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3452                                 tickles.connections)
3453                          + sizeof(struct ctdb_tcp_connection)
3454                                  * list->tickles.num) {
3455                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3456                 return -1;
3457         }       
3458
3459         vnn = find_public_ip_vnn(ctdb, &list->addr);
3460         if (vnn == NULL) {
3461                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3462                         ctdb_addr_to_str(&list->addr)));
3463
3464                 return 1;
3465         }
3466
3467         /* remove any old ticklelist we might have */
3468         talloc_free(vnn->tcp_array);
3469         vnn->tcp_array = NULL;
3470
3471         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3472         CTDB_NO_MEMORY(ctdb, tcparray);
3473
3474         tcparray->num = list->tickles.num;
3475
3476         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3477         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3478
3479         memcpy(tcparray->connections, &list->tickles.connections[0], 
3480                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3481
3482         /* We now have a new fresh tickle list array for this vnn */
3483         vnn->tcp_array = talloc_steal(vnn, tcparray);
3484         
3485         return 0;
3486 }
3487
3488 /*
3489   called to return the full list of tickles for the puclic address associated 
3490   with the provided vnn
3491  */
3492 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3493 {
3494         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3495         struct ctdb_control_tcp_tickle_list *list;
3496         struct ctdb_tcp_array *tcparray;
3497         int num;
3498         struct ctdb_vnn *vnn;
3499
3500         vnn = find_public_ip_vnn(ctdb, addr);
3501         if (vnn == NULL) {
3502                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3503                         ctdb_addr_to_str(addr)));
3504
3505                 return 1;
3506         }
3507
3508         tcparray = vnn->tcp_array;
3509         if (tcparray) {
3510                 num = tcparray->num;
3511         } else {
3512                 num = 0;
3513         }
3514
3515         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3516                                 tickles.connections)
3517                         + sizeof(struct ctdb_tcp_connection) * num;
3518
3519         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3520         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3521         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3522
3523         list->addr = *addr;
3524         list->tickles.num = num;
3525         if (num) {
3526                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3527                         sizeof(struct ctdb_tcp_connection) * num);
3528         }
3529
3530         return 0;
3531 }
3532
3533
3534 /*
3535   set the list of all tcp tickles for a public address
3536  */
3537 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3538                               struct timeval timeout, uint32_t destnode, 
3539                               ctdb_sock_addr *addr,
3540                               struct ctdb_tcp_array *tcparray)
3541 {
3542         int ret, num;
3543         TDB_DATA data;
3544         struct ctdb_control_tcp_tickle_list *list;
3545
3546         if (tcparray) {
3547                 num = tcparray->num;
3548         } else {
3549                 num = 0;
3550         }
3551
3552         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3553                                 tickles.connections) +
3554                         sizeof(struct ctdb_tcp_connection) * num;
3555         data.dptr = talloc_size(ctdb, data.dsize);
3556         CTDB_NO_MEMORY(ctdb, data.dptr);
3557
3558         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3559         list->addr = *addr;
3560         list->tickles.num = num;
3561         if (tcparray) {
3562                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3563         }
3564
3565         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3566                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3567                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3568         if (ret != 0) {
3569                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3570                 return -1;
3571         }
3572
3573         talloc_free(data.dptr);
3574
3575         return ret;
3576 }
3577
3578
3579 /*
3580   perform tickle updates if required
3581  */
3582 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3583                                 struct timed_event *te, 
3584                                 struct timeval t, void *private_data)
3585 {
3586         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3587         int ret;
3588         struct ctdb_vnn *vnn;
3589
3590         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3591                 /* we only send out updates for public addresses that 
3592                    we have taken over
3593                  */
3594                 if (ctdb->pnn != vnn->pnn) {
3595                         continue;
3596                 }
3597                 /* We only send out the updates if we need to */
3598                 if (!vnn->tcp_update_needed) {
3599                         continue;
3600                 }
3601                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3602                                 TAKEOVER_TIMEOUT(),
3603                                 CTDB_BROADCAST_CONNECTED,
3604                                 &vnn->public_address,
3605                                 vnn->tcp_array);
3606                 if (ret != 0) {
3607                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3608                                 ctdb_addr_to_str(&vnn->public_address)));
3609                 }
3610         }
3611
3612         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3613                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3614                              ctdb_update_tcp_tickles, ctdb);
3615 }               
3616         
3617
3618 /*
3619   start periodic update of tcp tickles
3620  */
3621 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3622 {
3623         ctdb->tickle_update_context = talloc_new(ctdb);
3624
3625         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3626                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3627                              ctdb_update_tcp_tickles, ctdb);
3628 }
3629
3630
3631
3632
3633 struct control_gratious_arp {
3634         struct ctdb_context *ctdb;
3635         ctdb_sock_addr addr;
3636         const char *iface;
3637         int count;
3638 };
3639
3640 /*
3641   send a control_gratuitous arp
3642  */
3643 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3644                                   struct timeval t, void *private_data)
3645 {
3646         int ret;
3647         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3648                                                         struct control_gratious_arp);
3649
3650         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3651         if (ret != 0) {
3652                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3653                                  arp->iface, strerror(errno)));
3654         }
3655
3656
3657         arp->count++;
3658         if (arp->count == CTDB_ARP_REPEAT) {
3659                 talloc_free(arp);
3660                 return;
3661         }
3662
3663         event_add_timed(arp->ctdb->ev, arp, 
3664                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3665                         send_gratious_arp, arp);
3666 }
3667
3668
3669 /*
3670   send a gratious arp 
3671  */
3672 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3673 {
3674         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3675         struct control_gratious_arp *arp;
3676
3677         /* verify the size of indata */
3678         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3679                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3680                                  (unsigned)indata.dsize, 
3681                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3682                 return -1;
3683         }
3684         if (indata.dsize != 
3685                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3686                 + gratious_arp->len ) ){
3687
3688                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3689                         "but should be %u bytes\n", 
3690                          (unsigned)indata.dsize, 
3691                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3692                 return -1;
3693         }
3694
3695
3696         arp = talloc(ctdb, struct control_gratious_arp);
3697         CTDB_NO_MEMORY(ctdb, arp);
3698
3699         arp->ctdb  = ctdb;
3700         arp->addr   = gratious_arp->addr;
3701         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3702         CTDB_NO_MEMORY(ctdb, arp->iface);
3703         arp->count = 0;
3704         
3705         event_add_timed(arp->ctdb->ev, arp, 
3706                         timeval_zero(), send_gratious_arp, arp);
3707
3708         return 0;
3709 }
3710
3711 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3712 {
3713         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3714         int ret;
3715
3716         /* verify the size of indata */
3717         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3718                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3719                 return -1;
3720         }
3721         if (indata.dsize != 
3722                 ( offsetof(struct ctdb_control_ip_iface, iface)
3723                 + pub->len ) ){
3724
3725                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3726                         "but should be %u bytes\n", 
3727                          (unsigned)indata.dsize, 
3728                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3729                 return -1;
3730         }
3731
3732         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3733
3734         if (ret != 0) {
3735                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3736                 return -1;
3737         }
3738
3739         return 0;
3740 }
3741
3742 /*
3743   called when releaseip event finishes for del_public_address
3744  */
3745 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
3746                                 void *private_data)
3747 {
3748         talloc_free(private_data);
3749 }
3750
3751 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3752 {
3753         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3754         struct ctdb_vnn *vnn;
3755         int ret;
3756
3757         /* verify the size of indata */
3758         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3759                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3760                 return -1;
3761         }
3762         if (indata.dsize != 
3763                 ( offsetof(struct ctdb_control_ip_iface, iface)
3764                 + pub->len ) ){
3765
3766                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3767                         "but should be %u bytes\n", 
3768                          (unsigned)indata.dsize, 
3769                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3770                 return -1;
3771         }
3772
3773         /* walk over all public addresses until we find a match */
3774         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3775                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3776                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3777
3778                         DLIST_REMOVE(ctdb->vnn, vnn);
3779                         talloc_steal(mem_ctx, vnn);
3780                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3781                         if (vnn->pnn != ctdb->pnn) {
3782                                 if (vnn->iface != NULL) {
3783                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3784                                 }
3785                                 talloc_free(mem_ctx);
3786                                 return 0;
3787                         }
3788                         vnn->pnn = -1;
3789
3790                         ret = ctdb_event_script_callback(ctdb, 
3791                                          mem_ctx, delete_ip_callback, mem_ctx,
3792                                          false,
3793                                          CTDB_EVENT_RELEASE_IP,
3794                                          "%s %s %u",
3795                                          ctdb_vnn_iface_string(vnn),
3796                                          ctdb_addr_to_str(&vnn->public_address),
3797                                          vnn->public_netmask_bits);
3798                         if (vnn->iface != NULL) {
3799                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3800                         }
3801                         if (ret != 0) {
3802                                 return -1;
3803                         }
3804                         return 0;
3805                 }
3806         }
3807
3808         return -1;
3809 }
3810
3811
/* state carried from ctdb_control_ipreallocated() to its completion
   callback */
struct ipreallocated_callback_state {
        /* the control request that is answered once the event has run */
        struct ctdb_req_control *c;
};
3815
3816 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3817                                         int status, void *p)
3818 {
3819         struct ipreallocated_callback_state *state =
3820                 talloc_get_type(p, struct ipreallocated_callback_state);
3821
3822         if (status != 0) {
3823                 DEBUG(DEBUG_ERR,
3824                       (" \"ipreallocated\" event script failed (status %d)\n",
3825                        status));
3826                 if (status == -ETIME) {
3827                         ctdb_ban_self(ctdb);
3828                 }
3829         }
3830
3831         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3832         talloc_free(state);
3833 }
3834
3835 /* A control to run the ipreallocated event */
3836 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3837                                    struct ctdb_req_control *c,
3838                                    bool *async_reply)
3839 {
3840         int ret;
3841         struct ipreallocated_callback_state *state;
3842
3843         state = talloc(ctdb, struct ipreallocated_callback_state);
3844         CTDB_NO_MEMORY(ctdb, state);
3845
3846         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3847
3848         ret = ctdb_event_script_callback(ctdb, state,
3849                                          ctdb_ipreallocated_callback, state,
3850                                          false, CTDB_EVENT_IPREALLOCATED,
3851                                          "%s", "");
3852
3853         if (ret != 0) {
3854                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3855                 talloc_free(state);
3856                 return -1;
3857         }
3858
3859         /* tell the control that we will be reply asynchronously */
3860         state->c    = talloc_steal(state, c);
3861         *async_reply = true;
3862
3863         return 0;
3864 }
3865
3866
3867 /* This function is called from the recovery daemon to verify that a remote
3868    node has the expected ip allocation.
3869    This is verified against ctdb->ip_tree
3870 */
3871 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3872 {
3873         struct ctdb_public_ip_list *tmp_ip; 
3874         int i;
3875
3876         if (ctdb->ip_tree == NULL) {
3877                 /* dont know the expected allocation yet, assume remote node
3878                    is correct. */
3879                 return 0;
3880         }
3881
3882         if (ips == NULL) {
3883                 return 0;
3884         }
3885
3886         for (i=0; i<ips->num; i++) {
3887                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3888                 if (tmp_ip == NULL) {
3889                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3890                         return -1;
3891                 }
3892
3893                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3894                         continue;
3895                 }
3896
3897                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3898                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3899                         return -1;
3900                 }
3901         }
3902
3903         return 0;
3904 }
3905
3906 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3907 {
3908         struct ctdb_public_ip_list *tmp_ip; 
3909
3910         if (ctdb->ip_tree == NULL) {
3911                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3912                 return -1;
3913         }
3914
3915         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3916         if (tmp_ip == NULL) {
3917                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3918                 return -1;
3919         }
3920
3921         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3922         tmp_ip->pnn = ip->pnn;
3923
3924         return 0;
3925 }
3926
3927
/* state for one reloadips run that is carried out by a child process */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;
        /* pending control request, answered from the destructor */
        struct ctdb_req_control *c;
        /* status that is sent in the reply to c */
        int status;
        /* fd[0] is where the parent reads the child's one byte result */
        int fd[2];
        /* pid of the child; it is sent SIGKILL from the destructor */
        pid_t child;
        struct fd_event *fde;
};
3936
3937 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3938 {
3939         if (h == h->ctdb->reload_ips) {
3940                 h->ctdb->reload_ips = NULL;
3941         }
3942         if (h->c != NULL) {
3943                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3944                 h->c = NULL;
3945         }
3946         ctdb_kill(h->ctdb, h->child, SIGKILL);
3947         return 0;
3948 }
3949
3950 static void ctdb_reloadips_timeout_event(struct event_context *ev,
3951                                 struct timed_event *te,
3952                                 struct timeval t, void *private_data)
3953 {
3954         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3955
3956         talloc_free(h);
3957 }       
3958
3959 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
3960                              uint16_t flags, void *private_data)
3961 {
3962         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3963
3964         char res;
3965         int ret;
3966
3967         ret = read(h->fd[0], &res, 1);
3968         if (ret < 1 || res != 0) {
3969                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3970                 res = 1;
3971         }
3972         h->status = res;
3973
3974         talloc_free(h);
3975 }
3976
3977 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3978 {
3979         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3980         struct ctdb_all_public_ips *ips;
3981         struct ctdb_vnn *vnn;
3982         int i, ret;
3983
3984         /* read the ip allocation from the local node */
3985         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
3986         if (ret != 0) {
3987                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
3988                 talloc_free(mem_ctx);
3989                 return -1;
3990         }
3991
3992         /* re-read the public ips file */
3993         ctdb->vnn = NULL;
3994         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3995                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3996                 talloc_free(mem_ctx);
3997                 return -1;
3998         }               
3999
4000
4001         /* check the previous list of ips and scan for ips that have been
4002            dropped.
4003          */
4004         for (i = 0; i < ips->num; i++) {
4005                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4006                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4007                                 break;
4008                         }
4009                 }
4010
4011                 /* we need to delete this ip, no longer available on this node */
4012                 if (vnn == NULL) {
4013                         struct ctdb_control_ip_iface pub;
4014
4015                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4016                         pub.addr  = ips->ips[i].addr;
4017                         pub.mask  = 0;
4018                         pub.len   = 0;
4019
4020                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4021                         if (ret != 0) {
4022                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4023                                 return -1;
4024                         }
4025                 }
4026         }
4027
4028
4029         /* loop over all new ones and check the ones we need to add */
4030         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4031                 for (i = 0; i < ips->num; i++) {
4032                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4033                                 break;
4034                         }
4035                 }
4036                 if (i == ips->num) {
4037                         struct ctdb_control_ip_iface pub;
4038                         const char *ifaces = NULL;
4039                         int iface = 0;
4040
4041                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4042
4043                         pub.addr  = vnn->public_address;
4044                         pub.mask  = vnn->public_netmask_bits;
4045
4046
4047                         ifaces = vnn->ifaces[0];
4048                         iface = 1;
4049                         while (vnn->ifaces[iface] != NULL) {
4050                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4051                                 iface++;
4052                         }
4053                         pub.len   = strlen(ifaces)+1;
4054                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4055
4056                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4057                         if (ret != 0) {
4058                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4059                                 return -1;
4060                         }
4061                 }
4062         }
4063
4064         return 0;
4065 }
4066
4067 /* This control is sent to force the node to re-read the public addresses file
4068    and drop any addresses we should nnot longer host, and add new addresses
4069    that we are now able to host
4070 */
4071 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4072 {
4073         struct ctdb_reloadips_handle *h;
4074         pid_t parent = getpid();
4075
4076         if (ctdb->reload_ips != NULL) {
4077                 talloc_free(ctdb->reload_ips);
4078                 ctdb->reload_ips = NULL;
4079         }
4080
4081         h = talloc(ctdb, struct ctdb_reloadips_handle);
4082         CTDB_NO_MEMORY(ctdb, h);
4083         h->ctdb     = ctdb;
4084         h->c        = NULL;
4085         h->status   = -1;
4086         
4087         if (pipe(h->fd) == -1) {
4088                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4089                 talloc_free(h);
4090                 return -1;
4091         }
4092
4093         h->child = ctdb_fork(ctdb);
4094         if (h->child == (pid_t)-1) {
4095                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4096                 close(h->fd[0]);
4097                 close(h->fd[1]);
4098                 talloc_free(h);
4099                 return -1;
4100         }
4101
4102         /* child process */
4103         if (h->child == 0) {
4104                 signed char res = 0;
4105
4106                 close(h->fd[0]);
4107                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4108
4109                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4110                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4111                         res = -1;
4112                 } else {
4113                         res = ctdb_reloadips_child(ctdb);
4114                         if (res != 0) {
4115                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4116                         }
4117                 }
4118
4119                 write(h->fd[1], &res, 1);
4120                 /* make sure we die when our parent dies */
4121                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4122                         sleep(5);
4123                 }
4124                 _exit(0);
4125         }
4126
4127         h->c             = talloc_steal(h, c);
4128
4129         close(h->fd[1]);
4130         set_close_on_exec(h->fd[0]);
4131
4132         talloc_set_destructor(h, ctdb_reloadips_destructor);
4133
4134
4135         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4136                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4137                         (void *)h);
4138         tevent_fd_set_auto_close(h->fde);
4139
4140         event_add_timed(ctdb->ev, h,
4141                         timeval_current_ofs(120, 0),
4142                         ctdb_reloadips_timeout_event, h);
4143
4144         /* we reply later */
4145         *async_reply = true;
4146         return 0;
4147 }