Merge remote branch 'martins/master'
[garming/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
/*
   ctdb ip takeover code

   Copyright (C) Ronnie Sahlberg  2007
   Copyright (C) Andrew Tridgell  2007
   Copyright (C) Martin Schwenke  2011

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
21 #include "includes.h"
22 #include "lib/tevent/tevent.h"
23 #include "lib/tdb/include/tdb.h"
24 #include "lib/util/dlinklist.h"
25 #include "system/network.h"
26 #include "system/filesys.h"
27 #include "system/wait.h"
28 #include "../include/ctdb_private.h"
29 #include "../common/rb_tree.h"
30
31
/*
  timeout for takeover operations, built from the takeover_timeout tunable
  note: expands to an expression that reads a variable named 'ctdb' from
  the scope it is used in
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
  gratuitous arp schedule used by ctdb_control_send_arp():
  CTDB_ARP_INTERVAL is the first argument handed to timeval_current_ofs()
  when re-arming the timer, CTDB_ARP_REPEAT is the number of rounds sent
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
36
/*
  a local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
	/* linkage for the ctdb->ifaces list */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, the key used when looking an entry up */
	const char *name;
	/* only interfaces with this set are picked for an address */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
43
44 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
45 {
46         if (vnn->iface) {
47                 return vnn->iface->name;
48         }
49
50         return "__none__";
51 }
52
53 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
54 {
55         struct ctdb_iface *i;
56
57         /* Verify that we dont have an entry for this ip yet */
58         for (i=ctdb->ifaces;i;i=i->next) {
59                 if (strcmp(i->name, iface) == 0) {
60                         return 0;
61                 }
62         }
63
64         /* create a new structure for this interface */
65         i = talloc_zero(ctdb, struct ctdb_iface);
66         CTDB_NO_MEMORY_FATAL(ctdb, i);
67         i->name = talloc_strdup(i, iface);
68         CTDB_NO_MEMORY(ctdb, i->name);
69         i->link_up = false;
70
71         DLIST_ADD(ctdb->ifaces, i);
72
73         return 0;
74 }
75
76 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
77                                           const char *iface)
78 {
79         struct ctdb_iface *i;
80
81         /* Verify that we dont have an entry for this ip yet */
82         for (i=ctdb->ifaces;i;i=i->next) {
83                 if (strcmp(i->name, iface) == 0) {
84                         return i;
85                 }
86         }
87
88         return NULL;
89 }
90
91 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
92                                               struct ctdb_vnn *vnn)
93 {
94         int i;
95         struct ctdb_iface *cur = NULL;
96         struct ctdb_iface *best = NULL;
97
98         for (i=0; vnn->ifaces[i]; i++) {
99
100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
101                 if (cur == NULL) {
102                         continue;
103                 }
104
105                 if (!cur->link_up) {
106                         continue;
107                 }
108
109                 if (best == NULL) {
110                         best = cur;
111                         continue;
112                 }
113
114                 if (cur->references < best->references) {
115                         best = cur;
116                         continue;
117                 }
118         }
119
120         return best;
121 }
122
123 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
124                                      struct ctdb_vnn *vnn)
125 {
126         struct ctdb_iface *best = NULL;
127
128         if (vnn->iface) {
129                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
130                                    "still assigned to iface '%s'\n",
131                                    ctdb_addr_to_str(&vnn->public_address),
132                                    ctdb_vnn_iface_string(vnn)));
133                 return 0;
134         }
135
136         best = ctdb_vnn_best_iface(ctdb, vnn);
137         if (best == NULL) {
138                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
139                                   "cannot assign to iface any iface\n",
140                                   ctdb_addr_to_str(&vnn->public_address)));
141                 return -1;
142         }
143
144         vnn->iface = best;
145         best->references++;
146         vnn->pnn = ctdb->pnn;
147
148         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
149                            "now assigned to iface '%s' refs[%d]\n",
150                            ctdb_addr_to_str(&vnn->public_address),
151                            ctdb_vnn_iface_string(vnn),
152                            best->references));
153         return 0;
154 }
155
156 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
157                                     struct ctdb_vnn *vnn)
158 {
159         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
160                            "now unassigned (old iface '%s' refs[%d])\n",
161                            ctdb_addr_to_str(&vnn->public_address),
162                            ctdb_vnn_iface_string(vnn),
163                            vnn->iface?vnn->iface->references:0));
164         if (vnn->iface) {
165                 vnn->iface->references--;
166         }
167         vnn->iface = NULL;
168         if (vnn->pnn == ctdb->pnn) {
169                 vnn->pnn = -1;
170         }
171 }
172
173 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
174                                struct ctdb_vnn *vnn)
175 {
176         int i;
177
178         if (vnn->iface && vnn->iface->link_up) {
179                 return true;
180         }
181
182         for (i=0; vnn->ifaces[i]; i++) {
183                 struct ctdb_iface *cur;
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (cur->link_up) {
191                         return true;
192                 }
193         }
194
195         return false;
196 }
197
198 struct ctdb_takeover_arp {
199         struct ctdb_context *ctdb;
200         uint32_t count;
201         ctdb_sock_addr addr;
202         struct ctdb_tcp_array *tcparray;
203         struct ctdb_vnn *vnn;
204 };
205
206
207 /*
208   lists of tcp endpoints
209  */
210 struct ctdb_tcp_list {
211         struct ctdb_tcp_list *prev, *next;
212         struct ctdb_tcp_connection connection;
213 };
214
215 /*
216   list of clients to kill on IP release
217  */
218 struct ctdb_client_ip {
219         struct ctdb_client_ip *prev, *next;
220         struct ctdb_context *ctdb;
221         ctdb_sock_addr addr;
222         uint32_t client_id;
223 };
224
225
226 /*
227   send a gratuitous arp
228  */
229 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
230                                   struct timeval t, void *private_data)
231 {
232         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
233                                                         struct ctdb_takeover_arp);
234         int i, ret;
235         struct ctdb_tcp_array *tcparray;
236         const char *iface = ctdb_vnn_iface_string(arp->vnn);
237
238         ret = ctdb_sys_send_arp(&arp->addr, iface);
239         if (ret != 0) {
240                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
241                                   iface, strerror(errno)));
242         }
243
244         tcparray = arp->tcparray;
245         if (tcparray) {
246                 for (i=0;i<tcparray->num;i++) {
247                         struct ctdb_tcp_connection *tcon;
248
249                         tcon = &tcparray->connections[i];
250                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
251                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
252                                 ctdb_addr_to_str(&tcon->src_addr),
253                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
254                         ret = ctdb_sys_send_tcp(
255                                 &tcon->src_addr, 
256                                 &tcon->dst_addr,
257                                 0, 0, 0);
258                         if (ret != 0) {
259                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
260                                         ctdb_addr_to_str(&tcon->src_addr)));
261                         }
262                 }
263         }
264
265         arp->count++;
266
267         if (arp->count == CTDB_ARP_REPEAT) {
268                 talloc_free(arp);
269                 return;
270         }
271
272         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
273                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
274                         ctdb_control_send_arp, arp);
275 }
276
277 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
278                                        struct ctdb_vnn *vnn)
279 {
280         struct ctdb_takeover_arp *arp;
281         struct ctdb_tcp_array *tcparray;
282
283         if (!vnn->takeover_ctx) {
284                 vnn->takeover_ctx = talloc_new(vnn);
285                 if (!vnn->takeover_ctx) {
286                         return -1;
287                 }
288         }
289
290         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
291         if (!arp) {
292                 return -1;
293         }
294
295         arp->ctdb = ctdb;
296         arp->addr = vnn->public_address;
297         arp->vnn  = vnn;
298
299         tcparray = vnn->tcp_array;
300         if (tcparray) {
301                 /* add all of the known tcp connections for this IP to the
302                    list of tcp connections to send tickle acks for */
303                 arp->tcparray = talloc_steal(arp, tcparray);
304
305                 vnn->tcp_array = NULL;
306                 vnn->tcp_update_needed = true;
307         }
308
309         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
310                         timeval_zero(), ctdb_control_send_arp, arp);
311
312         return 0;
313 }
314
315 struct takeover_callback_state {
316         struct ctdb_req_control *c;
317         ctdb_sock_addr *addr;
318         struct ctdb_vnn *vnn;
319 };
320
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* the control to reply to once the event script has finished */
	struct ctdb_req_control *c;
	/* the public address entry being taken over */
	struct ctdb_vnn *vnn;
};
325
326 /*
327   called when takeip event finishes
328  */
329 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
330                                     void *private_data)
331 {
332         struct ctdb_do_takeip_state *state =
333                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
334         int32_t ret;
335         TDB_DATA data;
336
337         if (status != 0) {
338                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339         
340                 if (status == -ETIME) {
341                         ctdb_ban_self(ctdb);
342                 }
343                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
344                                  ctdb_addr_to_str(&state->vnn->public_address),
345                                  ctdb_vnn_iface_string(state->vnn)));
346                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347
348                 node->flags |= NODE_FLAGS_UNHEALTHY;
349                 talloc_free(state);
350                 return;
351         }
352
353         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
354         if (ret != 0) {
355                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
356                 talloc_free(state);
357                 return;
358         }
359
360         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
361         data.dsize = strlen((char *)data.dptr) + 1;
362         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
363
364         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
365
366
367         /* the control succeeded */
368         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
369         talloc_free(state);
370         return;
371 }
372
373 /*
374   take over an ip address
375  */
376 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
377                               struct ctdb_req_control *c,
378                               struct ctdb_vnn *vnn)
379 {
380         int ret;
381         struct ctdb_do_takeip_state *state;
382
383         ret = ctdb_vnn_assign_iface(ctdb, vnn);
384         if (ret != 0) {
385                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
386                                  "assin a usable interface\n",
387                                  ctdb_addr_to_str(&vnn->public_address),
388                                  vnn->public_netmask_bits));
389                 return -1;
390         }
391
392         state = talloc(vnn, struct ctdb_do_takeip_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(ctdb, c);
396         state->vnn   = vnn;
397
398         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
399                             ctdb_addr_to_str(&vnn->public_address),
400                             vnn->public_netmask_bits,
401                             ctdb_vnn_iface_string(vnn)));
402
403         ret = ctdb_event_script_callback(ctdb,
404                                          state,
405                                          ctdb_do_takeip_callback,
406                                          state,
407                                          false,
408                                          CTDB_EVENT_TAKE_IP,
409                                          "%s %s %u",
410                                          ctdb_vnn_iface_string(vnn),
411                                          ctdb_addr_to_str(&vnn->public_address),
412                                          vnn->public_netmask_bits);
413
414         if (ret != 0) {
415                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
416                         ctdb_addr_to_str(&vnn->public_address),
417                         ctdb_vnn_iface_string(vnn)));
418                 talloc_free(state);
419                 return -1;
420         }
421
422         return 0;
423 }
424
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* the control to reply to once the event script has finished */
	struct ctdb_req_control *c;
	/* interface the address was on before the move, restored on failure */
	struct ctdb_iface *old;
	/* the public address entry being moved */
	struct ctdb_vnn *vnn;
};
430
431 /*
432   called when updateip event finishes
433  */
434 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
435                                       void *private_data)
436 {
437         struct ctdb_do_updateip_state *state =
438                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
439         int32_t ret;
440
441         if (status != 0) {
442                 if (status == -ETIME) {
443                         ctdb_ban_self(ctdb);
444                 }
445                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
446                         ctdb_addr_to_str(&state->vnn->public_address),
447                         state->old->name,
448                         ctdb_vnn_iface_string(state->vnn)));
449
450                 /*
451                  * All we can do is reset the old interface
452                  * and let the next run fix it
453                  */
454                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
455                 state->vnn->iface = state->old;
456                 state->vnn->iface->references++;
457
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459                 talloc_free(state);
460                 return;
461         }
462
463         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
464         if (ret != 0) {
465                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
466                 talloc_free(state);
467                 return;
468         }
469
470         /* the control succeeded */
471         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
472         talloc_free(state);
473         return;
474 }
475
476 /*
477   update (move) an ip address
478  */
479 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
480                                 struct ctdb_req_control *c,
481                                 struct ctdb_vnn *vnn)
482 {
483         int ret;
484         struct ctdb_do_updateip_state *state;
485         struct ctdb_iface *old = vnn->iface;
486         const char *new_name;
487
488         ctdb_vnn_unassign_iface(ctdb, vnn);
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
492                                  "assin a usable interface (old iface '%s')\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits,
495                                  old->name));
496                 return -1;
497         }
498
499         new_name = ctdb_vnn_iface_string(vnn);
500         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
501                 /* A benign update from one interface onto itself.
502                  * no need to run the eventscripts in this case, just return
503                  * success.
504                  */
505                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
506                 return 0;
507         }
508
509         state = talloc(vnn, struct ctdb_do_updateip_state);
510         CTDB_NO_MEMORY(ctdb, state);
511
512         state->c = talloc_steal(ctdb, c);
513         state->old = old;
514         state->vnn = vnn;
515
516         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
517                             "interface %s to %s\n",
518                             ctdb_addr_to_str(&vnn->public_address),
519                             vnn->public_netmask_bits,
520                             old->name,
521                             new_name));
522
523         ret = ctdb_event_script_callback(ctdb,
524                                          state,
525                                          ctdb_do_updateip_callback,
526                                          state,
527                                          false,
528                                          CTDB_EVENT_UPDATE_IP,
529                                          "%s %s %s %u",
530                                          state->old->name,
531                                          new_name,
532                                          ctdb_addr_to_str(&vnn->public_address),
533                                          vnn->public_netmask_bits);
534         if (ret != 0) {
535                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
536                                  ctdb_addr_to_str(&vnn->public_address),
537                                  old->name, new_name));
538                 talloc_free(state);
539                 return -1;
540         }
541
542         return 0;
543 }
544
545 /*
546   Find the vnn of the node that has a public ip address
547   returns -1 if the address is not known as a public address
548  */
549 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
550 {
551         struct ctdb_vnn *vnn;
552
553         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
554                 if (ctdb_same_ip(&vnn->public_address, addr)) {
555                         return vnn;
556                 }
557         }
558
559         return NULL;
560 }
561
562 /*
563   take over an ip address
564  */
565 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
566                                  struct ctdb_req_control *c,
567                                  TDB_DATA indata,
568                                  bool *async_reply)
569 {
570         int ret;
571         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
572         struct ctdb_vnn *vnn;
573         bool have_ip = false;
574         bool do_updateip = false;
575         bool do_takeip = false;
576         struct ctdb_iface *best_iface = NULL;
577
578         if (pip->pnn != ctdb->pnn) {
579                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
580                                  "with pnn %d, but we're node %d\n",
581                                  ctdb_addr_to_str(&pip->addr),
582                                  pip->pnn, ctdb->pnn));
583                 return -1;
584         }
585
586         /* update out vnn list */
587         vnn = find_public_ip_vnn(ctdb, &pip->addr);
588         if (vnn == NULL) {
589                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
590                         ctdb_addr_to_str(&pip->addr)));
591                 return 0;
592         }
593
594         have_ip = ctdb_sys_have_ip(&pip->addr);
595         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
596         if (best_iface == NULL) {
597                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
598                                  "a usable interface (old %s, have_ip %d)\n",
599                                  ctdb_addr_to_str(&vnn->public_address),
600                                  vnn->public_netmask_bits,
601                                  ctdb_vnn_iface_string(vnn),
602                                  have_ip));
603                 return -1;
604         }
605
606         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
607                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
608                 have_ip = false;
609         }
610
611         if (vnn->iface == NULL && have_ip) {
612                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
613                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
614                                  ctdb_addr_to_str(&vnn->public_address)));
615                 return 0;
616         }
617
618         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
619                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
620                                   "and we have it on iface[%s], but it was assigned to node %d"
621                                   "and we are node %d, banning ourself\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
624                 ctdb_ban_self(ctdb);
625                 return -1;
626         }
627
628         if (vnn->pnn == -1 && have_ip) {
629                 vnn->pnn = ctdb->pnn;
630                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
631                                   "and we already have it on iface[%s], update local daemon\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                   ctdb_vnn_iface_string(vnn)));
634                 return 0;
635         }
636
637         if (vnn->iface) {
638                 if (vnn->iface->link_up) {
639                         /* only move when the rebalance gains something */
640                         if (vnn->iface->references > (best_iface->references + 1)) {
641                                 do_updateip = true;
642                         }
643                 } else if (vnn->iface != best_iface) {
644                         do_updateip = true;
645                 }
646         }
647
648         if (!have_ip) {
649                 if (do_updateip) {
650                         ctdb_vnn_unassign_iface(ctdb, vnn);
651                         do_updateip = false;
652                 }
653                 do_takeip = true;
654         }
655
656         if (do_takeip) {
657                 ret = ctdb_do_takeip(ctdb, c, vnn);
658                 if (ret != 0) {
659                         return -1;
660                 }
661         } else if (do_updateip) {
662                 ret = ctdb_do_updateip(ctdb, c, vnn);
663                 if (ret != 0) {
664                         return -1;
665                 }
666         } else {
667                 /*
668                  * The interface is up and the kernel known the ip
669                  * => do nothing
670                  */
671                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
672                         ctdb_addr_to_str(&pip->addr),
673                         vnn->public_netmask_bits,
674                         ctdb_vnn_iface_string(vnn)));
675                 return 0;
676         }
677
678         /* tell ctdb_control.c that we will be replying asynchronously */
679         *async_reply = true;
680
681         return 0;
682 }
683
684 /*
685   takeover an ip address old v4 style
686  */
687 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
688                                 struct ctdb_req_control *c,
689                                 TDB_DATA indata, 
690                                 bool *async_reply)
691 {
692         TDB_DATA data;
693         
694         data.dsize = sizeof(struct ctdb_public_ip);
695         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
696         CTDB_NO_MEMORY(ctdb, data.dptr);
697         
698         memcpy(data.dptr, indata.dptr, indata.dsize);
699         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
700 }
701
702 /*
703   kill any clients that are registered with a IP that is being released
704  */
705 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
706 {
707         struct ctdb_client_ip *ip;
708
709         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
710                 ctdb_addr_to_str(addr)));
711
712         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
713                 ctdb_sock_addr tmp_addr;
714
715                 tmp_addr = ip->addr;
716                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
717                         ip->client_id,
718                         ctdb_addr_to_str(&ip->addr)));
719
720                 if (ctdb_same_ip(&tmp_addr, addr)) {
721                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
722                                                                      ip->client_id, 
723                                                                      struct ctdb_client);
724                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
725                                 ip->client_id,
726                                 ctdb_addr_to_str(&ip->addr),
727                                 client->pid));
728
729                         if (client->pid != 0) {
730                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
731                                         (unsigned)client->pid,
732                                         ctdb_addr_to_str(addr),
733                                         ip->client_id));
734                                 kill(client->pid, SIGKILL);
735                         }
736                 }
737         }
738 }
739
740 /*
741   called when releaseip event finishes
742  */
743 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
744                                 void *private_data)
745 {
746         struct takeover_callback_state *state = 
747                 talloc_get_type(private_data, struct takeover_callback_state);
748         TDB_DATA data;
749
750         if (status == -ETIME) {
751                 ctdb_ban_self(ctdb);
752         }
753
754         /* send a message to all clients of this node telling them
755            that the cluster has been reconfigured and they should
756            release any sockets on this IP */
757         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
758         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
759         data.dsize = strlen((char *)data.dptr)+1;
760
761         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
762
763         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
764
765         /* kill clients that have registered with this IP */
766         release_kill_clients(ctdb, state->addr);
767
768         ctdb_vnn_unassign_iface(ctdb, state->vnn);
769
770         /* the control succeeded */
771         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
772         talloc_free(state);
773 }
774
775 /*
776   release an ip address
777  */
778 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
779                                 struct ctdb_req_control *c,
780                                 TDB_DATA indata, 
781                                 bool *async_reply)
782 {
783         int ret;
784         struct takeover_callback_state *state;
785         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
786         struct ctdb_vnn *vnn;
787
788         /* update our vnn list */
789         vnn = find_public_ip_vnn(ctdb, &pip->addr);
790         if (vnn == NULL) {
791                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
792                         ctdb_addr_to_str(&pip->addr)));
793                 return 0;
794         }
795         vnn->pnn = pip->pnn;
796
797         /* stop any previous arps */
798         talloc_free(vnn->takeover_ctx);
799         vnn->takeover_ctx = NULL;
800
801         if (!ctdb_sys_have_ip(&pip->addr)) {
802                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits, 
805                         ctdb_vnn_iface_string(vnn)));
806                 ctdb_vnn_unassign_iface(ctdb, vnn);
807                 return 0;
808         }
809
810         if (vnn->iface == NULL) {
811                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
812                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
813                                  ctdb_addr_to_str(&vnn->public_address)));
814                 return 0;
815         }
816
817         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
818                 ctdb_addr_to_str(&pip->addr),
819                 vnn->public_netmask_bits, 
820                 ctdb_vnn_iface_string(vnn),
821                 pip->pnn));
822
823         state = talloc(ctdb, struct takeover_callback_state);
824         CTDB_NO_MEMORY(ctdb, state);
825
826         state->c = talloc_steal(state, c);
827         state->addr = talloc(state, ctdb_sock_addr);       
828         CTDB_NO_MEMORY(ctdb, state->addr);
829         *state->addr = pip->addr;
830         state->vnn   = vnn;
831
832         ret = ctdb_event_script_callback(ctdb, 
833                                          state, release_ip_callback, state,
834                                          false,
835                                          CTDB_EVENT_RELEASE_IP,
836                                          "%s %s %u",
837                                          ctdb_vnn_iface_string(vnn),
838                                          ctdb_addr_to_str(&pip->addr),
839                                          vnn->public_netmask_bits);
840         if (ret != 0) {
841                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
842                         ctdb_addr_to_str(&pip->addr),
843                         ctdb_vnn_iface_string(vnn)));
844                 talloc_free(state);
845                 return -1;
846         }
847
848         /* tell the control that we will be reply asynchronously */
849         *async_reply = true;
850         return 0;
851 }
852
853 /*
854   release an ip address old v4 style
855  */
856 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
857                                 struct ctdb_req_control *c,
858                                 TDB_DATA indata, 
859                                 bool *async_reply)
860 {
861         TDB_DATA data;
862         
863         data.dsize = sizeof(struct ctdb_public_ip);
864         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
865         CTDB_NO_MEMORY(ctdb, data.dptr);
866         
867         memcpy(data.dptr, indata.dptr, indata.dsize);
868         return ctdb_control_release_ip(ctdb, c, data, async_reply);
869 }
870
871
872 static int ctdb_add_public_address(struct ctdb_context *ctdb,
873                                    ctdb_sock_addr *addr,
874                                    unsigned mask, const char *ifaces)
875 {
876         struct ctdb_vnn      *vnn;
877         uint32_t num = 0;
878         char *tmp;
879         const char *iface;
880         int i;
881         int ret;
882
883         tmp = strdup(ifaces);
884         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
885                 if (!ctdb_sys_check_iface_exists(iface)) {
886                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
887                         free(tmp);
888                         return -1;
889                 }
890         }
891         free(tmp);
892
893         /* Verify that we dont have an entry for this ip yet */
894         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
895                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
896                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
897                                 ctdb_addr_to_str(addr)));
898                         return -1;
899                 }               
900         }
901
902         /* create a new vnn structure for this ip address */
903         vnn = talloc_zero(ctdb, struct ctdb_vnn);
904         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
905         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
906         tmp = talloc_strdup(vnn, ifaces);
907         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
908         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
909                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
910                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
911                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
912                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
913                 num++;
914         }
915         talloc_free(tmp);
916         vnn->ifaces[num] = NULL;
917         vnn->public_address      = *addr;
918         vnn->public_netmask_bits = mask;
919         vnn->pnn                 = -1;
920         if (ctdb_sys_have_ip(addr)) {
921                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
922                 vnn->pnn = ctdb->pnn;
923         }
924
925         for (i=0; vnn->ifaces[i]; i++) {
926                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
927                 if (ret != 0) {
928                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
929                                            "for public_address[%s]\n",
930                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
931                         talloc_free(vnn);
932                         return -1;
933                 }
934                 if (i == 0) {
935                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
936                 }
937         }
938
939         DLIST_ADD(ctdb->vnn, vnn);
940
941         return 0;
942 }
943
/*
  setup the event script directory

  Stores a copy of 'script_dir', allocated with talloc_strdup() on
  'ctdb', in ctdb->event_script_dir and returns 0.  The result of the
  copy is passed to CTDB_NO_MEMORY(); that macro is defined elsewhere
  and presumably logs and returns an error when the allocation
  failed - confirm against its definition.
*/
int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
{
        ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
        CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
        return 0;
}
953
954 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
955                                   struct timeval t, void *private_data)
956 {
957         struct ctdb_context *ctdb = talloc_get_type(private_data, 
958                                                         struct ctdb_context);
959         struct ctdb_vnn *vnn;
960
961         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
962                 int i;
963
964                 for (i=0; vnn->ifaces[i] != NULL; i++) {
965                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
966                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
967                                         vnn->ifaces[i],
968                                         ctdb_addr_to_str(&vnn->public_address)));
969                         }
970                 }
971         }
972
973         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
974                 timeval_current_ofs(30, 0), 
975                 ctdb_check_interfaces_event, ctdb);
976 }
977
978
979 static int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
980 {
981         if (ctdb->check_public_ifaces_ctx != NULL) {
982                 talloc_free(ctdb->check_public_ifaces_ctx);
983                 ctdb->check_public_ifaces_ctx = NULL;
984         }
985
986         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
987         if (ctdb->check_public_ifaces_ctx == NULL) {
988                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
989         }
990
991         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
992                 timeval_current_ofs(30, 0), 
993                 ctdb_check_interfaces_event, ctdb);
994
995         return 0;
996 }
997
998
999 /*
1000   setup the public address lists from a file
1001 */
1002 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
1003 {
1004         char **lines;
1005         int nlines;
1006         int i;
1007
1008         lines = file_lines_load(alist, &nlines, ctdb);
1009         if (lines == NULL) {
1010                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
1011                 return -1;
1012         }
1013         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1014                 nlines--;
1015         }
1016
1017         for (i=0;i<nlines;i++) {
1018                 unsigned mask;
1019                 ctdb_sock_addr addr;
1020                 const char *addrstr;
1021                 const char *ifaces;
1022                 char *tok, *line;
1023
1024                 line = lines[i];
1025                 while ((*line == ' ') || (*line == '\t')) {
1026                         line++;
1027                 }
1028                 if (*line == '#') {
1029                         continue;
1030                 }
1031                 if (strcmp(line, "") == 0) {
1032                         continue;
1033                 }
1034                 tok = strtok(line, " \t");
1035                 addrstr = tok;
1036                 tok = strtok(NULL, " \t");
1037                 if (tok == NULL) {
1038                         if (NULL == ctdb->default_public_interface) {
1039                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1040                                          i+1));
1041                                 talloc_free(lines);
1042                                 return -1;
1043                         }
1044                         ifaces = ctdb->default_public_interface;
1045                 } else {
1046                         ifaces = tok;
1047                 }
1048
1049                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1050                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1051                         talloc_free(lines);
1052                         return -1;
1053                 }
1054                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
1055                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1056                         talloc_free(lines);
1057                         return -1;
1058                 }
1059         }
1060
1061
1062         ctdb_start_monitoring_interfaces(ctdb);
1063
1064         talloc_free(lines);
1065         return 0;
1066 }
1067
1068 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1069                               const char *iface,
1070                               const char *ip)
1071 {
1072         struct ctdb_vnn *svnn;
1073         struct ctdb_iface *cur = NULL;
1074         bool ok;
1075         int ret;
1076
1077         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1078         CTDB_NO_MEMORY(ctdb, svnn);
1079
1080         svnn->ifaces = talloc_array(svnn, const char *, 2);
1081         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1082         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1083         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1084         svnn->ifaces[1] = NULL;
1085
1086         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1087         if (!ok) {
1088                 talloc_free(svnn);
1089                 return -1;
1090         }
1091
1092         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1093         if (ret != 0) {
1094                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1095                                    "for single_ip[%s]\n",
1096                                    svnn->ifaces[0],
1097                                    ctdb_addr_to_str(&svnn->public_address)));
1098                 talloc_free(svnn);
1099                 return -1;
1100         }
1101
1102         /* assume the single public ip interface is initially "good" */
1103         cur = ctdb_find_iface(ctdb, iface);
1104         if (cur == NULL) {
1105                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1106                 return -1;
1107         }
1108         cur->link_up = true;
1109
1110         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1111         if (ret != 0) {
1112                 talloc_free(svnn);
1113                 return -1;
1114         }
1115
1116         ctdb->single_ip_vnn = svnn;
1117         return 0;
1118 }
1119
1120 /* Given a physical node, return the number of
1121    public addresses that is currently assigned to this node.
1122 */
1123 static int node_ip_coverage(struct ctdb_context *ctdb, 
1124         int32_t pnn,
1125         struct ctdb_public_ip_list *ips)
1126 {
1127         int num=0;
1128
1129         for (;ips;ips=ips->next) {
1130                 if (ips->pnn == pnn) {
1131                         num++;
1132                 }
1133         }
1134         return num;
1135 }
1136
1137
1138 /* Check if this is a public ip known to the node, i.e. can that
1139    node takeover this ip ?
1140 */
1141 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1142                 struct ctdb_public_ip_list *ip)
1143 {
1144         struct ctdb_all_public_ips *public_ips;
1145         int i;
1146
1147         public_ips = ctdb->nodes[pnn]->available_public_ips;
1148
1149         if (public_ips == NULL) {
1150                 return -1;
1151         }
1152
1153         for (i=0;i<public_ips->num;i++) {
1154                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1155                         /* yes, this node can serve this public ip */
1156                         return 0;
1157                 }
1158         }
1159
1160         return -1;
1161 }
1162
1163
1164 /* search the node lists list for a node to takeover this ip.
1165    pick the node that currently are serving the least number of ips
1166    so that the ips get spread out evenly.
1167 */
1168 static int find_takeover_node(struct ctdb_context *ctdb, 
1169                 struct ctdb_node_map *nodemap, uint32_t mask, 
1170                 struct ctdb_public_ip_list *ip,
1171                 struct ctdb_public_ip_list *all_ips)
1172 {
1173         int pnn, min=0, num;
1174         int i;
1175
1176         pnn    = -1;
1177         for (i=0;i<nodemap->num;i++) {
1178                 if (nodemap->nodes[i].flags & mask) {
1179                         /* This node is not healty and can not be used to serve
1180                            a public address 
1181                         */
1182                         continue;
1183                 }
1184
1185                 /* verify that this node can serve this ip */
1186                 if (can_node_serve_ip(ctdb, i, ip)) {
1187                         /* no it couldnt   so skip to the next node */
1188                         continue;
1189                 }
1190
1191                 num = node_ip_coverage(ctdb, i, all_ips);
1192                 /* was this the first node we checked ? */
1193                 if (pnn == -1) {
1194                         pnn = i;
1195                         min  = num;
1196                 } else {
1197                         if (num < min) {
1198                                 pnn = i;
1199                                 min  = num;
1200                         }
1201                 }
1202         }       
1203         if (pnn == -1) {
1204                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1205                         ctdb_addr_to_str(&ip->addr)));
1206
1207                 return -1;
1208         }
1209
1210         ip->pnn = pnn;
1211         return 0;
1212 }
1213
1214 #define IP_KEYLEN       4
1215 static uint32_t *ip_key(ctdb_sock_addr *ip)
1216 {
1217         static uint32_t key[IP_KEYLEN];
1218
1219         bzero(key, sizeof(key));
1220
1221         switch (ip->sa.sa_family) {
1222         case AF_INET:
1223                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1224                 break;
1225         case AF_INET6: {
1226                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1227                 key[0]  = htonl(s6_a32[0]);
1228                 key[1]  = htonl(s6_a32[1]);
1229                 key[2]  = htonl(s6_a32[2]);
1230                 key[3]  = htonl(s6_a32[3]);
1231                 break;
1232         }
1233         default:
1234                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1235                 return key;
1236         }
1237
1238         return key;
1239 }
1240
1241 static void *add_ip_callback(void *parm, void *data)
1242 {
1243         struct ctdb_public_ip_list *this_ip = parm; 
1244         struct ctdb_public_ip_list *prev_ip = data; 
1245
1246         if (prev_ip == NULL) {
1247                 return parm;
1248         }
1249         if (this_ip->pnn == -1) {
1250                 this_ip->pnn = prev_ip->pnn;
1251         }
1252
1253         return parm;
1254 }
1255
1256 static int getips_count_callback(void *param, void *data)
1257 {
1258         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1259         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1260
1261         new_ip->next = *ip_list;
1262         *ip_list     = new_ip;
1263         return 0;
1264 }
1265
1266 static struct ctdb_public_ip_list *
1267 create_merged_ip_list(struct ctdb_context *ctdb)
1268 {
1269         int i, j;
1270         struct ctdb_public_ip_list *ip_list;
1271         struct ctdb_all_public_ips *public_ips;
1272
1273         if (ctdb->ip_tree != NULL) {
1274                 talloc_free(ctdb->ip_tree);
1275                 ctdb->ip_tree = NULL;
1276         }
1277         ctdb->ip_tree = trbt_create(ctdb, 0);
1278
1279         for (i=0;i<ctdb->num_nodes;i++) {
1280                 public_ips = ctdb->nodes[i]->known_public_ips;
1281
1282                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1283                         continue;
1284                 }
1285
1286                 /* there were no public ips for this node */
1287                 if (public_ips == NULL) {
1288                         continue;
1289                 }               
1290
1291                 for (j=0;j<public_ips->num;j++) {
1292                         struct ctdb_public_ip_list *tmp_ip; 
1293
1294                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1295                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1296                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1297                         tmp_ip->addr = public_ips->ips[j].addr;
1298                         tmp_ip->next = NULL;
1299
1300                         trbt_insertarray32_callback(ctdb->ip_tree,
1301                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1302                                 add_ip_callback,
1303                                 tmp_ip);
1304                 }
1305         }
1306
1307         ip_list = NULL;
1308         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1309
1310         return ip_list;
1311 }
1312
1313 /* 
1314  * This is the length of the longtest common prefix between the IPs.
1315  * It is calculated by XOR-ing the 2 IPs together and counting the
1316  * number of leading zeroes.  The implementation means that all
1317  * addresses end up being 128 bits long.
1318  *
1319  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1320  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1321  * lots of nodes and IP addresses?
1322  */
1323 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1324 {
1325         uint32_t ip1_k[IP_KEYLEN];
1326         uint32_t *t;
1327         int i;
1328         uint32_t x;
1329
1330         uint32_t distance = 0;
1331
1332         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1333         t = ip_key(ip2);
1334         for (i=0; i<IP_KEYLEN; i++) {
1335                 x = ip1_k[i] ^ t[i];
1336                 if (x == 0) {
1337                         distance += 32;
1338                 } else {
1339                         /* Count number of leading zeroes. 
1340                          * FIXME? This could be optimised...
1341                          */
1342                         while ((x & (1 << 31)) == 0) {
1343                                 x <<= 1;
1344                                 distance += 1;
1345                         }
1346                 }
1347         }
1348
1349         return distance;
1350 }
1351
1352 /* Calculate the IP distance for the given IP relative to IPs on the
1353    given node.  The ips argument is generally the all_ips variable
1354    used in the main part of the algorithm.
1355  */
1356 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1357                                   struct ctdb_public_ip_list *ips,
1358                                   int pnn)
1359 {
1360         struct ctdb_public_ip_list *t;
1361         uint32_t d;
1362
1363         uint32_t sum = 0;
1364
1365         for (t=ips; t != NULL; t=t->next) {
1366                 if (t->pnn != pnn) {
1367                         continue;
1368                 }
1369
1370                 /* Optimisation: We never calculate the distance
1371                  * between an address and itself.  This allows us to
1372                  * calculate the effect of removing an address from a
1373                  * node by simply calculating the distance between
1374                  * that address and all of the exitsing addresses.
1375                  * Moreover, we assume that we're only ever dealing
1376                  * with addresses from all_ips so we can identify an
1377                  * address via a pointer rather than doing a more
1378                  * expensive address comparison. */
1379                 if (&(t->addr) == ip) {
1380                         continue;
1381                 }
1382
1383                 d = ip_distance(ip, &(t->addr));
1384                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1385         }
1386
1387         return sum;
1388 }
1389
1390 /* Return the LCP2 imbalance metric for addresses currently assigned
1391    to the given node.
1392  */
1393 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1394 {
1395         struct ctdb_public_ip_list *t;
1396
1397         uint32_t imbalance = 0;
1398
1399         for (t=all_ips; t!=NULL; t=t->next) {
1400                 if (t->pnn != pnn) {
1401                         continue;
1402                 }
1403                 /* Pass the rest of the IPs rather than the whole
1404                    all_ips input list.
1405                 */
1406                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1407         }
1408
1409         return imbalance;
1410 }
1411
1412 /* Allocate any unassigned IPs just by looping through the IPs and
1413  * finding the best node for each.
1414  */
1415 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1416                                       struct ctdb_node_map *nodemap,
1417                                       uint32_t mask,
1418                                       struct ctdb_public_ip_list *all_ips)
1419 {
1420         struct ctdb_public_ip_list *tmp_ip;
1421
1422         /* loop over all ip's and find a physical node to cover for 
1423            each unassigned ip.
1424         */
1425         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1426                 if (tmp_ip->pnn == -1) {
1427                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1428                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1429                                         ctdb_addr_to_str(&tmp_ip->addr)));
1430                         }
1431                 }
1432         }
1433 }
1434
1435 /* Basic non-deterministic rebalancing algorithm.
1436  */
1437 static bool basic_failback(struct ctdb_context *ctdb,
1438                            struct ctdb_node_map *nodemap,
1439                            uint32_t mask,
1440                            struct ctdb_public_ip_list *all_ips,
1441                            int num_ips,
1442                            int *retries)
1443 {
1444         int i;
1445         int maxnode, maxnum=0, minnode, minnum=0, num;
1446         struct ctdb_public_ip_list *tmp_ip;
1447
1448         /* for each ip address, loop over all nodes that can serve
1449            this ip and make sure that the difference between the node
1450            serving the most and the node serving the least ip's are
1451            not greater than 1.
1452         */
1453         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1454                 if (tmp_ip->pnn == -1) {
1455                         continue;
1456                 }
1457
1458                 /* Get the highest and lowest number of ips's served by any 
1459                    valid node which can serve this ip.
1460                 */
1461                 maxnode = -1;
1462                 minnode = -1;
1463                 for (i=0;i<nodemap->num;i++) {
1464                         if (nodemap->nodes[i].flags & mask) {
1465                                 continue;
1466                         }
1467
1468                         /* only check nodes that can actually serve this ip */
1469                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1470                                 /* no it couldnt   so skip to the next node */
1471                                 continue;
1472                         }
1473
1474                         num = node_ip_coverage(ctdb, i, all_ips);
1475                         if (maxnode == -1) {
1476                                 maxnode = i;
1477                                 maxnum  = num;
1478                         } else {
1479                                 if (num > maxnum) {
1480                                         maxnode = i;
1481                                         maxnum  = num;
1482                                 }
1483                         }
1484                         if (minnode == -1) {
1485                                 minnode = i;
1486                                 minnum  = num;
1487                         } else {
1488                                 if (num < minnum) {
1489                                         minnode = i;
1490                                         minnum  = num;
1491                                 }
1492                         }
1493                 }
1494                 if (maxnode == -1) {
1495                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1496                                 ctdb_addr_to_str(&tmp_ip->addr)));
1497
1498                         continue;
1499                 }
1500
1501                 /* If we want deterministic IPs then dont try to reallocate 
1502                    them to spread out the load.
1503                 */
1504                 if (1 == ctdb->tunable.deterministic_public_ips) {
1505                         continue;
1506                 }
1507
1508                 /* if the spread between the smallest and largest coverage by
1509                    a node is >=2 we steal one of the ips from the node with
1510                    most coverage to even things out a bit.
1511                    try to do this a limited number of times since we dont
1512                    want to spend too much time balancing the ip coverage.
1513                 */
1514                 if ( (maxnum > minnum+1)
1515                      && (*retries < (num_ips + 5)) ){
1516                         struct ctdb_public_ip_list *tmp;
1517
1518                         /* mark one of maxnode's vnn's as unassigned and try
1519                            again
1520                         */
1521                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1522                                 if (tmp->pnn == maxnode) {
1523                                         tmp->pnn = -1;
1524                                         (*retries)++;
1525                                         return true;
1526                                 }
1527                         }
1528                 }
1529         }
1530
1531         return false;
1532 }
1533
/* One pending request to force rebalancing onto a node.  The requests
   are kept in a singly linked list. */
struct ctdb_rebalancenodes {
        struct ctdb_rebalancenodes *next; /* next pending request, or NULL */
        uint32_t pnn;                     /* node the request is for */
};
/* Head of the list of pending requests: lcp2_forcerebalance() adds
   entries, lcp2_init() processes and frees them. */
static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1539
1540
1541 /* set this flag to force the node to be rebalanced even if it just didnt
1542    become healthy again.
1543 */
1544 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1545 {
1546         struct ctdb_rebalancenodes *rebalance;
1547
1548         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1549                 if (rebalance->pnn == pnn) {
1550                         return;
1551                 }
1552         }
1553
1554         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1555         rebalance->pnn = pnn;
1556         rebalance->next = force_rebalance_list;
1557         force_rebalance_list = rebalance;
1558 }
1559
1560 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1561  * that we can unit test it.
1562  */
1563 static void lcp2_init(struct ctdb_context * tmp_ctx,
1564                struct ctdb_node_map * nodemap,
1565                uint32_t mask,
1566                struct ctdb_public_ip_list *all_ips,
1567                uint32_t **lcp2_imbalances,
1568                bool **newly_healthy)
1569 {
1570         int i;
1571         struct ctdb_public_ip_list *tmp_ip;
1572
1573         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1574         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1575         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1576         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1577
1578         for (i=0;i<nodemap->num;i++) {
1579                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1580                 /* First step: is the node "healthy"? */
1581                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1582         }
1583
1584         /* 2nd step: if a ndoe has IPs assigned then it must have been
1585          * healthy before, so we remove it from consideration... */
1586         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1587                 if (tmp_ip->pnn != -1) {
1588                         (*newly_healthy)[tmp_ip->pnn] = false;
1589                 }
1590         }
1591
1592         /* 3rd step: if a node is forced to re-balance then
1593            we allow failback onto the node */
1594         while (force_rebalance_list != NULL) {
1595                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1596
1597                 if (force_rebalance_list->pnn <= nodemap->num) {
1598                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1599                 }
1600
1601                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1602                 talloc_free(force_rebalance_list);
1603                 force_rebalance_list = next;
1604         }
1605 }
1606
1607 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1608  * the IP/node combination that will cost the least.
1609  */
1610 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1611                               struct ctdb_node_map *nodemap,
1612                               uint32_t mask,
1613                               struct ctdb_public_ip_list *all_ips,
1614                               uint32_t *lcp2_imbalances)
1615 {
1616         struct ctdb_public_ip_list *tmp_ip;
1617         int dstnode;
1618
1619         int minnode;
1620         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1621         struct ctdb_public_ip_list *minip;
1622
1623         bool should_loop = true;
1624         bool have_unassigned = true;
1625
1626         while (have_unassigned && should_loop) {
1627                 should_loop = false;
1628
1629                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1630                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1631
1632                 minnode = -1;
1633                 mindsum = 0;
1634                 minip = NULL;
1635
1636                 /* loop over each unassigned ip. */
1637                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638                         if (tmp_ip->pnn != -1) {
1639                                 continue;
1640                         }
1641
1642                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1643                                 /* only check nodes that can actually serve this ip */
1644                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1645                                         /* no it couldnt   so skip to the next node */
1646                                         continue;
1647                                 }
1648                                 if (nodemap->nodes[dstnode].flags & mask) {
1649                                         continue;
1650                                 }
1651
1652                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1653                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1654                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1655                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1656                                                    dstnode,
1657                                                    dstimbl - lcp2_imbalances[dstnode]));
1658
1659
1660                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1661                                         minnode = dstnode;
1662                                         minimbl = dstimbl;
1663                                         mindsum = dstdsum;
1664                                         minip = tmp_ip;
1665                                         should_loop = true;
1666                                 }
1667                         }
1668                 }
1669
1670                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1671
1672                 /* If we found one then assign it to the given node. */
1673                 if (minnode != -1) {
1674                         minip->pnn = minnode;
1675                         lcp2_imbalances[minnode] = minimbl;
1676                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1677                                           ctdb_addr_to_str(&(minip->addr)),
1678                                           minnode,
1679                                           mindsum));
1680                 }
1681
1682                 /* There might be a better way but at least this is clear. */
1683                 have_unassigned = false;
1684                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1685                         if (tmp_ip->pnn == -1) {
1686                                 have_unassigned = true;
1687                         }
1688                 }
1689         }
1690
1691         /* We know if we have an unassigned addresses so we might as
1692          * well optimise.
1693          */
1694         if (have_unassigned) {
1695                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1696                         if (tmp_ip->pnn == -1) {
1697                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1698                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1699                         }
1700                 }
1701         }
1702 }
1703
1704 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1705  * to move IPs from, determines the best IP/destination node
1706  * combination to move from the source node.
1707  */
1708 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1709                                     struct ctdb_node_map *nodemap,
1710                                     struct ctdb_public_ip_list *all_ips,
1711                                     int srcnode,
1712                                     uint32_t candimbl,
1713                                     uint32_t *lcp2_imbalances,
1714                                     bool *newly_healthy)
1715 {
1716         int dstnode, mindstnode;
1717         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1718         uint32_t minsrcimbl, mindstimbl;
1719         struct ctdb_public_ip_list *minip;
1720         struct ctdb_public_ip_list *tmp_ip;
1721
1722         /* Find an IP and destination node that best reduces imbalance. */
1723         minip = NULL;
1724         minsrcimbl = 0;
1725         mindstnode = -1;
1726         mindstimbl = 0;
1727
1728         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1729         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1730
1731         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1732                 /* Only consider addresses on srcnode. */
1733                 if (tmp_ip->pnn != srcnode) {
1734                         continue;
1735                 }
1736
1737                 /* What is this IP address costing the source node? */
1738                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1739                 srcimbl = candimbl - srcdsum;
1740
1741                 /* Consider this IP address would cost each potential
1742                  * destination node.  Destination nodes are limited to
1743                  * those that are newly healthy, since we don't want
1744                  * to do gratuitous failover of IPs just to make minor
1745                  * balance improvements.
1746                  */
1747                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1748                         if (! newly_healthy[dstnode]) {
1749                                 continue;
1750                         }
1751                         /* only check nodes that can actually serve this ip */
1752                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1753                                 /* no it couldnt   so skip to the next node */
1754                                 continue;
1755                         }
1756
1757                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1758                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1759                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1760                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1761                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1762                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1763
1764                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1765                             ((mindstnode == -1) ||                              \
1766                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1767
1768                                 minip = tmp_ip;
1769                                 minsrcimbl = srcimbl;
1770                                 mindstnode = dstnode;
1771                                 mindstimbl = dstimbl;
1772                         }
1773                 }
1774         }
1775         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1776
1777         if (mindstnode != -1) {
1778                 /* We found a move that makes things better... */
1779                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1780                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1781                                   ctdb_addr_to_str(&(minip->addr)),
1782                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1783
1784
1785                 lcp2_imbalances[srcnode] = srcimbl;
1786                 lcp2_imbalances[mindstnode] = mindstimbl;
1787                 minip->pnn = mindstnode;
1788
1789                 return true;
1790         }
1791
1792         return false;
1793         
1794 }
1795
/* An LCP2 imbalance value together with the node number it belongs
 * to, so that imbalances can be sorted without losing track of the
 * node.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison function for struct lcp2_imbalance_pnn.
 *
 * Orders entries by DESCENDING imbalance: returns -1 when the first
 * entry has the larger imbalance, 1 when it has the smaller one and
 * 0 when both are equal.  The pnn field does not take part in the
 * comparison.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *first = a;
        const struct lcp2_imbalance_pnn *second = b;

        if (first->imbalance == second->imbalance) {
                return 0;
        }

        return (first->imbalance > second->imbalance) ? -1 : 1;
}
1814
1815 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1816  * node with the highest LCP2 imbalance, and then determines the best
1817  * IP/destination node combination to move from the source node.
1818  */
1819 static bool lcp2_failback(struct ctdb_context *ctdb,
1820                           struct ctdb_node_map *nodemap,
1821                           uint32_t mask,
1822                           struct ctdb_public_ip_list *all_ips,
1823                           uint32_t *lcp2_imbalances,
1824                           bool *newly_healthy)
1825 {
1826         int i, num_newly_healthy;
1827         struct lcp2_imbalance_pnn * lips;
1828         bool ret;
1829
1830         /* It is only worth continuing if we have suitable target
1831          * nodes to transfer IPs to.  This check is much cheaper than
1832          * continuing on...
1833          */
1834         num_newly_healthy = 0;
1835         for (i = 0; i < nodemap->num; i++) {
1836                 if (newly_healthy[i]) {
1837                         num_newly_healthy++;
1838                 }
1839         }
1840         if (num_newly_healthy == 0) {
1841                 return false;
1842         }
1843
1844         /* Put the imbalances and nodes into an array, sort them and
1845          * iterate through candidates.  Usually the 1st one will be
1846          * used, so this doesn't cost much...
1847          */
1848         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
1849         for (i = 0; i < nodemap->num; i++) {
1850                 lips[i].imbalance = lcp2_imbalances[i];
1851                 lips[i].pnn = i;
1852         }
1853         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
1854               lcp2_cmp_imbalance_pnn);
1855
1856         ret = false;
1857         for (i = 0; i < nodemap->num; i++) {
1858                 /* This means that all nodes had 0 or 1 addresses, so
1859                  * can't be imbalanced.
1860                  */
1861                 if (lips[i].imbalance == 0) {
1862                         break;
1863                 }
1864
1865                 if (lcp2_failback_candidate(ctdb,
1866                                             nodemap,
1867                                             all_ips,
1868                                             lips[i].pnn,
1869                                             lips[i].imbalance,
1870                                             lcp2_imbalances,
1871                                             newly_healthy)) {
1872                         ret = true;
1873                         break;
1874                 }
1875         }
1876
1877         talloc_free(lips);
1878         return ret;
1879 }
1880
1881 /* The calculation part of the IP allocation algorithm. */
1882 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1883                                    struct ctdb_node_map *nodemap,
1884                                    struct ctdb_public_ip_list **all_ips_p)
1885 {
1886         int i, num_healthy, retries, num_ips;
1887         uint32_t mask;
1888         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1889         uint32_t *lcp2_imbalances;
1890         bool *newly_healthy;
1891
1892         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1893
1894         /* Count how many completely healthy nodes we have */
1895         num_healthy = 0;
1896         for (i=0;i<nodemap->num;i++) {
1897                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1898                         num_healthy++;
1899                 }
1900         }
1901
1902         if (num_healthy > 0) {
1903                 /* We have healthy nodes, so only consider them for 
1904                    serving public addresses
1905                 */
1906                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1907         } else {
1908                 /* We didnt have any completely healthy nodes so
1909                    use "disabled" nodes as a fallback
1910                 */
1911                 mask = NODE_FLAGS_INACTIVE;
1912         }
1913
1914         /* since nodes only know about those public addresses that
1915            can be served by that particular node, no single node has
1916            a full list of all public addresses that exist in the cluster.
1917            Walk over all node structures and create a merged list of
1918            all public addresses that exist in the cluster.
1919
1920            keep the tree of ips around as ctdb->ip_tree
1921         */
1922         all_ips = create_merged_ip_list(ctdb);
1923         *all_ips_p = all_ips; /* minimal code changes */
1924
1925         /* Count how many ips we have */
1926         num_ips = 0;
1927         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1928                 num_ips++;
1929         }
1930
1931         /* If we want deterministic ip allocations, i.e. that the ip addresses
1932            will always be allocated the same way for a specific set of
1933            available/unavailable nodes.
1934         */
1935         if (1 == ctdb->tunable.deterministic_public_ips) {              
1936                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1937                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1938                         tmp_ip->pnn = i%nodemap->num;
1939                 }
1940         }
1941
1942
1943         /* mark all public addresses with a masked node as being served by
1944            node -1
1945         */
1946         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1947                 if (tmp_ip->pnn == -1) {
1948                         continue;
1949                 }
1950                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1951                         tmp_ip->pnn = -1;
1952                 }
1953         }
1954
1955         /* verify that the assigned nodes can serve that public ip
1956            and set it to -1 if not
1957         */
1958         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1959                 if (tmp_ip->pnn == -1) {
1960                         continue;
1961                 }
1962                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1963                         /* this node can not serve this ip. */
1964                         tmp_ip->pnn = -1;
1965                 }
1966         }
1967
1968         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1969                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
1970         }
1971
1972         /* now we must redistribute all public addresses with takeover node
1973            -1 among the nodes available
1974         */
1975         retries = 0;
1976 try_again:
1977         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1978                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
1979         } else {
1980                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
1981         }
1982
1983         /* If we dont want ips to fail back after a node becomes healthy
1984            again, we wont even try to reallocat the ip addresses so that
1985            they are evenly spread out.
1986            This can NOT be used at the same time as DeterministicIPs !
1987         */
1988         if (1 == ctdb->tunable.no_ip_failback) {
1989                 if (1 == ctdb->tunable.deterministic_public_ips) {
1990                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1991                 }
1992                 goto finished;
1993         }
1994
1995
1996         /* now, try to make sure the ip adresses are evenly distributed
1997            across the node.
1998         */
1999         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2000                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
2001                         goto try_again;
2002                 }
2003         } else {
2004                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
2005                         goto try_again;
2006                 }
2007         }
2008
2009         /* finished distributing the public addresses, now just send the 
2010            info out to the nodes
2011         */
2012 finished:
2013
2014         /* at this point ->pnn is the node which will own each IP
2015            or -1 if there is no node that can cover this ip
2016         */
2017
2018         return;
2019 }
2020
/*
  make any IP alias changes for public addresses that are necessary

  Steps, in order:
   1. ctdb_takeover_run_core() computes the desired owner (->pnn) of
      every public address.
   2. Every node not flagged NODE_FLAGS_DISCONNECTED is sent a release
      control for each address it is not the designated owner of, and
      all replies are waited for.
   3. The designated owner of each address (pnn != -1) is sent a
      takeover control, and all replies are waited for.
   4. The "ipreallocated" event script is run on the connected nodes.

  When the disable_ip_failover tunable is non-zero, steps 1-3 are
  skipped and only step 4 is performed.

  Returns -1 if a release/takeover control could not be sent or the
  wait for its replies failed; returns 0 otherwise.  A failure of the
  final "ipreallocated" control is only logged.
 */
int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
{
        int i;
        struct ctdb_public_ip ip;
        struct ctdb_public_ipv4 ipv4;
        uint32_t *nodes;
        struct ctdb_public_ip_list *all_ips, *tmp_ip;
        TDB_DATA data;
        struct timeval timeout;
        struct client_async_data *async_data;
        struct ctdb_client_control_state *state;
        /* parent for everything allocated here; freed on every exit path */
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);

        /*
         * ip failover is completely disabled, just send out the 
         * ipreallocated event.
         */
        if (ctdb->tunable.disable_ip_failover != 0) {
                goto ipreallocated;
        }

        ZERO_STRUCT(ip);

        /* Do the IP reassignment calculations */
        ctdb_takeover_run_core(ctdb, nodemap, &all_ips);

        /* now tell all nodes to delete any alias that they should not
           have.  This will be a NOOP on nodes that don't currently
           hold the given alias */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */
                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }

                for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                        if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
                                /* This node should be serving this
                                   vnn so dont tell it to release the ip
                                */
                                continue;
                        }
                        /* AF_INET addresses use the IPv4-specific
                           control and payload; every other family
                           uses the generic one */
                        if (tmp_ip->addr.sa.sa_family == AF_INET) {
                                ipv4.pnn = tmp_ip->pnn;
                                ipv4.sin = tmp_ip->addr.ip;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ipv4);
                                data.dptr  = (uint8_t *)&ipv4;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IPv4, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        } else {
                                ip.pnn  = tmp_ip->pnn;
                                ip.addr = tmp_ip->addr;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ip);
                                data.dptr  = (uint8_t *)&ip;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IP, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        }

                        if (state == NULL) {
                                DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
                                talloc_free(tmp_ctx);
                                return -1;
                        }

                        ctdb_client_async_add(async_data, state);
                }
        }
        /* wait for the release phase to finish before any takeover
           control is sent */
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
        talloc_free(async_data);


        /* tell all nodes to get their own IPs */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        /* this IP won't be taken over */
                        continue;
                }

                if (tmp_ip->addr.sa.sa_family == AF_INET) {
                        ipv4.pnn = tmp_ip->pnn;
                        ipv4.sin = tmp_ip->addr.ip;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ipv4);
                        data.dptr  = (uint8_t *)&ipv4;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
                                        data, async_data,
                                        &timeout, NULL);
                } else {
                        ip.pnn  = tmp_ip->pnn;
                        ip.addr = tmp_ip->addr;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ip);
                        data.dptr  = (uint8_t *)&ip;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IP, 0,
                                        data, async_data,
                                        &timeout, NULL);
                }
                if (state == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
                        talloc_free(tmp_ctx);
                        return -1;
                }

                ctdb_client_async_add(async_data, state);
        }
        /* this second async_data is not freed explicitly; it is a
           child of tmp_ctx and goes away with it */
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

ipreallocated:
        /* tell all nodes to update natgw */
        /* run the "ipreallocated" event script on all connected nodes;
           the payload is the event name including its terminating NUL */
        data.dptr  = discard_const("ipreallocated");
        data.dsize = strlen((char *)data.dptr) + 1; 
        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
        /* NOTE(review): the error text below mentions "updatenatgw"
           although the event sent is "ipreallocated"; a failure here
           is logged only and does not change the return value */
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
                                      nodes, 0, TAKEOVER_TIMEOUT(),
                                      false, data,
                                      NULL, NULL,
                                      NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
        }

        talloc_free(tmp_ctx);
        return 0;
}
2173
2174
2175 /*
2176   destroy a ctdb_client_ip structure
2177  */
2178 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2179 {
2180         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2181                 ctdb_addr_to_str(&ip->addr),
2182                 ntohs(ip->addr.ip.sin_port),
2183                 ip->client_id));
2184
2185         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2186         return 0;
2187 }
2188
2189 /*
2190   called by a client to inform us of a TCP connection that it is managing
2191   that should tickled with an ACK when IP takeover is done
2192   we handle both the old ipv4 style of packets as well as the new ipv4/6
2193   pdus.
2194  */
2195 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2196                                 TDB_DATA indata)
2197 {
2198         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2199         struct ctdb_control_tcp *old_addr = NULL;
2200         struct ctdb_control_tcp_addr new_addr;
2201         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2202         struct ctdb_tcp_list *tcp;
2203         struct ctdb_tcp_connection t;
2204         int ret;
2205         TDB_DATA data;
2206         struct ctdb_client_ip *ip;
2207         struct ctdb_vnn *vnn;
2208         ctdb_sock_addr addr;
2209
2210         switch (indata.dsize) {
2211         case sizeof(struct ctdb_control_tcp):
2212                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2213                 ZERO_STRUCT(new_addr);
2214                 tcp_sock = &new_addr;
2215                 tcp_sock->src.ip  = old_addr->src;
2216                 tcp_sock->dest.ip = old_addr->dest;
2217                 break;
2218         case sizeof(struct ctdb_control_tcp_addr):
2219                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2220                 break;
2221         default:
2222                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2223                                  "to ctdb_control_tcp_client. size was %d but "
2224                                  "only allowed sizes are %lu and %lu\n",
2225                                  (int)indata.dsize,
2226                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2227                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2228                 return -1;
2229         }
2230
2231         addr = tcp_sock->src;
2232         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2233         addr = tcp_sock->dest;
2234         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2235
2236         ZERO_STRUCT(addr);
2237         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2238         vnn = find_public_ip_vnn(ctdb, &addr);
2239         if (vnn == NULL) {
2240                 switch (addr.sa.sa_family) {
2241                 case AF_INET:
2242                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2243                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2244                                         ctdb_addr_to_str(&addr)));
2245                         }
2246                         break;
2247                 case AF_INET6:
2248                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2249                                 ctdb_addr_to_str(&addr)));
2250                         break;
2251                 default:
2252                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2253                 }
2254
2255                 return 0;
2256         }
2257
2258         if (vnn->pnn != ctdb->pnn) {
2259                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2260                         ctdb_addr_to_str(&addr),
2261                         client_id, client->pid));
2262                 /* failing this call will tell smbd to die */
2263                 return -1;
2264         }
2265
2266         ip = talloc(client, struct ctdb_client_ip);
2267         CTDB_NO_MEMORY(ctdb, ip);
2268
2269         ip->ctdb      = ctdb;
2270         ip->addr      = addr;
2271         ip->client_id = client_id;
2272         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2273         DLIST_ADD(ctdb->client_ip_list, ip);
2274
2275         tcp = talloc(client, struct ctdb_tcp_list);
2276         CTDB_NO_MEMORY(ctdb, tcp);
2277
2278         tcp->connection.src_addr = tcp_sock->src;
2279         tcp->connection.dst_addr = tcp_sock->dest;
2280
2281         DLIST_ADD(client->tcp_list, tcp);
2282
2283         t.src_addr = tcp_sock->src;
2284         t.dst_addr = tcp_sock->dest;
2285
2286         data.dptr = (uint8_t *)&t;
2287         data.dsize = sizeof(t);
2288
2289         switch (addr.sa.sa_family) {
2290         case AF_INET:
2291                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2292                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2293                         ctdb_addr_to_str(&tcp_sock->src),
2294                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2295                 break;
2296         case AF_INET6:
2297                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2298                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2299                         ctdb_addr_to_str(&tcp_sock->src),
2300                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2301                 break;
2302         default:
2303                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2304         }
2305
2306
2307         /* tell all nodes about this tcp connection */
2308         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2309                                        CTDB_CONTROL_TCP_ADD,
2310                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2311         if (ret != 0) {
2312                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2313                 return -1;
2314         }
2315
2316         return 0;
2317 }
2318
2319 /*
2320   find a tcp address on a list
2321  */
2322 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2323                                            struct ctdb_tcp_connection *tcp)
2324 {
2325         int i;
2326
2327         if (array == NULL) {
2328                 return NULL;
2329         }
2330
2331         for (i=0;i<array->num;i++) {
2332                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2333                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2334                         return &array->connections[i];
2335                 }
2336         }
2337         return NULL;
2338 }
2339
2340
2341
2342 /*
2343   called by a daemon to inform us of a TCP connection that one of its
2344   clients managing that should tickled with an ACK when IP takeover is
2345   done
2346  */
2347 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2348 {
2349         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2350         struct ctdb_tcp_array *tcparray;
2351         struct ctdb_tcp_connection tcp;
2352         struct ctdb_vnn *vnn;
2353
2354         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2355         if (vnn == NULL) {
2356                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2357                         ctdb_addr_to_str(&p->dst_addr)));
2358
2359                 return -1;
2360         }
2361
2362
2363         tcparray = vnn->tcp_array;
2364
2365         /* If this is the first tickle */
2366         if (tcparray == NULL) {
2367                 tcparray = talloc_size(ctdb->nodes, 
2368                         offsetof(struct ctdb_tcp_array, connections) +
2369                         sizeof(struct ctdb_tcp_connection) * 1);
2370                 CTDB_NO_MEMORY(ctdb, tcparray);
2371                 vnn->tcp_array = tcparray;
2372
2373                 tcparray->num = 0;
2374                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2375                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2376
2377                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2378                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2379                 tcparray->num++;
2380
2381                 if (tcp_update_needed) {
2382                         vnn->tcp_update_needed = true;
2383                 }
2384                 return 0;
2385         }
2386
2387
2388         /* Do we already have this tickle ?*/
2389         tcp.src_addr = p->src_addr;
2390         tcp.dst_addr = p->dst_addr;
2391         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2392                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2393                         ctdb_addr_to_str(&tcp.dst_addr),
2394                         ntohs(tcp.dst_addr.ip.sin_port),
2395                         vnn->pnn));
2396                 return 0;
2397         }
2398
2399         /* A new tickle, we must add it to the array */
2400         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2401                                         struct ctdb_tcp_connection,
2402                                         tcparray->num+1);
2403         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2404
2405         vnn->tcp_array = tcparray;
2406         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2407         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2408         tcparray->num++;
2409                                 
2410         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2411                 ctdb_addr_to_str(&tcp.dst_addr),
2412                 ntohs(tcp.dst_addr.ip.sin_port),
2413                 vnn->pnn));
2414
2415         if (tcp_update_needed) {
2416                 vnn->tcp_update_needed = true;
2417         }
2418
2419         return 0;
2420 }
2421
2422
2423 /*
2424   called by a daemon to inform us of a TCP connection that one of its
2425   clients managing that should tickled with an ACK when IP takeover is
2426   done
2427  */
2428 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2429 {
2430         struct ctdb_tcp_connection *tcpp;
2431         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2432
2433         if (vnn == NULL) {
2434                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2435                         ctdb_addr_to_str(&conn->dst_addr)));
2436                 return;
2437         }
2438
2439         /* if the array is empty we cant remove it
2440            and we dont need to do anything
2441          */
2442         if (vnn->tcp_array == NULL) {
2443                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2444                         ctdb_addr_to_str(&conn->dst_addr),
2445                         ntohs(conn->dst_addr.ip.sin_port)));
2446                 return;
2447         }
2448
2449
2450         /* See if we know this connection
2451            if we dont know this connection  then we dont need to do anything
2452          */
2453         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2454         if (tcpp == NULL) {
2455                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2456                         ctdb_addr_to_str(&conn->dst_addr),
2457                         ntohs(conn->dst_addr.ip.sin_port)));
2458                 return;
2459         }
2460
2461
2462         /* We need to remove this entry from the array.
2463            Instead of allocating a new array and copying data to it
2464            we cheat and just copy the last entry in the existing array
2465            to the entry that is to be removed and just shring the 
2466            ->num field
2467          */
2468         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2469         vnn->tcp_array->num--;
2470
2471         /* If we deleted the last entry we also need to remove the entire array
2472          */
2473         if (vnn->tcp_array->num == 0) {
2474                 talloc_free(vnn->tcp_array);
2475                 vnn->tcp_array = NULL;
2476         }               
2477
2478         vnn->tcp_update_needed = true;
2479
2480         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2481                 ctdb_addr_to_str(&conn->src_addr),
2482                 ntohs(conn->src_addr.ip.sin_port)));
2483 }
2484
2485
2486 /*
2487   called by a daemon to inform us of a TCP connection that one of its
2488   clients used are no longer needed in the tickle database
2489  */
2490 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2491 {
2492         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2493
2494         ctdb_remove_tcp_connection(ctdb, conn);
2495
2496         return 0;
2497 }
2498
2499
/*
  called when a daemon restarts - meant to send all tickles for all
  public addresses we are serving immediately to the new node.

  currently a stub: neither argument is used and 0 is always returned
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        int32_t status = 0;

        return status;
}
2509
2510
2511 /*
2512   called when a client structure goes away - hook to remove
2513   elements from the tcp_list in all daemons
2514  */
2515 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2516 {
2517         while (client->tcp_list) {
2518                 struct ctdb_tcp_list *tcp = client->tcp_list;
2519                 DLIST_REMOVE(client->tcp_list, tcp);
2520                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2521         }
2522 }
2523
2524
2525 /*
2526   release all IPs on shutdown
2527  */
2528 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2529 {
2530         struct ctdb_vnn *vnn;
2531
2532         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2533                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2534                         ctdb_vnn_unassign_iface(ctdb, vnn);
2535                         continue;
2536                 }
2537                 if (!vnn->iface) {
2538                         continue;
2539                 }
2540                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2541                                   ctdb_vnn_iface_string(vnn),
2542                                   ctdb_addr_to_str(&vnn->public_address),
2543                                   vnn->public_netmask_bits);
2544                 release_kill_clients(ctdb, &vnn->public_address);
2545                 ctdb_vnn_unassign_iface(ctdb, vnn);
2546         }
2547 }
2548
2549
2550 /*
2551   get list of public IPs
2552  */
2553 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2554                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2555 {
2556         int i, num, len;
2557         struct ctdb_all_public_ips *ips;
2558         struct ctdb_vnn *vnn;
2559         bool only_available = false;
2560
2561         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2562                 only_available = true;
2563         }
2564
2565         /* count how many public ip structures we have */
2566         num = 0;
2567         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2568                 num++;
2569         }
2570
2571         len = offsetof(struct ctdb_all_public_ips, ips) + 
2572                 num*sizeof(struct ctdb_public_ip);
2573         ips = talloc_zero_size(outdata, len);
2574         CTDB_NO_MEMORY(ctdb, ips);
2575
2576         i = 0;
2577         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2578                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2579                         continue;
2580                 }
2581                 ips->ips[i].pnn  = vnn->pnn;
2582                 ips->ips[i].addr = vnn->public_address;
2583                 i++;
2584         }
2585         ips->num = i;
2586         len = offsetof(struct ctdb_all_public_ips, ips) +
2587                 i*sizeof(struct ctdb_public_ip);
2588
2589         outdata->dsize = len;
2590         outdata->dptr  = (uint8_t *)ips;
2591
2592         return 0;
2593 }
2594
2595
2596 /*
2597   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2598  */
2599 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2600                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2601 {
2602         int i, num, len;
2603         struct ctdb_all_public_ipsv4 *ips;
2604         struct ctdb_vnn *vnn;
2605
2606         /* count how many public ip structures we have */
2607         num = 0;
2608         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2609                 if (vnn->public_address.sa.sa_family != AF_INET) {
2610                         continue;
2611                 }
2612                 num++;
2613         }
2614
2615         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2616                 num*sizeof(struct ctdb_public_ipv4);
2617         ips = talloc_zero_size(outdata, len);
2618         CTDB_NO_MEMORY(ctdb, ips);
2619
2620         outdata->dsize = len;
2621         outdata->dptr  = (uint8_t *)ips;
2622
2623         ips->num = num;
2624         i = 0;
2625         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2626                 if (vnn->public_address.sa.sa_family != AF_INET) {
2627                         continue;
2628                 }
2629                 ips->ips[i].pnn = vnn->pnn;
2630                 ips->ips[i].sin = vnn->public_address.ip;
2631                 i++;
2632         }
2633
2634         return 0;
2635 }
2636
2637 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2638                                         struct ctdb_req_control *c,
2639                                         TDB_DATA indata,
2640                                         TDB_DATA *outdata)
2641 {
2642         int i, num, len;
2643         ctdb_sock_addr *addr;
2644         struct ctdb_control_public_ip_info *info;
2645         struct ctdb_vnn *vnn;
2646
2647         addr = (ctdb_sock_addr *)indata.dptr;
2648
2649         vnn = find_public_ip_vnn(ctdb, addr);
2650         if (vnn == NULL) {
2651                 /* if it is not a public ip   it could be our 'single ip' */
2652                 if (ctdb->single_ip_vnn) {
2653                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2654                                 vnn = ctdb->single_ip_vnn;
2655                         }
2656                 }
2657         }
2658         if (vnn == NULL) {
2659                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2660                                  "'%s'not a public address\n",
2661                                  ctdb_addr_to_str(addr)));
2662                 return -1;
2663         }
2664
2665         /* count how many public ip structures we have */
2666         num = 0;
2667         for (;vnn->ifaces[num];) {
2668                 num++;
2669         }
2670
2671         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2672                 num*sizeof(struct ctdb_control_iface_info);
2673         info = talloc_zero_size(outdata, len);
2674         CTDB_NO_MEMORY(ctdb, info);
2675
2676         info->ip.addr = vnn->public_address;
2677         info->ip.pnn = vnn->pnn;
2678         info->active_idx = 0xFFFFFFFF;
2679
2680         for (i=0; vnn->ifaces[i]; i++) {
2681                 struct ctdb_iface *cur;
2682
2683                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2684                 if (cur == NULL) {
2685                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2686                                            vnn->ifaces[i]));
2687                         return -1;
2688                 }
2689                 if (vnn->iface == cur) {
2690                         info->active_idx = i;
2691                 }
2692                 strcpy(info->ifaces[i].name, cur->name);
2693                 info->ifaces[i].link_state = cur->link_up;
2694                 info->ifaces[i].references = cur->references;
2695         }
2696         info->num = i;
2697         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2698                 i*sizeof(struct ctdb_control_iface_info);
2699
2700         outdata->dsize = len;
2701         outdata->dptr  = (uint8_t *)info;
2702
2703         return 0;
2704 }
2705
2706 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2707                                 struct ctdb_req_control *c,
2708                                 TDB_DATA *outdata)
2709 {
2710         int i, num, len;
2711         struct ctdb_control_get_ifaces *ifaces;
2712         struct ctdb_iface *cur;
2713
2714         /* count how many public ip structures we have */
2715         num = 0;
2716         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2717                 num++;
2718         }
2719
2720         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2721                 num*sizeof(struct ctdb_control_iface_info);
2722         ifaces = talloc_zero_size(outdata, len);
2723         CTDB_NO_MEMORY(ctdb, ifaces);
2724
2725         i = 0;
2726         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2727                 strcpy(ifaces->ifaces[i].name, cur->name);
2728                 ifaces->ifaces[i].link_state = cur->link_up;
2729                 ifaces->ifaces[i].references = cur->references;
2730                 i++;
2731         }
2732         ifaces->num = i;
2733         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2734                 i*sizeof(struct ctdb_control_iface_info);
2735
2736         outdata->dsize = len;
2737         outdata->dptr  = (uint8_t *)ifaces;
2738
2739         return 0;
2740 }
2741
2742 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2743                                     struct ctdb_req_control *c,
2744                                     TDB_DATA indata)
2745 {
2746         struct ctdb_control_iface_info *info;
2747         struct ctdb_iface *iface;
2748         bool link_up = false;
2749
2750         info = (struct ctdb_control_iface_info *)indata.dptr;
2751
2752         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2753                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2754                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2755                                   len, len, info->name));
2756                 return -1;
2757         }
2758
2759         switch (info->link_state) {
2760         case 0:
2761                 link_up = false;
2762                 break;
2763         case 1:
2764                 link_up = true;
2765                 break;
2766         default:
2767                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2768                                   (unsigned int)info->link_state));
2769                 return -1;
2770         }
2771
2772         if (info->references != 0) {
2773                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2774                                   (unsigned int)info->references));
2775                 return -1;
2776         }
2777
2778         iface = ctdb_find_iface(ctdb, info->name);
2779         if (iface == NULL) {
2780                 return -1;
2781         }
2782
2783         if (link_up == iface->link_up) {
2784                 return 0;
2785         }
2786
2787         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2788               ("iface[%s] has changed it's link status %s => %s\n",
2789                iface->name,
2790                iface->link_up?"up":"down",
2791                link_up?"up":"down"));
2792
2793         iface->link_up = link_up;
2794         return 0;
2795 }
2796
2797
2798 /* 
2799    structure containing the listening socket and the list of tcp connections
2800    that the ctdb daemon is to kill
2801 */
2802 struct ctdb_kill_tcp {
2803         struct ctdb_vnn *vnn;
2804         struct ctdb_context *ctdb;
2805         int capture_fd;
2806         struct fd_event *fde;
2807         trbt_tree_t *connections;
2808         void *private_data;
2809 };
2810
2811 /*
2812   a tcp connection that is to be killed
2813  */
2814 struct ctdb_killtcp_con {
2815         ctdb_sock_addr src_addr;
2816         ctdb_sock_addr dst_addr;
2817         int count;
2818         struct ctdb_kill_tcp *killtcp;
2819 };
2820
2821 /* this function is used to create a key to represent this socketpair
2822    in the killtcp tree.
2823    this key is used to insert and lookup matching socketpairs that are
2824    to be tickled and RST
2825 */
2826 #define KILLTCP_KEYLEN  10
2827 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2828 {
2829         static uint32_t key[KILLTCP_KEYLEN];
2830
2831         bzero(key, sizeof(key));
2832
2833         if (src->sa.sa_family != dst->sa.sa_family) {
2834                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2835                 return key;
2836         }
2837         
2838         switch (src->sa.sa_family) {
2839         case AF_INET:
2840                 key[0]  = dst->ip.sin_addr.s_addr;
2841                 key[1]  = src->ip.sin_addr.s_addr;
2842                 key[2]  = dst->ip.sin_port;
2843                 key[3]  = src->ip.sin_port;
2844                 break;
2845         case AF_INET6: {
2846                 uint32_t *dst6_addr32 =
2847                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2848                 uint32_t *src6_addr32 =
2849                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2850                 key[0]  = dst6_addr32[3];
2851                 key[1]  = src6_addr32[3];
2852                 key[2]  = dst6_addr32[2];
2853                 key[3]  = src6_addr32[2];
2854                 key[4]  = dst6_addr32[1];
2855                 key[5]  = src6_addr32[1];
2856                 key[6]  = dst6_addr32[0];
2857                 key[7]  = src6_addr32[0];
2858                 key[8]  = dst->ip6.sin6_port;
2859                 key[9]  = src->ip6.sin6_port;
2860                 break;
2861         }
2862         default:
2863                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2864                 return key;
2865         }
2866
2867         return key;
2868 }
2869
2870 /*
2871   called when we get a read event on the raw socket
2872  */
2873 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2874                                 uint16_t flags, void *private_data)
2875 {
2876         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2877         struct ctdb_killtcp_con *con;
2878         ctdb_sock_addr src, dst;
2879         uint32_t ack_seq, seq;
2880
2881         if (!(flags & EVENT_FD_READ)) {
2882                 return;
2883         }
2884
2885         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2886                                 killtcp->private_data,
2887                                 &src, &dst,
2888                                 &ack_seq, &seq) != 0) {
2889                 /* probably a non-tcp ACK packet */
2890                 return;
2891         }
2892
2893         /* check if we have this guy in our list of connections
2894            to kill
2895         */
2896         con = trbt_lookuparray32(killtcp->connections, 
2897                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2898         if (con == NULL) {
2899                 /* no this was some other packet we can just ignore */
2900                 return;
2901         }
2902
2903         /* This one has been tickled !
2904            now reset him and remove him from the list.
2905          */
2906         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2907                 ntohs(con->dst_addr.ip.sin_port),
2908                 ctdb_addr_to_str(&con->src_addr),
2909                 ntohs(con->src_addr.ip.sin_port)));
2910
2911         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2912         talloc_free(con);
2913 }
2914
2915
2916 /* when traversing the list of all tcp connections to send tickle acks to
2917    (so that we can capture the ack coming back and kill the connection
2918     by a RST)
2919    this callback is called for each connection we are currently trying to kill
2920 */
2921 static int tickle_connection_traverse(void *param, void *data)
2922 {
2923         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2924
2925         /* have tried too many times, just give up */
2926         if (con->count >= 5) {
2927                 /* can't delete in traverse: reparent to delete_cons */
2928                 talloc_steal(param, con);
2929                 return 0;
2930         }
2931
2932         /* othervise, try tickling it again */
2933         con->count++;
2934         ctdb_sys_send_tcp(
2935                 (ctdb_sock_addr *)&con->dst_addr,
2936                 (ctdb_sock_addr *)&con->src_addr,
2937                 0, 0, 0);
2938         return 0;
2939 }
2940
2941
2942 /* 
2943    called every second until all sentenced connections have been reset
2944  */
2945 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2946                                               struct timeval t, void *private_data)
2947 {
2948         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2949         void *delete_cons = talloc_new(NULL);
2950
2951         /* loop over all connections sending tickle ACKs */
2952         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2953
2954         /* now we've finished traverse, it's safe to do deletion. */
2955         talloc_free(delete_cons);
2956
2957         /* If there are no more connections to kill we can remove the
2958            entire killtcp structure
2959          */
2960         if ( (killtcp->connections == NULL) || 
2961              (killtcp->connections->root == NULL) ) {
2962                 talloc_free(killtcp);
2963                 return;
2964         }
2965
2966         /* try tickling them again in a seconds time
2967          */
2968         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2969                         ctdb_tickle_sentenced_connections, killtcp);
2970 }
2971
2972 /*
2973   destroy the killtcp structure
2974  */
2975 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2976 {
2977         struct ctdb_vnn *tmpvnn;
2978
2979         /* verify that this vnn is still active */
2980         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
2981                 if (tmpvnn == killtcp->vnn) {
2982                         break;
2983                 }
2984         }
2985
2986         if (tmpvnn == NULL) {
2987                 return 0;
2988         }
2989
2990         if (killtcp->vnn->killtcp != killtcp) {
2991                 return 0;
2992         }
2993
2994         killtcp->vnn->killtcp = NULL;
2995
2996         return 0;
2997 }
2998
2999
/* insert callback for the killtcp tree: unconditionally replace any
   existing connection structure ('data') with the new one ('parm').

   the old structure is deliberately not freed here.
   NOTE(review): according to the original note it is talloc_stolen by
   the same tree node and goes away when the new data is deleted -
   confirm against the rb_tree implementation.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
3011
3012 /*
3013   add a tcp socket to the list of connections we want to RST
3014  */
3015 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3016                                        ctdb_sock_addr *s,
3017                                        ctdb_sock_addr *d)
3018 {
3019         ctdb_sock_addr src, dst;
3020         struct ctdb_kill_tcp *killtcp;
3021         struct ctdb_killtcp_con *con;
3022         struct ctdb_vnn *vnn;
3023
3024         ctdb_canonicalize_ip(s, &src);
3025         ctdb_canonicalize_ip(d, &dst);
3026
3027         vnn = find_public_ip_vnn(ctdb, &dst);
3028         if (vnn == NULL) {
3029                 vnn = find_public_ip_vnn(ctdb, &src);
3030         }
3031         if (vnn == NULL) {
3032                 /* if it is not a public ip   it could be our 'single ip' */
3033                 if (ctdb->single_ip_vnn) {
3034                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3035                                 vnn = ctdb->single_ip_vnn;
3036                         }
3037                 }
3038         }
3039         if (vnn == NULL) {
3040                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3041                 return -1;
3042         }
3043
3044         killtcp = vnn->killtcp;
3045         
3046         /* If this is the first connection to kill we must allocate
3047            a new structure
3048          */
3049         if (killtcp == NULL) {
3050                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3051                 CTDB_NO_MEMORY(ctdb, killtcp);
3052
3053                 killtcp->vnn         = vnn;
3054                 killtcp->ctdb        = ctdb;
3055                 killtcp->capture_fd  = -1;
3056                 killtcp->connections = trbt_create(killtcp, 0);
3057
3058                 vnn->killtcp         = killtcp;
3059                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3060         }
3061
3062
3063
3064         /* create a structure that describes this connection we want to
3065            RST and store it in killtcp->connections
3066         */
3067         con = talloc(killtcp, struct ctdb_killtcp_con);
3068         CTDB_NO_MEMORY(ctdb, con);
3069         con->src_addr = src;
3070         con->dst_addr = dst;
3071         con->count    = 0;
3072         con->killtcp  = killtcp;
3073
3074
3075         trbt_insertarray32_callback(killtcp->connections,
3076                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3077                         add_killtcp_callback, con);
3078
3079         /* 
3080            If we dont have a socket to listen on yet we must create it
3081          */
3082         if (killtcp->capture_fd == -1) {
3083                 const char *iface = ctdb_vnn_iface_string(vnn);
3084                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3085                 if (killtcp->capture_fd == -1) {
3086                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3087                                           "socket on iface '%s' for killtcp (%s)\n",
3088                                           iface, strerror(errno)));
3089                         goto failed;
3090                 }
3091         }
3092
3093
3094         if (killtcp->fde == NULL) {
3095                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3096                                             EVENT_FD_READ,
3097                                             capture_tcp_handler, killtcp);
3098                 tevent_fd_set_auto_close(killtcp->fde);
3099
3100                 /* We also need to set up some events to tickle all these connections
3101                    until they are all reset
3102                 */
3103                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3104                                 ctdb_tickle_sentenced_connections, killtcp);
3105         }
3106
3107         /* tickle him once now */
3108         ctdb_sys_send_tcp(
3109                 &con->dst_addr,
3110                 &con->src_addr,
3111                 0, 0, 0);
3112
3113         return 0;
3114
3115 failed:
3116         talloc_free(vnn->killtcp);
3117         vnn->killtcp = NULL;
3118         return -1;
3119 }
3120
3121 /*
3122   kill a TCP connection.
3123  */
3124 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3125 {
3126         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3127
3128         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3129 }
3130
3131 /*
3132   called by a daemon to inform us of the entire list of TCP tickles for
3133   a particular public address.
3134   this control should only be sent by the node that is currently serving
3135   that public address.
3136  */
3137 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3138 {
3139         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3140         struct ctdb_tcp_array *tcparray;
3141         struct ctdb_vnn *vnn;
3142
3143         /* We must at least have tickles.num or else we cant verify the size
3144            of the received data blob
3145          */
3146         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3147                                         tickles.connections)) {
3148                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3149                 return -1;
3150         }
3151
3152         /* verify that the size of data matches what we expect */
3153         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3154                                 tickles.connections)
3155                          + sizeof(struct ctdb_tcp_connection)
3156                                  * list->tickles.num) {
3157                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3158                 return -1;
3159         }       
3160
3161         vnn = find_public_ip_vnn(ctdb, &list->addr);
3162         if (vnn == NULL) {
3163                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3164                         ctdb_addr_to_str(&list->addr)));
3165
3166                 return 1;
3167         }
3168
3169         /* remove any old ticklelist we might have */
3170         talloc_free(vnn->tcp_array);
3171         vnn->tcp_array = NULL;
3172
3173         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3174         CTDB_NO_MEMORY(ctdb, tcparray);
3175
3176         tcparray->num = list->tickles.num;
3177
3178         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3179         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3180
3181         memcpy(tcparray->connections, &list->tickles.connections[0], 
3182                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3183
3184         /* We now have a new fresh tickle list array for this vnn */
3185         vnn->tcp_array = talloc_steal(vnn, tcparray);
3186         
3187         return 0;
3188 }
3189
3190 /*
3191   called to return the full list of tickles for the puclic address associated 
3192   with the provided vnn
3193  */
3194 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3195 {
3196         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3197         struct ctdb_control_tcp_tickle_list *list;
3198         struct ctdb_tcp_array *tcparray;
3199         int num;
3200         struct ctdb_vnn *vnn;
3201
3202         vnn = find_public_ip_vnn(ctdb, addr);
3203         if (vnn == NULL) {
3204                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3205                         ctdb_addr_to_str(addr)));
3206
3207                 return 1;
3208         }
3209
3210         tcparray = vnn->tcp_array;
3211         if (tcparray) {
3212                 num = tcparray->num;
3213         } else {
3214                 num = 0;
3215         }
3216
3217         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3218                                 tickles.connections)
3219                         + sizeof(struct ctdb_tcp_connection) * num;
3220
3221         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3222         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3223         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3224
3225         list->addr = *addr;
3226         list->tickles.num = num;
3227         if (num) {
3228                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3229                         sizeof(struct ctdb_tcp_connection) * num);
3230         }
3231
3232         return 0;
3233 }
3234
3235
3236 /*
3237   set the list of all tcp tickles for a public address
3238  */
3239 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3240                               struct timeval timeout, uint32_t destnode, 
3241                               ctdb_sock_addr *addr,
3242                               struct ctdb_tcp_array *tcparray)
3243 {
3244         int ret, num;
3245         TDB_DATA data;
3246         struct ctdb_control_tcp_tickle_list *list;
3247
3248         if (tcparray) {
3249                 num = tcparray->num;
3250         } else {
3251                 num = 0;
3252         }
3253
3254         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3255                                 tickles.connections) +
3256                         sizeof(struct ctdb_tcp_connection) * num;
3257         data.dptr = talloc_size(ctdb, data.dsize);
3258         CTDB_NO_MEMORY(ctdb, data.dptr);
3259
3260         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3261         list->addr = *addr;
3262         list->tickles.num = num;
3263         if (tcparray) {
3264                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3265         }
3266
3267         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3268                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3269                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3270         if (ret != 0) {
3271                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3272                 return -1;
3273         }
3274
3275         talloc_free(data.dptr);
3276
3277         return ret;
3278 }
3279
3280
3281 /*
3282   perform tickle updates if required
3283  */
3284 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3285                                 struct timed_event *te, 
3286                                 struct timeval t, void *private_data)
3287 {
3288         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3289         int ret;
3290         struct ctdb_vnn *vnn;
3291
3292         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3293                 /* we only send out updates for public addresses that 
3294                    we have taken over
3295                  */
3296                 if (ctdb->pnn != vnn->pnn) {
3297                         continue;
3298                 }
3299                 /* We only send out the updates if we need to */
3300                 if (!vnn->tcp_update_needed) {
3301                         continue;
3302                 }
3303                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3304                                 TAKEOVER_TIMEOUT(),
3305                                 CTDB_BROADCAST_CONNECTED,
3306                                 &vnn->public_address,
3307                                 vnn->tcp_array);
3308                 if (ret != 0) {
3309                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3310                                 ctdb_addr_to_str(&vnn->public_address)));
3311                 }
3312         }
3313
3314         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3315                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3316                              ctdb_update_tcp_tickles, ctdb);
3317 }               
3318         
3319
3320 /*
3321   start periodic update of tcp tickles
3322  */
3323 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3324 {
3325         ctdb->tickle_update_context = talloc_new(ctdb);
3326
3327         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3328                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3329                              ctdb_update_tcp_tickles, ctdb);
3330 }
3331
3332
3333
3334
3335 struct control_gratious_arp {
3336         struct ctdb_context *ctdb;
3337         ctdb_sock_addr addr;
3338         const char *iface;
3339         int count;
3340 };
3341
3342 /*
3343   send a control_gratuitous arp
3344  */
3345 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3346                                   struct timeval t, void *private_data)
3347 {
3348         int ret;
3349         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3350                                                         struct control_gratious_arp);
3351
3352         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3353         if (ret != 0) {
3354                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3355                                  arp->iface, strerror(errno)));
3356         }
3357
3358
3359         arp->count++;
3360         if (arp->count == CTDB_ARP_REPEAT) {
3361                 talloc_free(arp);
3362                 return;
3363         }
3364
3365         event_add_timed(arp->ctdb->ev, arp, 
3366                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3367                         send_gratious_arp, arp);
3368 }
3369
3370
3371 /*
3372   send a gratious arp 
3373  */
3374 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3375 {
3376         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3377         struct control_gratious_arp *arp;
3378
3379         /* verify the size of indata */
3380         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3381                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3382                                  (unsigned)indata.dsize, 
3383                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3384                 return -1;
3385         }
3386         if (indata.dsize != 
3387                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3388                 + gratious_arp->len ) ){
3389
3390                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3391                         "but should be %u bytes\n", 
3392                          (unsigned)indata.dsize, 
3393                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3394                 return -1;
3395         }
3396
3397
3398         arp = talloc(ctdb, struct control_gratious_arp);
3399         CTDB_NO_MEMORY(ctdb, arp);
3400
3401         arp->ctdb  = ctdb;
3402         arp->addr   = gratious_arp->addr;
3403         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3404         CTDB_NO_MEMORY(ctdb, arp->iface);
3405         arp->count = 0;
3406         
3407         event_add_timed(arp->ctdb->ev, arp, 
3408                         timeval_zero(), send_gratious_arp, arp);
3409
3410         return 0;
3411 }
3412
3413 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3414 {
3415         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3416         int ret;
3417
3418         /* verify the size of indata */
3419         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3420                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3421                 return -1;
3422         }
3423         if (indata.dsize != 
3424                 ( offsetof(struct ctdb_control_ip_iface, iface)
3425                 + pub->len ) ){
3426
3427                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3428                         "but should be %u bytes\n", 
3429                          (unsigned)indata.dsize, 
3430                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3431                 return -1;
3432         }
3433
3434         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
3435
3436         if (ret != 0) {
3437                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3438                 return -1;
3439         }
3440
3441         return 0;
3442 }
3443
3444 /*
3445   called when releaseip event finishes for del_public_address
3446  */
3447 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
3448                                 void *private_data)
3449 {
3450         talloc_free(private_data);
3451 }
3452
3453 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3454 {
3455         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3456         struct ctdb_vnn *vnn;
3457         int ret;
3458
3459         /* verify the size of indata */
3460         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3461                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3462                 return -1;
3463         }
3464         if (indata.dsize != 
3465                 ( offsetof(struct ctdb_control_ip_iface, iface)
3466                 + pub->len ) ){
3467
3468                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3469                         "but should be %u bytes\n", 
3470                          (unsigned)indata.dsize, 
3471                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3472                 return -1;
3473         }
3474
3475         /* walk over all public addresses until we find a match */
3476         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3477                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3478                         TALLOC_CTX *mem_ctx;
3479
3480                         DLIST_REMOVE(ctdb->vnn, vnn);
3481                         if (vnn->pnn != ctdb->pnn) {
3482                                 if (vnn->iface != NULL) {
3483                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3484                                 }
3485                                 talloc_free(vnn);
3486                                 return 0;
3487                         }
3488                         vnn->pnn = -1;
3489
3490                         mem_ctx = talloc_new(ctdb);
3491                         talloc_steal(mem_ctx, vnn);
3492                         ret = ctdb_event_script_callback(ctdb, 
3493                                          mem_ctx, delete_ip_callback, mem_ctx,
3494                                          false,
3495                                          CTDB_EVENT_RELEASE_IP,
3496                                          "%s %s %u",
3497                                          ctdb_vnn_iface_string(vnn),
3498                                          ctdb_addr_to_str(&vnn->public_address),
3499                                          vnn->public_netmask_bits);
3500                         if (vnn->iface != NULL) {
3501                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3502                         }
3503                         if (ret != 0) {
3504                                 return -1;
3505                         }
3506                         return 0;
3507                 }
3508         }
3509
3510         return -1;
3511 }
3512
3513 /* This function is called from the recovery daemon to verify that a remote
3514    node has the expected ip allocation.
3515    This is verified against ctdb->ip_tree
3516 */
3517 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3518 {
3519         struct ctdb_public_ip_list *tmp_ip; 
3520         int i;
3521
3522         if (ctdb->ip_tree == NULL) {
3523                 /* dont know the expected allocation yet, assume remote node
3524                    is correct. */
3525                 return 0;
3526         }
3527
3528         if (ips == NULL) {
3529                 return 0;
3530         }
3531
3532         for (i=0; i<ips->num; i++) {
3533                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3534                 if (tmp_ip == NULL) {
3535                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3536                         return -1;
3537                 }
3538
3539                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3540                         continue;
3541                 }
3542
3543                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3544                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3545                         return -1;
3546                 }
3547         }
3548
3549         return 0;
3550 }
3551
3552 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3553 {
3554         struct ctdb_public_ip_list *tmp_ip; 
3555
3556         if (ctdb->ip_tree == NULL) {
3557                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3558                 return -1;
3559         }
3560
3561         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3562         if (tmp_ip == NULL) {
3563                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3564                 return -1;
3565         }
3566
3567         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3568         tmp_ip->pnn = ip->pnn;
3569
3570         return 0;
3571 }