830b751b03625efd20ab5f0ca9f78515c440c62e
[vlendec/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tevent/tevent.h"
23 #include "lib/tdb/include/tdb.h"
24 #include "lib/util/dlinklist.h"
25 #include "system/network.h"
26 #include "system/filesys.h"
27 #include "system/wait.h"
28 #include "../include/ctdb_private.h"
29 #include "../common/rb_tree.h"
30
31
/*
  timeout value built from the takeover_timeout tunable;
  expands to an expression that needs a 'ctdb' variable in scope
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* seconds part of the delay between two runs of ctdb_control_send_arp() */
#define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp() stops rescheduling itself after this many runs */
#define CTDB_ARP_REPEAT   3
36
/*
  a local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
        /* linkage for the ctdb->ifaces list */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;

        /* interface name; lookups compare it with strcmp() */
        const char *name;

        /* only interfaces with link_up set are chosen for new addresses */
        bool link_up;

        /* number of public addresses currently assigned to this interface */
        uint32_t references;
};
43
44 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
45 {
46         if (vnn->iface) {
47                 return vnn->iface->name;
48         }
49
50         return "__none__";
51 }
52
53 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
54 {
55         struct ctdb_iface *i;
56
57         /* Verify that we dont have an entry for this ip yet */
58         for (i=ctdb->ifaces;i;i=i->next) {
59                 if (strcmp(i->name, iface) == 0) {
60                         return 0;
61                 }
62         }
63
64         /* create a new structure for this interface */
65         i = talloc_zero(ctdb, struct ctdb_iface);
66         CTDB_NO_MEMORY_FATAL(ctdb, i);
67         i->name = talloc_strdup(i, iface);
68         CTDB_NO_MEMORY(ctdb, i->name);
69         i->link_up = false;
70
71         DLIST_ADD(ctdb->ifaces, i);
72
73         return 0;
74 }
75
76 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
77                                           const char *iface)
78 {
79         struct ctdb_iface *i;
80
81         /* Verify that we dont have an entry for this ip yet */
82         for (i=ctdb->ifaces;i;i=i->next) {
83                 if (strcmp(i->name, iface) == 0) {
84                         return i;
85                 }
86         }
87
88         return NULL;
89 }
90
91 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
92                                               struct ctdb_vnn *vnn)
93 {
94         int i;
95         struct ctdb_iface *cur = NULL;
96         struct ctdb_iface *best = NULL;
97
98         for (i=0; vnn->ifaces[i]; i++) {
99
100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
101                 if (cur == NULL) {
102                         continue;
103                 }
104
105                 if (!cur->link_up) {
106                         continue;
107                 }
108
109                 if (best == NULL) {
110                         best = cur;
111                         continue;
112                 }
113
114                 if (cur->references < best->references) {
115                         best = cur;
116                         continue;
117                 }
118         }
119
120         return best;
121 }
122
123 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
124                                      struct ctdb_vnn *vnn)
125 {
126         struct ctdb_iface *best = NULL;
127
128         if (vnn->iface) {
129                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
130                                    "still assigned to iface '%s'\n",
131                                    ctdb_addr_to_str(&vnn->public_address),
132                                    ctdb_vnn_iface_string(vnn)));
133                 return 0;
134         }
135
136         best = ctdb_vnn_best_iface(ctdb, vnn);
137         if (best == NULL) {
138                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
139                                   "cannot assign to iface any iface\n",
140                                   ctdb_addr_to_str(&vnn->public_address)));
141                 return -1;
142         }
143
144         vnn->iface = best;
145         best->references++;
146         vnn->pnn = ctdb->pnn;
147
148         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
149                            "now assigned to iface '%s' refs[%d]\n",
150                            ctdb_addr_to_str(&vnn->public_address),
151                            ctdb_vnn_iface_string(vnn),
152                            best->references));
153         return 0;
154 }
155
156 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
157                                     struct ctdb_vnn *vnn)
158 {
159         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
160                            "now unassigned (old iface '%s' refs[%d])\n",
161                            ctdb_addr_to_str(&vnn->public_address),
162                            ctdb_vnn_iface_string(vnn),
163                            vnn->iface?vnn->iface->references:0));
164         if (vnn->iface) {
165                 vnn->iface->references--;
166         }
167         vnn->iface = NULL;
168         if (vnn->pnn == ctdb->pnn) {
169                 vnn->pnn = -1;
170         }
171 }
172
173 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
174                                struct ctdb_vnn *vnn)
175 {
176         int i;
177
178         if (vnn->iface && vnn->iface->link_up) {
179                 return true;
180         }
181
182         for (i=0; vnn->ifaces[i]; i++) {
183                 struct ctdb_iface *cur;
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (cur->link_up) {
191                         return true;
192                 }
193         }
194
195         return false;
196 }
197
198 struct ctdb_takeover_arp {
199         struct ctdb_context *ctdb;
200         uint32_t count;
201         ctdb_sock_addr addr;
202         struct ctdb_tcp_array *tcparray;
203         struct ctdb_vnn *vnn;
204 };
205
206
207 /*
208   lists of tcp endpoints
209  */
210 struct ctdb_tcp_list {
211         struct ctdb_tcp_list *prev, *next;
212         struct ctdb_tcp_connection connection;
213 };
214
215 /*
216   list of clients to kill on IP release
217  */
218 struct ctdb_client_ip {
219         struct ctdb_client_ip *prev, *next;
220         struct ctdb_context *ctdb;
221         ctdb_sock_addr addr;
222         uint32_t client_id;
223 };
224
225
226 /*
227   send a gratuitous arp
228  */
229 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
230                                   struct timeval t, void *private_data)
231 {
232         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
233                                                         struct ctdb_takeover_arp);
234         int i, ret;
235         struct ctdb_tcp_array *tcparray;
236         const char *iface = ctdb_vnn_iface_string(arp->vnn);
237
238         ret = ctdb_sys_send_arp(&arp->addr, iface);
239         if (ret != 0) {
240                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
241                                   iface, strerror(errno)));
242         }
243
244         tcparray = arp->tcparray;
245         if (tcparray) {
246                 for (i=0;i<tcparray->num;i++) {
247                         struct ctdb_tcp_connection *tcon;
248
249                         tcon = &tcparray->connections[i];
250                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
251                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
252                                 ctdb_addr_to_str(&tcon->src_addr),
253                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
254                         ret = ctdb_sys_send_tcp(
255                                 &tcon->src_addr, 
256                                 &tcon->dst_addr,
257                                 0, 0, 0);
258                         if (ret != 0) {
259                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
260                                         ctdb_addr_to_str(&tcon->src_addr)));
261                         }
262                 }
263         }
264
265         arp->count++;
266
267         if (arp->count == CTDB_ARP_REPEAT) {
268                 talloc_free(arp);
269                 return;
270         }
271
272         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
273                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
274                         ctdb_control_send_arp, arp);
275 }
276
277 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
278                                        struct ctdb_vnn *vnn)
279 {
280         struct ctdb_takeover_arp *arp;
281         struct ctdb_tcp_array *tcparray;
282
283         if (!vnn->takeover_ctx) {
284                 vnn->takeover_ctx = talloc_new(vnn);
285                 if (!vnn->takeover_ctx) {
286                         return -1;
287                 }
288         }
289
290         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
291         if (!arp) {
292                 return -1;
293         }
294
295         arp->ctdb = ctdb;
296         arp->addr = vnn->public_address;
297         arp->vnn  = vnn;
298
299         tcparray = vnn->tcp_array;
300         if (tcparray) {
301                 /* add all of the known tcp connections for this IP to the
302                    list of tcp connections to send tickle acks for */
303                 arp->tcparray = talloc_steal(arp, tcparray);
304
305                 vnn->tcp_array = NULL;
306                 vnn->tcp_update_needed = true;
307         }
308
309         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
310                         timeval_zero(), ctdb_control_send_arp, arp);
311
312         return 0;
313 }
314
315 struct takeover_callback_state {
316         struct ctdb_req_control *c;
317         ctdb_sock_addr *addr;
318         struct ctdb_vnn *vnn;
319 };
320
/*
  state handed to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
        /* the control request to reply to once the event has finished */
        struct ctdb_req_control *c;

        /* the public address entry being taken over */
        struct ctdb_vnn *vnn;
};
325
326 /*
327   called when takeip event finishes
328  */
329 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
330                                     void *private_data)
331 {
332         struct ctdb_do_takeip_state *state =
333                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
334         int32_t ret;
335         TDB_DATA data;
336
337         if (status != 0) {
338                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339         
340                 if (status == -ETIME) {
341                         ctdb_ban_self(ctdb);
342                 }
343                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
344                                  ctdb_addr_to_str(&state->vnn->public_address),
345                                  ctdb_vnn_iface_string(state->vnn)));
346                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347
348                 node->flags |= NODE_FLAGS_UNHEALTHY;
349                 talloc_free(state);
350                 return;
351         }
352
353         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
354         if (ret != 0) {
355                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
356                 talloc_free(state);
357                 return;
358         }
359
360         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
361         data.dsize = strlen((char *)data.dptr) + 1;
362         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
363
364         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
365
366
367         /* the control succeeded */
368         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
369         talloc_free(state);
370         return;
371 }
372
373 /*
374   take over an ip address
375  */
376 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
377                               struct ctdb_req_control *c,
378                               struct ctdb_vnn *vnn)
379 {
380         int ret;
381         struct ctdb_do_takeip_state *state;
382
383         ret = ctdb_vnn_assign_iface(ctdb, vnn);
384         if (ret != 0) {
385                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
386                                  "assin a usable interface\n",
387                                  ctdb_addr_to_str(&vnn->public_address),
388                                  vnn->public_netmask_bits));
389                 return -1;
390         }
391
392         state = talloc(vnn, struct ctdb_do_takeip_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(ctdb, c);
396         state->vnn   = vnn;
397
398         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
399                             ctdb_addr_to_str(&vnn->public_address),
400                             vnn->public_netmask_bits,
401                             ctdb_vnn_iface_string(vnn)));
402
403         ret = ctdb_event_script_callback(ctdb,
404                                          state,
405                                          ctdb_do_takeip_callback,
406                                          state,
407                                          false,
408                                          CTDB_EVENT_TAKE_IP,
409                                          "%s %s %u",
410                                          ctdb_vnn_iface_string(vnn),
411                                          ctdb_addr_to_str(&vnn->public_address),
412                                          vnn->public_netmask_bits);
413
414         if (ret != 0) {
415                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
416                         ctdb_addr_to_str(&vnn->public_address),
417                         ctdb_vnn_iface_string(vnn)));
418                 talloc_free(state);
419                 return -1;
420         }
421
422         return 0;
423 }
424
/*
  state handed to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
        /* the control request to reply to once the event has finished */
        struct ctdb_req_control *c;

        /* the interface the address is being moved away from */
        struct ctdb_iface *old;

        /* the public address entry being moved */
        struct ctdb_vnn *vnn;
};
430
431 /*
432   called when updateip event finishes
433  */
434 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
435                                       void *private_data)
436 {
437         struct ctdb_do_updateip_state *state =
438                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
439         int32_t ret;
440
441         if (status != 0) {
442                 if (status == -ETIME) {
443                         ctdb_ban_self(ctdb);
444                 }
445                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
446                         ctdb_addr_to_str(&state->vnn->public_address),
447                         state->old->name,
448                         ctdb_vnn_iface_string(state->vnn)));
449
450                 /*
451                  * All we can do is reset the old interface
452                  * and let the next run fix it
453                  */
454                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
455                 state->vnn->iface = state->old;
456                 state->vnn->iface->references++;
457
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459                 talloc_free(state);
460                 return;
461         }
462
463         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
464         if (ret != 0) {
465                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
466                 talloc_free(state);
467                 return;
468         }
469
470         /* the control succeeded */
471         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
472         talloc_free(state);
473         return;
474 }
475
476 /*
477   update (move) an ip address
478  */
479 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
480                                 struct ctdb_req_control *c,
481                                 struct ctdb_vnn *vnn)
482 {
483         int ret;
484         struct ctdb_do_updateip_state *state;
485         struct ctdb_iface *old = vnn->iface;
486         const char *new_name;
487
488         ctdb_vnn_unassign_iface(ctdb, vnn);
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
492                                  "assin a usable interface (old iface '%s')\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits,
495                                  old->name));
496                 return -1;
497         }
498
499         new_name = ctdb_vnn_iface_string(vnn);
500         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
501                 /* A benign update from one interface onto itself.
502                  * no need to run the eventscripts in this case, just return
503                  * success.
504                  */
505                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
506                 return 0;
507         }
508
509         state = talloc(vnn, struct ctdb_do_updateip_state);
510         CTDB_NO_MEMORY(ctdb, state);
511
512         state->c = talloc_steal(ctdb, c);
513         state->old = old;
514         state->vnn = vnn;
515
516         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
517                             "interface %s to %s\n",
518                             ctdb_addr_to_str(&vnn->public_address),
519                             vnn->public_netmask_bits,
520                             old->name,
521                             new_name));
522
523         ret = ctdb_event_script_callback(ctdb,
524                                          state,
525                                          ctdb_do_updateip_callback,
526                                          state,
527                                          false,
528                                          CTDB_EVENT_UPDATE_IP,
529                                          "%s %s %s %u",
530                                          state->old->name,
531                                          new_name,
532                                          ctdb_addr_to_str(&vnn->public_address),
533                                          vnn->public_netmask_bits);
534         if (ret != 0) {
535                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
536                                  ctdb_addr_to_str(&vnn->public_address),
537                                  old->name, new_name));
538                 talloc_free(state);
539                 return -1;
540         }
541
542         return 0;
543 }
544
545 /*
546   Find the vnn of the node that has a public ip address
547   returns -1 if the address is not known as a public address
548  */
549 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
550 {
551         struct ctdb_vnn *vnn;
552
553         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
554                 if (ctdb_same_ip(&vnn->public_address, addr)) {
555                         return vnn;
556                 }
557         }
558
559         return NULL;
560 }
561
562 /*
563   take over an ip address
564  */
565 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
566                                  struct ctdb_req_control *c,
567                                  TDB_DATA indata,
568                                  bool *async_reply)
569 {
570         int ret;
571         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
572         struct ctdb_vnn *vnn;
573         bool have_ip = false;
574         bool do_updateip = false;
575         bool do_takeip = false;
576         struct ctdb_iface *best_iface = NULL;
577
578         if (pip->pnn != ctdb->pnn) {
579                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
580                                  "with pnn %d, but we're node %d\n",
581                                  ctdb_addr_to_str(&pip->addr),
582                                  pip->pnn, ctdb->pnn));
583                 return -1;
584         }
585
586         /* update out vnn list */
587         vnn = find_public_ip_vnn(ctdb, &pip->addr);
588         if (vnn == NULL) {
589                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
590                         ctdb_addr_to_str(&pip->addr)));
591                 return 0;
592         }
593
594         have_ip = ctdb_sys_have_ip(&pip->addr);
595         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
596         if (best_iface == NULL) {
597                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
598                                  "a usable interface (old %s, have_ip %d)\n",
599                                  ctdb_addr_to_str(&vnn->public_address),
600                                  vnn->public_netmask_bits,
601                                  ctdb_vnn_iface_string(vnn),
602                                  have_ip));
603                 return -1;
604         }
605
606         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
607                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
608                 have_ip = false;
609         }
610
611         if (vnn->iface == NULL && have_ip) {
612                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
613                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
614                                  ctdb_addr_to_str(&vnn->public_address)));
615                 return 0;
616         }
617
618         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
619                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
620                                   "and we have it on iface[%s], but it was assigned to node %d"
621                                   "and we are node %d, banning ourself\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
624                 ctdb_ban_self(ctdb);
625                 return -1;
626         }
627
628         if (vnn->pnn == -1 && have_ip) {
629                 vnn->pnn = ctdb->pnn;
630                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
631                                   "and we already have it on iface[%s], update local daemon\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                   ctdb_vnn_iface_string(vnn)));
634                 return 0;
635         }
636
637         if (vnn->iface) {
638                 if (vnn->iface->link_up) {
639                         /* only move when the rebalance gains something */
640                         if (vnn->iface->references > (best_iface->references + 1)) {
641                                 do_updateip = true;
642                         }
643                 } else if (vnn->iface != best_iface) {
644                         do_updateip = true;
645                 }
646         }
647
648         if (!have_ip) {
649                 if (do_updateip) {
650                         ctdb_vnn_unassign_iface(ctdb, vnn);
651                         do_updateip = false;
652                 }
653                 do_takeip = true;
654         }
655
656         if (do_takeip) {
657                 ret = ctdb_do_takeip(ctdb, c, vnn);
658                 if (ret != 0) {
659                         return -1;
660                 }
661         } else if (do_updateip) {
662                 ret = ctdb_do_updateip(ctdb, c, vnn);
663                 if (ret != 0) {
664                         return -1;
665                 }
666         } else {
667                 /*
668                  * The interface is up and the kernel known the ip
669                  * => do nothing
670                  */
671                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
672                         ctdb_addr_to_str(&pip->addr),
673                         vnn->public_netmask_bits,
674                         ctdb_vnn_iface_string(vnn)));
675                 return 0;
676         }
677
678         /* tell ctdb_control.c that we will be replying asynchronously */
679         *async_reply = true;
680
681         return 0;
682 }
683
684 /*
685   takeover an ip address old v4 style
686  */
687 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
688                                 struct ctdb_req_control *c,
689                                 TDB_DATA indata, 
690                                 bool *async_reply)
691 {
692         TDB_DATA data;
693         
694         data.dsize = sizeof(struct ctdb_public_ip);
695         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
696         CTDB_NO_MEMORY(ctdb, data.dptr);
697         
698         memcpy(data.dptr, indata.dptr, indata.dsize);
699         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
700 }
701
702 /*
703   kill any clients that are registered with a IP that is being released
704  */
705 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
706 {
707         struct ctdb_client_ip *ip;
708
709         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
710                 ctdb_addr_to_str(addr)));
711
712         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
713                 ctdb_sock_addr tmp_addr;
714
715                 tmp_addr = ip->addr;
716                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
717                         ip->client_id,
718                         ctdb_addr_to_str(&ip->addr)));
719
720                 if (ctdb_same_ip(&tmp_addr, addr)) {
721                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
722                                                                      ip->client_id, 
723                                                                      struct ctdb_client);
724                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
725                                 ip->client_id,
726                                 ctdb_addr_to_str(&ip->addr),
727                                 client->pid));
728
729                         if (client->pid != 0) {
730                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
731                                         (unsigned)client->pid,
732                                         ctdb_addr_to_str(addr),
733                                         ip->client_id));
734                                 kill(client->pid, SIGKILL);
735                         }
736                 }
737         }
738 }
739
740 /*
741   called when releaseip event finishes
742  */
743 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
744                                 void *private_data)
745 {
746         struct takeover_callback_state *state = 
747                 talloc_get_type(private_data, struct takeover_callback_state);
748         TDB_DATA data;
749
750         if (status == -ETIME) {
751                 ctdb_ban_self(ctdb);
752         }
753
754         /* send a message to all clients of this node telling them
755            that the cluster has been reconfigured and they should
756            release any sockets on this IP */
757         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
758         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
759         data.dsize = strlen((char *)data.dptr)+1;
760
761         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
762
763         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
764
765         /* kill clients that have registered with this IP */
766         release_kill_clients(ctdb, state->addr);
767
768         ctdb_vnn_unassign_iface(ctdb, state->vnn);
769
770         /* the control succeeded */
771         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
772         talloc_free(state);
773 }
774
775 /*
776   release an ip address
777  */
778 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
779                                 struct ctdb_req_control *c,
780                                 TDB_DATA indata, 
781                                 bool *async_reply)
782 {
783         int ret;
784         struct takeover_callback_state *state;
785         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
786         struct ctdb_vnn *vnn;
787
788         /* update our vnn list */
789         vnn = find_public_ip_vnn(ctdb, &pip->addr);
790         if (vnn == NULL) {
791                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
792                         ctdb_addr_to_str(&pip->addr)));
793                 return 0;
794         }
795         vnn->pnn = pip->pnn;
796
797         /* stop any previous arps */
798         talloc_free(vnn->takeover_ctx);
799         vnn->takeover_ctx = NULL;
800
801         if (!ctdb_sys_have_ip(&pip->addr)) {
802                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits, 
805                         ctdb_vnn_iface_string(vnn)));
806                 ctdb_vnn_unassign_iface(ctdb, vnn);
807                 return 0;
808         }
809
810         if (vnn->iface == NULL) {
811                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
812                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
813                                  ctdb_addr_to_str(&vnn->public_address)));
814                 return 0;
815         }
816
817         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
818                 ctdb_addr_to_str(&pip->addr),
819                 vnn->public_netmask_bits, 
820                 ctdb_vnn_iface_string(vnn),
821                 pip->pnn));
822
823         state = talloc(ctdb, struct takeover_callback_state);
824         CTDB_NO_MEMORY(ctdb, state);
825
826         state->c = talloc_steal(state, c);
827         state->addr = talloc(state, ctdb_sock_addr);       
828         CTDB_NO_MEMORY(ctdb, state->addr);
829         *state->addr = pip->addr;
830         state->vnn   = vnn;
831
832         ret = ctdb_event_script_callback(ctdb, 
833                                          state, release_ip_callback, state,
834                                          false,
835                                          CTDB_EVENT_RELEASE_IP,
836                                          "%s %s %u",
837                                          ctdb_vnn_iface_string(vnn),
838                                          ctdb_addr_to_str(&pip->addr),
839                                          vnn->public_netmask_bits);
840         if (ret != 0) {
841                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
842                         ctdb_addr_to_str(&pip->addr),
843                         ctdb_vnn_iface_string(vnn)));
844                 talloc_free(state);
845                 return -1;
846         }
847
848         /* tell the control that we will be reply asynchronously */
849         *async_reply = true;
850         return 0;
851 }
852
853 /*
854   release an ip address old v4 style
855  */
856 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
857                                 struct ctdb_req_control *c,
858                                 TDB_DATA indata, 
859                                 bool *async_reply)
860 {
861         TDB_DATA data;
862         
863         data.dsize = sizeof(struct ctdb_public_ip);
864         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
865         CTDB_NO_MEMORY(ctdb, data.dptr);
866         
867         memcpy(data.dptr, indata.dptr, indata.dsize);
868         return ctdb_control_release_ip(ctdb, c, data, async_reply);
869 }
870
871
872 static int ctdb_add_public_address(struct ctdb_context *ctdb,
873                                    ctdb_sock_addr *addr,
874                                    unsigned mask, const char *ifaces)
875 {
876         struct ctdb_vnn      *vnn;
877         uint32_t num = 0;
878         char *tmp;
879         const char *iface;
880         int i;
881         int ret;
882
883         tmp = strdup(ifaces);
884         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
885                 if (!ctdb_sys_check_iface_exists(iface)) {
886                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
887                         free(tmp);
888                         return -1;
889                 }
890         }
891         free(tmp);
892
893         /* Verify that we dont have an entry for this ip yet */
894         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
895                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
896                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
897                                 ctdb_addr_to_str(addr)));
898                         return -1;
899                 }               
900         }
901
902         /* create a new vnn structure for this ip address */
903         vnn = talloc_zero(ctdb, struct ctdb_vnn);
904         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
905         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
906         tmp = talloc_strdup(vnn, ifaces);
907         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
908         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
909                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
910                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
911                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
912                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
913                 num++;
914         }
915         talloc_free(tmp);
916         vnn->ifaces[num] = NULL;
917         vnn->public_address      = *addr;
918         vnn->public_netmask_bits = mask;
919         vnn->pnn                 = -1;
920         if (ctdb_sys_have_ip(addr)) {
921                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
922                 vnn->pnn = ctdb->pnn;
923         }
924
925         for (i=0; vnn->ifaces[i]; i++) {
926                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
927                 if (ret != 0) {
928                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
929                                            "for public_address[%s]\n",
930                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
931                         talloc_free(vnn);
932                         return -1;
933                 }
934                 if (i == 0) {
935                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
936                 }
937         }
938
939         DLIST_ADD(ctdb->vnn, vnn);
940
941         return 0;
942 }
943
944 /*
945   setup the event script directory
946 */
947 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
948 {
949         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
950         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
951         return 0;
952 }
953
954 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
955                                   struct timeval t, void *private_data)
956 {
957         struct ctdb_context *ctdb = talloc_get_type(private_data, 
958                                                         struct ctdb_context);
959         struct ctdb_vnn *vnn;
960
961         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
962                 int i;
963
964                 for (i=0; vnn->ifaces[i] != NULL; i++) {
965                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
966                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
967                                         vnn->ifaces[i],
968                                         ctdb_addr_to_str(&vnn->public_address)));
969                         }
970                 }
971         }
972
973         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
974                 timeval_current_ofs(30, 0), 
975                 ctdb_check_interfaces_event, ctdb);
976 }
977
978
979 static int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
980 {
981         if (ctdb->check_public_ifaces_ctx != NULL) {
982                 talloc_free(ctdb->check_public_ifaces_ctx);
983                 ctdb->check_public_ifaces_ctx = NULL;
984         }
985
986         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
987         if (ctdb->check_public_ifaces_ctx == NULL) {
988                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
989         }
990
991         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
992                 timeval_current_ofs(30, 0), 
993                 ctdb_check_interfaces_event, ctdb);
994
995         return 0;
996 }
997
998
999 /*
1000   setup the public address lists from a file
1001 */
1002 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
1003 {
1004         char **lines;
1005         int nlines;
1006         int i;
1007
1008         lines = file_lines_load(alist, &nlines, ctdb);
1009         if (lines == NULL) {
1010                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
1011                 return -1;
1012         }
1013         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1014                 nlines--;
1015         }
1016
1017         for (i=0;i<nlines;i++) {
1018                 unsigned mask;
1019                 ctdb_sock_addr addr;
1020                 const char *addrstr;
1021                 const char *ifaces;
1022                 char *tok, *line;
1023
1024                 line = lines[i];
1025                 while ((*line == ' ') || (*line == '\t')) {
1026                         line++;
1027                 }
1028                 if (*line == '#') {
1029                         continue;
1030                 }
1031                 if (strcmp(line, "") == 0) {
1032                         continue;
1033                 }
1034                 tok = strtok(line, " \t");
1035                 addrstr = tok;
1036                 tok = strtok(NULL, " \t");
1037                 if (tok == NULL) {
1038                         if (NULL == ctdb->default_public_interface) {
1039                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1040                                          i+1));
1041                                 talloc_free(lines);
1042                                 return -1;
1043                         }
1044                         ifaces = ctdb->default_public_interface;
1045                 } else {
1046                         ifaces = tok;
1047                 }
1048
1049                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1050                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1051                         talloc_free(lines);
1052                         return -1;
1053                 }
1054                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
1055                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1056                         talloc_free(lines);
1057                         return -1;
1058                 }
1059         }
1060
1061
1062         ctdb_start_monitoring_interfaces(ctdb);
1063
1064         talloc_free(lines);
1065         return 0;
1066 }
1067
1068 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1069                               const char *iface,
1070                               const char *ip)
1071 {
1072         struct ctdb_vnn *svnn;
1073         struct ctdb_iface *cur = NULL;
1074         bool ok;
1075         int ret;
1076
1077         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1078         CTDB_NO_MEMORY(ctdb, svnn);
1079
1080         svnn->ifaces = talloc_array(svnn, const char *, 2);
1081         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1082         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1083         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1084         svnn->ifaces[1] = NULL;
1085
1086         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1087         if (!ok) {
1088                 talloc_free(svnn);
1089                 return -1;
1090         }
1091
1092         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1093         if (ret != 0) {
1094                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1095                                    "for single_ip[%s]\n",
1096                                    svnn->ifaces[0],
1097                                    ctdb_addr_to_str(&svnn->public_address)));
1098                 talloc_free(svnn);
1099                 return -1;
1100         }
1101
1102         /* assume the single public ip interface is initially "good" */
1103         cur = ctdb_find_iface(ctdb, iface);
1104         if (cur == NULL) {
1105                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1106                 return -1;
1107         }
1108         cur->link_up = true;
1109
1110         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1111         if (ret != 0) {
1112                 talloc_free(svnn);
1113                 return -1;
1114         }
1115
1116         ctdb->single_ip_vnn = svnn;
1117         return 0;
1118 }
1119
1120 /* Given a physical node, return the number of
1121    public addresses that is currently assigned to this node.
1122 */
1123 static int node_ip_coverage(struct ctdb_context *ctdb, 
1124         int32_t pnn,
1125         struct ctdb_public_ip_list *ips)
1126 {
1127         int num=0;
1128
1129         for (;ips;ips=ips->next) {
1130                 if (ips->pnn == pnn) {
1131                         num++;
1132                 }
1133         }
1134         return num;
1135 }
1136
1137
1138 /* Check if this is a public ip known to the node, i.e. can that
1139    node takeover this ip ?
1140 */
1141 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1142                 struct ctdb_public_ip_list *ip)
1143 {
1144         struct ctdb_all_public_ips *public_ips;
1145         int i;
1146
1147         public_ips = ctdb->nodes[pnn]->available_public_ips;
1148
1149         if (public_ips == NULL) {
1150                 return -1;
1151         }
1152
1153         for (i=0;i<public_ips->num;i++) {
1154                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1155                         /* yes, this node can serve this public ip */
1156                         return 0;
1157                 }
1158         }
1159
1160         return -1;
1161 }
1162
1163
1164 /* search the node lists list for a node to takeover this ip.
1165    pick the node that currently are serving the least number of ips
1166    so that the ips get spread out evenly.
1167 */
1168 static int find_takeover_node(struct ctdb_context *ctdb, 
1169                 struct ctdb_node_map *nodemap, uint32_t mask, 
1170                 struct ctdb_public_ip_list *ip,
1171                 struct ctdb_public_ip_list *all_ips)
1172 {
1173         int pnn, min=0, num;
1174         int i;
1175
1176         pnn    = -1;
1177         for (i=0;i<nodemap->num;i++) {
1178                 if (nodemap->nodes[i].flags & mask) {
1179                         /* This node is not healty and can not be used to serve
1180                            a public address 
1181                         */
1182                         continue;
1183                 }
1184
1185                 /* verify that this node can serve this ip */
1186                 if (can_node_serve_ip(ctdb, i, ip)) {
1187                         /* no it couldnt   so skip to the next node */
1188                         continue;
1189                 }
1190
1191                 num = node_ip_coverage(ctdb, i, all_ips);
1192                 /* was this the first node we checked ? */
1193                 if (pnn == -1) {
1194                         pnn = i;
1195                         min  = num;
1196                 } else {
1197                         if (num < min) {
1198                                 pnn = i;
1199                                 min  = num;
1200                         }
1201                 }
1202         }       
1203         if (pnn == -1) {
1204                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1205                         ctdb_addr_to_str(&ip->addr)));
1206
1207                 return -1;
1208         }
1209
1210         ip->pnn = pnn;
1211         return 0;
1212 }
1213
1214 #define IP_KEYLEN       4
1215 static uint32_t *ip_key(ctdb_sock_addr *ip)
1216 {
1217         static uint32_t key[IP_KEYLEN];
1218
1219         bzero(key, sizeof(key));
1220
1221         switch (ip->sa.sa_family) {
1222         case AF_INET:
1223                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1224                 break;
1225         case AF_INET6: {
1226                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1227                 key[0]  = htonl(s6_a32[0]);
1228                 key[1]  = htonl(s6_a32[1]);
1229                 key[2]  = htonl(s6_a32[2]);
1230                 key[3]  = htonl(s6_a32[3]);
1231                 break;
1232         }
1233         default:
1234                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1235                 return key;
1236         }
1237
1238         return key;
1239 }
1240
1241 static void *add_ip_callback(void *parm, void *data)
1242 {
1243         struct ctdb_public_ip_list *this_ip = parm; 
1244         struct ctdb_public_ip_list *prev_ip = data; 
1245
1246         if (prev_ip == NULL) {
1247                 return parm;
1248         }
1249         if (this_ip->pnn == -1) {
1250                 this_ip->pnn = prev_ip->pnn;
1251         }
1252
1253         return parm;
1254 }
1255
1256 static int getips_count_callback(void *param, void *data)
1257 {
1258         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1259         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1260
1261         new_ip->next = *ip_list;
1262         *ip_list     = new_ip;
1263         return 0;
1264 }
1265
1266 static struct ctdb_public_ip_list *
1267 create_merged_ip_list(struct ctdb_context *ctdb)
1268 {
1269         int i, j;
1270         struct ctdb_public_ip_list *ip_list;
1271         struct ctdb_all_public_ips *public_ips;
1272
1273         if (ctdb->ip_tree != NULL) {
1274                 talloc_free(ctdb->ip_tree);
1275                 ctdb->ip_tree = NULL;
1276         }
1277         ctdb->ip_tree = trbt_create(ctdb, 0);
1278
1279         for (i=0;i<ctdb->num_nodes;i++) {
1280                 public_ips = ctdb->nodes[i]->known_public_ips;
1281
1282                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1283                         continue;
1284                 }
1285
1286                 /* there were no public ips for this node */
1287                 if (public_ips == NULL) {
1288                         continue;
1289                 }               
1290
1291                 for (j=0;j<public_ips->num;j++) {
1292                         struct ctdb_public_ip_list *tmp_ip; 
1293
1294                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1295                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1296                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1297                         tmp_ip->addr = public_ips->ips[j].addr;
1298                         tmp_ip->next = NULL;
1299
1300                         trbt_insertarray32_callback(ctdb->ip_tree,
1301                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1302                                 add_ip_callback,
1303                                 tmp_ip);
1304                 }
1305         }
1306
1307         ip_list = NULL;
1308         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1309
1310         return ip_list;
1311 }
1312
1313 /* 
1314  * This is the length of the longtest common prefix between the IPs.
1315  * It is calculated by XOR-ing the 2 IPs together and counting the
1316  * number of leading zeroes.  The implementation means that all
1317  * addresses end up being 128 bits long.
1318  *
1319  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1320  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1321  * lots of nodes and IP addresses?
1322  */
1323 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1324 {
1325         uint32_t ip1_k[IP_KEYLEN];
1326         uint32_t *t;
1327         int i;
1328         uint32_t x;
1329
1330         uint32_t distance = 0;
1331
1332         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1333         t = ip_key(ip2);
1334         for (i=0; i<IP_KEYLEN; i++) {
1335                 x = ip1_k[i] ^ t[i];
1336                 if (x == 0) {
1337                         distance += 32;
1338                 } else {
1339                         /* Count number of leading zeroes. 
1340                          * FIXME? This could be optimised...
1341                          */
1342                         while ((x & (1 << 31)) == 0) {
1343                                 x <<= 1;
1344                                 distance += 1;
1345                         }
1346                 }
1347         }
1348
1349         return distance;
1350 }
1351
1352 /* Calculate the IP distance for the given IP relative to IPs on the
1353    given node.  The ips argument is generally the all_ips variable
1354    used in the main part of the algorithm.
1355  */
1356 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1357                                   struct ctdb_public_ip_list *ips,
1358                                   int pnn)
1359 {
1360         struct ctdb_public_ip_list *t;
1361         uint32_t d;
1362
1363         uint32_t sum = 0;
1364
1365         for (t=ips; t != NULL; t=t->next) {
1366                 if (t->pnn != pnn) {
1367                         continue;
1368                 }
1369
1370                 /* Optimisation: We never calculate the distance
1371                  * between an address and itself.  This allows us to
1372                  * calculate the effect of removing an address from a
1373                  * node by simply calculating the distance between
1374                  * that address and all of the exitsing addresses.
1375                  * Moreover, we assume that we're only ever dealing
1376                  * with addresses from all_ips so we can identify an
1377                  * address via a pointer rather than doing a more
1378                  * expensive address comparison. */
1379                 if (&(t->addr) == ip) {
1380                         continue;
1381                 }
1382
1383                 d = ip_distance(ip, &(t->addr));
1384                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1385         }
1386
1387         return sum;
1388 }
1389
1390 /* Return the LCP2 imbalance metric for addresses currently assigned
1391    to the given node.
1392  */
1393 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1394 {
1395         struct ctdb_public_ip_list *t;
1396
1397         uint32_t imbalance = 0;
1398
1399         for (t=all_ips; t!=NULL; t=t->next) {
1400                 if (t->pnn != pnn) {
1401                         continue;
1402                 }
1403                 /* Pass the rest of the IPs rather than the whole
1404                    all_ips input list.
1405                 */
1406                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1407         }
1408
1409         return imbalance;
1410 }
1411
1412 /* Allocate any unassigned IPs just by looping through the IPs and
1413  * finding the best node for each.
1414  */
1415 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1416                                       struct ctdb_node_map *nodemap,
1417                                       uint32_t mask,
1418                                       struct ctdb_public_ip_list *all_ips)
1419 {
1420         struct ctdb_public_ip_list *tmp_ip;
1421
1422         /* loop over all ip's and find a physical node to cover for 
1423            each unassigned ip.
1424         */
1425         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1426                 if (tmp_ip->pnn == -1) {
1427                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1428                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1429                                         ctdb_addr_to_str(&tmp_ip->addr)));
1430                         }
1431                 }
1432         }
1433 }
1434
1435 /* Basic non-deterministic rebalancing algorithm.
1436  */
1437 static bool basic_failback(struct ctdb_context *ctdb,
1438                            struct ctdb_node_map *nodemap,
1439                            uint32_t mask,
1440                            struct ctdb_public_ip_list *all_ips,
1441                            int num_ips,
1442                            int *retries)
1443 {
1444         int i;
1445         int maxnode, maxnum=0, minnode, minnum=0, num;
1446         struct ctdb_public_ip_list *tmp_ip;
1447
1448         /* for each ip address, loop over all nodes that can serve
1449            this ip and make sure that the difference between the node
1450            serving the most and the node serving the least ip's are
1451            not greater than 1.
1452         */
1453         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1454                 if (tmp_ip->pnn == -1) {
1455                         continue;
1456                 }
1457
1458                 /* Get the highest and lowest number of ips's served by any 
1459                    valid node which can serve this ip.
1460                 */
1461                 maxnode = -1;
1462                 minnode = -1;
1463                 for (i=0;i<nodemap->num;i++) {
1464                         if (nodemap->nodes[i].flags & mask) {
1465                                 continue;
1466                         }
1467
1468                         /* only check nodes that can actually serve this ip */
1469                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1470                                 /* no it couldnt   so skip to the next node */
1471                                 continue;
1472                         }
1473
1474                         num = node_ip_coverage(ctdb, i, all_ips);
1475                         if (maxnode == -1) {
1476                                 maxnode = i;
1477                                 maxnum  = num;
1478                         } else {
1479                                 if (num > maxnum) {
1480                                         maxnode = i;
1481                                         maxnum  = num;
1482                                 }
1483                         }
1484                         if (minnode == -1) {
1485                                 minnode = i;
1486                                 minnum  = num;
1487                         } else {
1488                                 if (num < minnum) {
1489                                         minnode = i;
1490                                         minnum  = num;
1491                                 }
1492                         }
1493                 }
1494                 if (maxnode == -1) {
1495                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1496                                 ctdb_addr_to_str(&tmp_ip->addr)));
1497
1498                         continue;
1499                 }
1500
1501                 /* If we want deterministic IPs then dont try to reallocate 
1502                    them to spread out the load.
1503                 */
1504                 if (1 == ctdb->tunable.deterministic_public_ips) {
1505                         continue;
1506                 }
1507
1508                 /* if the spread between the smallest and largest coverage by
1509                    a node is >=2 we steal one of the ips from the node with
1510                    most coverage to even things out a bit.
1511                    try to do this a limited number of times since we dont
1512                    want to spend too much time balancing the ip coverage.
1513                 */
1514                 if ( (maxnum > minnum+1)
1515                      && (*retries < (num_ips + 5)) ){
1516                         struct ctdb_public_ip_list *tmp;
1517
1518                         /* mark one of maxnode's vnn's as unassigned and try
1519                            again
1520                         */
1521                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1522                                 if (tmp->pnn == maxnode) {
1523                                         tmp->pnn = -1;
1524                                         (*retries)++;
1525                                         return true;
1526                                 }
1527                         }
1528                 }
1529         }
1530
1531         return false;
1532 }
1533
1534 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1535  * that we can unit test it.
1536  */
1537 static void lcp2_init(struct ctdb_context * tmp_ctx,
1538                struct ctdb_node_map * nodemap,
1539                uint32_t mask,
1540                struct ctdb_public_ip_list *all_ips,
1541                uint32_t **lcp2_imbalances,
1542                bool **newly_healthy)
1543 {
1544         int i;
1545         struct ctdb_public_ip_list *tmp_ip;
1546
1547         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1548         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1549         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1550         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1551
1552         for (i=0;i<nodemap->num;i++) {
1553                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1554                 /* First step: is the node "healthy"? */
1555                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1556         }
1557
1558         /* 2nd step: if a ndoe has IPs assigned then it must have been
1559          * healthy before, so we remove it from consideration... */
1560         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1561                 if (tmp_ip->pnn != -1) {
1562                         (*newly_healthy)[tmp_ip->pnn] = false;
1563                 }
1564         }
1565 }
1566
1567 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1568  * the IP/node combination that will cost the least.
1569  */
1570 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1571                               struct ctdb_node_map *nodemap,
1572                               uint32_t mask,
1573                               struct ctdb_public_ip_list *all_ips,
1574                               uint32_t *lcp2_imbalances)
1575 {
1576         struct ctdb_public_ip_list *tmp_ip;
1577         int dstnode;
1578
1579         int minnode;
1580         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1581         struct ctdb_public_ip_list *minip;
1582
1583         bool should_loop = true;
1584         bool have_unassigned = true;
1585
1586         while (have_unassigned && should_loop) {
1587                 should_loop = false;
1588
1589                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1590                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1591
1592                 minnode = -1;
1593                 mindsum = 0;
1594                 minip = NULL;
1595
1596                 /* loop over each unassigned ip. */
1597                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1598                         if (tmp_ip->pnn != -1) {
1599                                 continue;
1600                         }
1601
1602                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1603                                 /* only check nodes that can actually serve this ip */
1604                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1605                                         /* no it couldnt   so skip to the next node */
1606                                         continue;
1607                                 }
1608                                 if (nodemap->nodes[dstnode].flags & mask) {
1609                                         continue;
1610                                 }
1611
1612                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1613                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1614                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1615                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1616                                                    dstnode,
1617                                                    dstimbl - lcp2_imbalances[dstnode]));
1618
1619
1620                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1621                                         minnode = dstnode;
1622                                         minimbl = dstimbl;
1623                                         mindsum = dstdsum;
1624                                         minip = tmp_ip;
1625                                         should_loop = true;
1626                                 }
1627                         }
1628                 }
1629
1630                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1631
1632                 /* If we found one then assign it to the given node. */
1633                 if (minnode != -1) {
1634                         minip->pnn = minnode;
1635                         lcp2_imbalances[minnode] = minimbl;
1636                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1637                                           ctdb_addr_to_str(&(minip->addr)),
1638                                           minnode,
1639                                           mindsum));
1640                 }
1641
1642                 /* There might be a better way but at least this is clear. */
1643                 have_unassigned = false;
1644                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1645                         if (tmp_ip->pnn == -1) {
1646                                 have_unassigned = true;
1647                         }
1648                 }
1649         }
1650
1651         /* We know if we have an unassigned addresses so we might as
1652          * well optimise.
1653          */
1654         if (have_unassigned) {
1655                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1656                         if (tmp_ip->pnn == -1) {
1657                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1658                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1659                         }
1660                 }
1661         }
1662 }
1663
1664 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1665  * to move IPs from, determines the best IP/destination node
1666  * combination to move from the source node.
1667  */
1668 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1669                                     struct ctdb_node_map *nodemap,
1670                                     struct ctdb_public_ip_list *all_ips,
1671                                     int srcnode,
1672                                     uint32_t candimbl,
1673                                     uint32_t *lcp2_imbalances,
1674                                     bool *newly_healthy)
1675 {
1676         int dstnode, mindstnode;
1677         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1678         uint32_t minsrcimbl, mindstimbl;
1679         struct ctdb_public_ip_list *minip;
1680         struct ctdb_public_ip_list *tmp_ip;
1681
1682         /* Find an IP and destination node that best reduces imbalance. */
1683         minip = NULL;
1684         minsrcimbl = 0;
1685         mindstnode = -1;
1686         mindstimbl = 0;
1687
1688         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1689         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1690
1691         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1692                 /* Only consider addresses on srcnode. */
1693                 if (tmp_ip->pnn != srcnode) {
1694                         continue;
1695                 }
1696
1697                 /* What is this IP address costing the source node? */
1698                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1699                 srcimbl = candimbl - srcdsum;
1700
1701                 /* Consider this IP address would cost each potential
1702                  * destination node.  Destination nodes are limited to
1703                  * those that are newly healthy, since we don't want
1704                  * to do gratuitous failover of IPs just to make minor
1705                  * balance improvements.
1706                  */
1707                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1708                         if (! newly_healthy[dstnode]) {
1709                                 continue;
1710                         }
1711                         /* only check nodes that can actually serve this ip */
1712                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1713                                 /* no it couldnt   so skip to the next node */
1714                                 continue;
1715                         }
1716
1717                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1718                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1719                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1720                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1721                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1722                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1723
1724                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1725                             ((mindstnode == -1) ||                              \
1726                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1727
1728                                 minip = tmp_ip;
1729                                 minsrcimbl = srcimbl;
1730                                 mindstnode = dstnode;
1731                                 mindstimbl = dstimbl;
1732                         }
1733                 }
1734         }
1735         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1736
1737         if (mindstnode != -1) {
1738                 /* We found a move that makes things better... */
1739                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1740                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1741                                   ctdb_addr_to_str(&(minip->addr)),
1742                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1743
1744
1745                 lcp2_imbalances[srcnode] = srcimbl;
1746                 lcp2_imbalances[mindstnode] = mindstimbl;
1747                 minip->pnn = mindstnode;
1748
1749                 return true;
1750         }
1751
1752         return false;
1753         
1754 }
1755
/* Pairs a node number with an LCP2 imbalance value so that nodes can
 * be sorted by imbalance without losing track of which node each
 * value belongs to.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* Comparison callback with the signature qsort() expects.  Orders
 * struct lcp2_imbalance_pnn entries by decreasing imbalance: returns
 * -1 if a has the larger imbalance, 1 if b has, and 0 when they are
 * equal.  The pnn field is not consulted.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
        return (rhs->imbalance > lhs->imbalance) -
               (rhs->imbalance < lhs->imbalance);
}
1774
1775 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1776  * node with the highest LCP2 imbalance, and then determines the best
1777  * IP/destination node combination to move from the source node.
1778  */
1779 static bool lcp2_failback(struct ctdb_context *ctdb,
1780                           struct ctdb_node_map *nodemap,
1781                           uint32_t mask,
1782                           struct ctdb_public_ip_list *all_ips,
1783                           uint32_t *lcp2_imbalances,
1784                           bool *newly_healthy)
1785 {
1786         int i, num_newly_healthy;
1787         struct lcp2_imbalance_pnn * lips;
1788         bool ret;
1789
1790         /* It is only worth continuing if we have suitable target
1791          * nodes to transfer IPs to.  This check is much cheaper than
1792          * continuing on...
1793          */
1794         num_newly_healthy = 0;
1795         for (i = 0; i < nodemap->num; i++) {
1796                 if (newly_healthy[i]) {
1797                         num_newly_healthy++;
1798                 }
1799         }
1800         if (num_newly_healthy == 0) {
1801                 return false;
1802         }
1803
1804         /* Put the imbalances and nodes into an array, sort them and
1805          * iterate through candidates.  Usually the 1st one will be
1806          * used, so this doesn't cost much...
1807          */
1808         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
1809         for (i = 0; i < nodemap->num; i++) {
1810                 lips[i].imbalance = lcp2_imbalances[i];
1811                 lips[i].pnn = i;
1812         }
1813         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
1814               lcp2_cmp_imbalance_pnn);
1815
1816         ret = false;
1817         for (i = 0; i < nodemap->num; i++) {
1818                 /* This means that all nodes had 0 or 1 addresses, so
1819                  * can't be imbalanced.
1820                  */
1821                 if (lips[i].imbalance == 0) {
1822                         break;
1823                 }
1824
1825                 if (lcp2_failback_candidate(ctdb,
1826                                             nodemap,
1827                                             all_ips,
1828                                             lips[i].pnn,
1829                                             lips[i].imbalance,
1830                                             lcp2_imbalances,
1831                                             newly_healthy)) {
1832                         ret = true;
1833                         break;
1834                 }
1835         }
1836
1837         talloc_free(lips);
1838         return ret;
1839 }
1840
1841 /* The calculation part of the IP allocation algorithm. */
1842 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1843                                    struct ctdb_node_map *nodemap,
1844                                    struct ctdb_public_ip_list **all_ips_p)
1845 {
1846         int i, num_healthy, retries, num_ips;
1847         uint32_t mask;
1848         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1849         uint32_t *lcp2_imbalances;
1850         bool *newly_healthy;
1851
1852         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1853
1854         /* Count how many completely healthy nodes we have */
1855         num_healthy = 0;
1856         for (i=0;i<nodemap->num;i++) {
1857                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1858                         num_healthy++;
1859                 }
1860         }
1861
1862         if (num_healthy > 0) {
1863                 /* We have healthy nodes, so only consider them for 
1864                    serving public addresses
1865                 */
1866                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1867         } else {
1868                 /* We didnt have any completely healthy nodes so
1869                    use "disabled" nodes as a fallback
1870                 */
1871                 mask = NODE_FLAGS_INACTIVE;
1872         }
1873
1874         /* since nodes only know about those public addresses that
1875            can be served by that particular node, no single node has
1876            a full list of all public addresses that exist in the cluster.
1877            Walk over all node structures and create a merged list of
1878            all public addresses that exist in the cluster.
1879
1880            keep the tree of ips around as ctdb->ip_tree
1881         */
1882         all_ips = create_merged_ip_list(ctdb);
1883         *all_ips_p = all_ips; /* minimal code changes */
1884
1885         /* Count how many ips we have */
1886         num_ips = 0;
1887         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1888                 num_ips++;
1889         }
1890
1891         /* If we want deterministic ip allocations, i.e. that the ip addresses
1892            will always be allocated the same way for a specific set of
1893            available/unavailable nodes.
1894         */
1895         if (1 == ctdb->tunable.deterministic_public_ips) {              
1896                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1897                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1898                         tmp_ip->pnn = i%nodemap->num;
1899                 }
1900         }
1901
1902
1903         /* mark all public addresses with a masked node as being served by
1904            node -1
1905         */
1906         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1907                 if (tmp_ip->pnn == -1) {
1908                         continue;
1909                 }
1910                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1911                         tmp_ip->pnn = -1;
1912                 }
1913         }
1914
1915         /* verify that the assigned nodes can serve that public ip
1916            and set it to -1 if not
1917         */
1918         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1919                 if (tmp_ip->pnn == -1) {
1920                         continue;
1921                 }
1922                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1923                         /* this node can not serve this ip. */
1924                         tmp_ip->pnn = -1;
1925                 }
1926         }
1927
1928         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1929                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
1930         }
1931
1932         /* now we must redistribute all public addresses with takeover node
1933            -1 among the nodes available
1934         */
1935         retries = 0;
1936 try_again:
1937         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1938                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
1939         } else {
1940                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
1941         }
1942
1943         /* If we dont want ips to fail back after a node becomes healthy
1944            again, we wont even try to reallocat the ip addresses so that
1945            they are evenly spread out.
1946            This can NOT be used at the same time as DeterministicIPs !
1947         */
1948         if (1 == ctdb->tunable.no_ip_failback) {
1949                 if (1 == ctdb->tunable.deterministic_public_ips) {
1950                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1951                 }
1952                 goto finished;
1953         }
1954
1955
1956         /* now, try to make sure the ip adresses are evenly distributed
1957            across the node.
1958         */
1959         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1960                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
1961                         goto try_again;
1962                 }
1963         } else {
1964                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
1965                         goto try_again;
1966                 }
1967         }
1968
1969         /* finished distributing the public addresses, now just send the 
1970            info out to the nodes
1971         */
1972 finished:
1973
1974         /* at this point ->pnn is the node which will own each IP
1975            or -1 if there is no node that can cover this ip
1976         */
1977
1978         return;
1979 }
1980
1981 /*
1982   make any IP alias changes for public addresses that are necessary 
1983  */
1984 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1985 {
1986         int i;
1987         struct ctdb_public_ip ip;
1988         struct ctdb_public_ipv4 ipv4;
1989         uint32_t *nodes;
1990         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1991         TDB_DATA data;
1992         struct timeval timeout;
1993         struct client_async_data *async_data;
1994         struct ctdb_client_control_state *state;
1995         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1996
1997         /*
1998          * ip failover is completely disabled, just send out the 
1999          * ipreallocated event.
2000          */
2001         if (ctdb->tunable.disable_ip_failover != 0) {
2002                 goto ipreallocated;
2003         }
2004
2005         ZERO_STRUCT(ip);
2006
2007         /* Do the IP reassignment calculations */
2008         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2009
2010         /* now tell all nodes to delete any alias that they should not
2011            have.  This will be a NOOP on nodes that don't currently
2012            hold the given alias */
2013         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2014         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2015
2016         for (i=0;i<nodemap->num;i++) {
2017                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2018                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2019                         continue;
2020                 }
2021
2022                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2023                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2024                                 /* This node should be serving this
2025                                    vnn so dont tell it to release the ip
2026                                 */
2027                                 continue;
2028                         }
2029                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2030                                 ipv4.pnn = tmp_ip->pnn;
2031                                 ipv4.sin = tmp_ip->addr.ip;
2032
2033                                 timeout = TAKEOVER_TIMEOUT();
2034                                 data.dsize = sizeof(ipv4);
2035                                 data.dptr  = (uint8_t *)&ipv4;
2036                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2037                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2038                                                 data, async_data,
2039                                                 &timeout, NULL);
2040                         } else {
2041                                 ip.pnn  = tmp_ip->pnn;
2042                                 ip.addr = tmp_ip->addr;
2043
2044                                 timeout = TAKEOVER_TIMEOUT();
2045                                 data.dsize = sizeof(ip);
2046                                 data.dptr  = (uint8_t *)&ip;
2047                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2048                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2049                                                 data, async_data,
2050                                                 &timeout, NULL);
2051                         }
2052
2053                         if (state == NULL) {
2054                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2055                                 talloc_free(tmp_ctx);
2056                                 return -1;
2057                         }
2058                 
2059                         ctdb_client_async_add(async_data, state);
2060                 }
2061         }
2062         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2063                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2064                 talloc_free(tmp_ctx);
2065                 return -1;
2066         }
2067         talloc_free(async_data);
2068
2069
2070         /* tell all nodes to get their own IPs */
2071         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2072         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2073         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2074                 if (tmp_ip->pnn == -1) {
2075                         /* this IP won't be taken over */
2076                         continue;
2077                 }
2078
2079                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2080                         ipv4.pnn = tmp_ip->pnn;
2081                         ipv4.sin = tmp_ip->addr.ip;
2082
2083                         timeout = TAKEOVER_TIMEOUT();
2084                         data.dsize = sizeof(ipv4);
2085                         data.dptr  = (uint8_t *)&ipv4;
2086                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2087                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2088                                         data, async_data,
2089                                         &timeout, NULL);
2090                 } else {
2091                         ip.pnn  = tmp_ip->pnn;
2092                         ip.addr = tmp_ip->addr;
2093
2094                         timeout = TAKEOVER_TIMEOUT();
2095                         data.dsize = sizeof(ip);
2096                         data.dptr  = (uint8_t *)&ip;
2097                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2098                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2099                                         data, async_data,
2100                                         &timeout, NULL);
2101                 }
2102                 if (state == NULL) {
2103                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2104                         talloc_free(tmp_ctx);
2105                         return -1;
2106                 }
2107                 
2108                 ctdb_client_async_add(async_data, state);
2109         }
2110         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2111                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2112                 talloc_free(tmp_ctx);
2113                 return -1;
2114         }
2115
2116 ipreallocated:
2117         /* tell all nodes to update natwg */
2118         /* send the flags update natgw on all connected nodes */
2119         data.dptr  = discard_const("ipreallocated");
2120         data.dsize = strlen((char *)data.dptr) + 1; 
2121         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2122         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2123                                       nodes, 0, TAKEOVER_TIMEOUT(),
2124                                       false, data,
2125                                       NULL, NULL,
2126                                       NULL) != 0) {
2127                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2128         }
2129
2130         talloc_free(tmp_ctx);
2131         return 0;
2132 }
2133
2134
2135 /*
2136   destroy a ctdb_client_ip structure
2137  */
2138 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2139 {
2140         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2141                 ctdb_addr_to_str(&ip->addr),
2142                 ntohs(ip->addr.ip.sin_port),
2143                 ip->client_id));
2144
2145         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2146         return 0;
2147 }
2148
2149 /*
2150   called by a client to inform us of a TCP connection that it is managing
2151   that should tickled with an ACK when IP takeover is done
2152   we handle both the old ipv4 style of packets as well as the new ipv4/6
2153   pdus.
2154  */
2155 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2156                                 TDB_DATA indata)
2157 {
2158         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2159         struct ctdb_control_tcp *old_addr = NULL;
2160         struct ctdb_control_tcp_addr new_addr;
2161         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2162         struct ctdb_tcp_list *tcp;
2163         struct ctdb_tcp_connection t;
2164         int ret;
2165         TDB_DATA data;
2166         struct ctdb_client_ip *ip;
2167         struct ctdb_vnn *vnn;
2168         ctdb_sock_addr addr;
2169
2170         switch (indata.dsize) {
2171         case sizeof(struct ctdb_control_tcp):
2172                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2173                 ZERO_STRUCT(new_addr);
2174                 tcp_sock = &new_addr;
2175                 tcp_sock->src.ip  = old_addr->src;
2176                 tcp_sock->dest.ip = old_addr->dest;
2177                 break;
2178         case sizeof(struct ctdb_control_tcp_addr):
2179                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2180                 break;
2181         default:
2182                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2183                                  "to ctdb_control_tcp_client. size was %d but "
2184                                  "only allowed sizes are %lu and %lu\n",
2185                                  (int)indata.dsize,
2186                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2187                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2188                 return -1;
2189         }
2190
2191         addr = tcp_sock->src;
2192         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2193         addr = tcp_sock->dest;
2194         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2195
2196         ZERO_STRUCT(addr);
2197         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2198         vnn = find_public_ip_vnn(ctdb, &addr);
2199         if (vnn == NULL) {
2200                 switch (addr.sa.sa_family) {
2201                 case AF_INET:
2202                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2203                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2204                                         ctdb_addr_to_str(&addr)));
2205                         }
2206                         break;
2207                 case AF_INET6:
2208                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2209                                 ctdb_addr_to_str(&addr)));
2210                         break;
2211                 default:
2212                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2213                 }
2214
2215                 return 0;
2216         }
2217
2218         if (vnn->pnn != ctdb->pnn) {
2219                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2220                         ctdb_addr_to_str(&addr),
2221                         client_id, client->pid));
2222                 /* failing this call will tell smbd to die */
2223                 return -1;
2224         }
2225
2226         ip = talloc(client, struct ctdb_client_ip);
2227         CTDB_NO_MEMORY(ctdb, ip);
2228
2229         ip->ctdb      = ctdb;
2230         ip->addr      = addr;
2231         ip->client_id = client_id;
2232         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2233         DLIST_ADD(ctdb->client_ip_list, ip);
2234
2235         tcp = talloc(client, struct ctdb_tcp_list);
2236         CTDB_NO_MEMORY(ctdb, tcp);
2237
2238         tcp->connection.src_addr = tcp_sock->src;
2239         tcp->connection.dst_addr = tcp_sock->dest;
2240
2241         DLIST_ADD(client->tcp_list, tcp);
2242
2243         t.src_addr = tcp_sock->src;
2244         t.dst_addr = tcp_sock->dest;
2245
2246         data.dptr = (uint8_t *)&t;
2247         data.dsize = sizeof(t);
2248
2249         switch (addr.sa.sa_family) {
2250         case AF_INET:
2251                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2252                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2253                         ctdb_addr_to_str(&tcp_sock->src),
2254                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2255                 break;
2256         case AF_INET6:
2257                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2258                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2259                         ctdb_addr_to_str(&tcp_sock->src),
2260                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2261                 break;
2262         default:
2263                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2264         }
2265
2266
2267         /* tell all nodes about this tcp connection */
2268         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2269                                        CTDB_CONTROL_TCP_ADD,
2270                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2271         if (ret != 0) {
2272                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2273                 return -1;
2274         }
2275
2276         return 0;
2277 }
2278
2279 /*
2280   find a tcp address on a list
2281  */
2282 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2283                                            struct ctdb_tcp_connection *tcp)
2284 {
2285         int i;
2286
2287         if (array == NULL) {
2288                 return NULL;
2289         }
2290
2291         for (i=0;i<array->num;i++) {
2292                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2293                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2294                         return &array->connections[i];
2295                 }
2296         }
2297         return NULL;
2298 }
2299
2300
2301
2302 /*
2303   called by a daemon to inform us of a TCP connection that one of its
2304   clients managing that should tickled with an ACK when IP takeover is
2305   done
2306  */
2307 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2308 {
2309         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2310         struct ctdb_tcp_array *tcparray;
2311         struct ctdb_tcp_connection tcp;
2312         struct ctdb_vnn *vnn;
2313
2314         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2315         if (vnn == NULL) {
2316                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2317                         ctdb_addr_to_str(&p->dst_addr)));
2318
2319                 return -1;
2320         }
2321
2322
2323         tcparray = vnn->tcp_array;
2324
2325         /* If this is the first tickle */
2326         if (tcparray == NULL) {
2327                 tcparray = talloc_size(ctdb->nodes, 
2328                         offsetof(struct ctdb_tcp_array, connections) +
2329                         sizeof(struct ctdb_tcp_connection) * 1);
2330                 CTDB_NO_MEMORY(ctdb, tcparray);
2331                 vnn->tcp_array = tcparray;
2332
2333                 tcparray->num = 0;
2334                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2335                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2336
2337                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2338                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2339                 tcparray->num++;
2340
2341                 if (tcp_update_needed) {
2342                         vnn->tcp_update_needed = true;
2343                 }
2344                 return 0;
2345         }
2346
2347
2348         /* Do we already have this tickle ?*/
2349         tcp.src_addr = p->src_addr;
2350         tcp.dst_addr = p->dst_addr;
2351         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2352                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2353                         ctdb_addr_to_str(&tcp.dst_addr),
2354                         ntohs(tcp.dst_addr.ip.sin_port),
2355                         vnn->pnn));
2356                 return 0;
2357         }
2358
2359         /* A new tickle, we must add it to the array */
2360         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2361                                         struct ctdb_tcp_connection,
2362                                         tcparray->num+1);
2363         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2364
2365         vnn->tcp_array = tcparray;
2366         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2367         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2368         tcparray->num++;
2369                                 
2370         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2371                 ctdb_addr_to_str(&tcp.dst_addr),
2372                 ntohs(tcp.dst_addr.ip.sin_port),
2373                 vnn->pnn));
2374
2375         if (tcp_update_needed) {
2376                 vnn->tcp_update_needed = true;
2377         }
2378
2379         return 0;
2380 }
2381
2382
2383 /*
2384   called by a daemon to inform us of a TCP connection that one of its
2385   clients managing that should tickled with an ACK when IP takeover is
2386   done
2387  */
2388 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2389 {
2390         struct ctdb_tcp_connection *tcpp;
2391         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2392
2393         if (vnn == NULL) {
2394                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2395                         ctdb_addr_to_str(&conn->dst_addr)));
2396                 return;
2397         }
2398
2399         /* if the array is empty we cant remove it
2400            and we dont need to do anything
2401          */
2402         if (vnn->tcp_array == NULL) {
2403                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2404                         ctdb_addr_to_str(&conn->dst_addr),
2405                         ntohs(conn->dst_addr.ip.sin_port)));
2406                 return;
2407         }
2408
2409
2410         /* See if we know this connection
2411            if we dont know this connection  then we dont need to do anything
2412          */
2413         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2414         if (tcpp == NULL) {
2415                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2416                         ctdb_addr_to_str(&conn->dst_addr),
2417                         ntohs(conn->dst_addr.ip.sin_port)));
2418                 return;
2419         }
2420
2421
2422         /* We need to remove this entry from the array.
2423            Instead of allocating a new array and copying data to it
2424            we cheat and just copy the last entry in the existing array
2425            to the entry that is to be removed and just shring the 
2426            ->num field
2427          */
2428         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2429         vnn->tcp_array->num--;
2430
2431         /* If we deleted the last entry we also need to remove the entire array
2432          */
2433         if (vnn->tcp_array->num == 0) {
2434                 talloc_free(vnn->tcp_array);
2435                 vnn->tcp_array = NULL;
2436         }               
2437
2438         vnn->tcp_update_needed = true;
2439
2440         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2441                 ctdb_addr_to_str(&conn->src_addr),
2442                 ntohs(conn->src_addr.ip.sin_port)));
2443 }
2444
2445
2446 /*
2447   called by a daemon to inform us of a TCP connection that one of its
2448   clients used are no longer needed in the tickle database
2449  */
2450 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2451 {
2452         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2453
2454         ctdb_remove_tcp_connection(ctdb, conn);
2455
2456         return 0;
2457 }
2458
2459
/*
  called when a daemon restarts - intended to send all tickles for all
  public addresses we are serving to the new node.

  Not implemented yet: both arguments are ignored and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2469
2470
2471 /*
2472   called when a client structure goes away - hook to remove
2473   elements from the tcp_list in all daemons
2474  */
2475 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2476 {
2477         while (client->tcp_list) {
2478                 struct ctdb_tcp_list *tcp = client->tcp_list;
2479                 DLIST_REMOVE(client->tcp_list, tcp);
2480                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2481         }
2482 }
2483
2484
2485 /*
2486   release all IPs on shutdown
2487  */
2488 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2489 {
2490         struct ctdb_vnn *vnn;
2491
2492         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2493                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2494                         ctdb_vnn_unassign_iface(ctdb, vnn);
2495                         continue;
2496                 }
2497                 if (!vnn->iface) {
2498                         continue;
2499                 }
2500                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2501                                   ctdb_vnn_iface_string(vnn),
2502                                   ctdb_addr_to_str(&vnn->public_address),
2503                                   vnn->public_netmask_bits);
2504                 release_kill_clients(ctdb, &vnn->public_address);
2505                 ctdb_vnn_unassign_iface(ctdb, vnn);
2506         }
2507 }
2508
2509
2510 /*
2511   get list of public IPs
2512  */
2513 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2514                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2515 {
2516         int i, num, len;
2517         struct ctdb_all_public_ips *ips;
2518         struct ctdb_vnn *vnn;
2519         bool only_available = false;
2520
2521         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2522                 only_available = true;
2523         }
2524
2525         /* count how many public ip structures we have */
2526         num = 0;
2527         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2528                 num++;
2529         }
2530
2531         len = offsetof(struct ctdb_all_public_ips, ips) + 
2532                 num*sizeof(struct ctdb_public_ip);
2533         ips = talloc_zero_size(outdata, len);
2534         CTDB_NO_MEMORY(ctdb, ips);
2535
2536         i = 0;
2537         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2538                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2539                         continue;
2540                 }
2541                 ips->ips[i].pnn  = vnn->pnn;
2542                 ips->ips[i].addr = vnn->public_address;
2543                 i++;
2544         }
2545         ips->num = i;
2546         len = offsetof(struct ctdb_all_public_ips, ips) +
2547                 i*sizeof(struct ctdb_public_ip);
2548
2549         outdata->dsize = len;
2550         outdata->dptr  = (uint8_t *)ips;
2551
2552         return 0;
2553 }
2554
2555
2556 /*
2557   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2558  */
2559 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2560                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2561 {
2562         int i, num, len;
2563         struct ctdb_all_public_ipsv4 *ips;
2564         struct ctdb_vnn *vnn;
2565
2566         /* count how many public ip structures we have */
2567         num = 0;
2568         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2569                 if (vnn->public_address.sa.sa_family != AF_INET) {
2570                         continue;
2571                 }
2572                 num++;
2573         }
2574
2575         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2576                 num*sizeof(struct ctdb_public_ipv4);
2577         ips = talloc_zero_size(outdata, len);
2578         CTDB_NO_MEMORY(ctdb, ips);
2579
2580         outdata->dsize = len;
2581         outdata->dptr  = (uint8_t *)ips;
2582
2583         ips->num = num;
2584         i = 0;
2585         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2586                 if (vnn->public_address.sa.sa_family != AF_INET) {
2587                         continue;
2588                 }
2589                 ips->ips[i].pnn = vnn->pnn;
2590                 ips->ips[i].sin = vnn->public_address.ip;
2591                 i++;
2592         }
2593
2594         return 0;
2595 }
2596
2597 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2598                                         struct ctdb_req_control *c,
2599                                         TDB_DATA indata,
2600                                         TDB_DATA *outdata)
2601 {
2602         int i, num, len;
2603         ctdb_sock_addr *addr;
2604         struct ctdb_control_public_ip_info *info;
2605         struct ctdb_vnn *vnn;
2606
2607         addr = (ctdb_sock_addr *)indata.dptr;
2608
2609         vnn = find_public_ip_vnn(ctdb, addr);
2610         if (vnn == NULL) {
2611                 /* if it is not a public ip   it could be our 'single ip' */
2612                 if (ctdb->single_ip_vnn) {
2613                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2614                                 vnn = ctdb->single_ip_vnn;
2615                         }
2616                 }
2617         }
2618         if (vnn == NULL) {
2619                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2620                                  "'%s'not a public address\n",
2621                                  ctdb_addr_to_str(addr)));
2622                 return -1;
2623         }
2624
2625         /* count how many public ip structures we have */
2626         num = 0;
2627         for (;vnn->ifaces[num];) {
2628                 num++;
2629         }
2630
2631         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2632                 num*sizeof(struct ctdb_control_iface_info);
2633         info = talloc_zero_size(outdata, len);
2634         CTDB_NO_MEMORY(ctdb, info);
2635
2636         info->ip.addr = vnn->public_address;
2637         info->ip.pnn = vnn->pnn;
2638         info->active_idx = 0xFFFFFFFF;
2639
2640         for (i=0; vnn->ifaces[i]; i++) {
2641                 struct ctdb_iface *cur;
2642
2643                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2644                 if (cur == NULL) {
2645                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2646                                            vnn->ifaces[i]));
2647                         return -1;
2648                 }
2649                 if (vnn->iface == cur) {
2650                         info->active_idx = i;
2651                 }
2652                 strcpy(info->ifaces[i].name, cur->name);
2653                 info->ifaces[i].link_state = cur->link_up;
2654                 info->ifaces[i].references = cur->references;
2655         }
2656         info->num = i;
2657         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2658                 i*sizeof(struct ctdb_control_iface_info);
2659
2660         outdata->dsize = len;
2661         outdata->dptr  = (uint8_t *)info;
2662
2663         return 0;
2664 }
2665
2666 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2667                                 struct ctdb_req_control *c,
2668                                 TDB_DATA *outdata)
2669 {
2670         int i, num, len;
2671         struct ctdb_control_get_ifaces *ifaces;
2672         struct ctdb_iface *cur;
2673
2674         /* count how many public ip structures we have */
2675         num = 0;
2676         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2677                 num++;
2678         }
2679
2680         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2681                 num*sizeof(struct ctdb_control_iface_info);
2682         ifaces = talloc_zero_size(outdata, len);
2683         CTDB_NO_MEMORY(ctdb, ifaces);
2684
2685         i = 0;
2686         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2687                 strcpy(ifaces->ifaces[i].name, cur->name);
2688                 ifaces->ifaces[i].link_state = cur->link_up;
2689                 ifaces->ifaces[i].references = cur->references;
2690                 i++;
2691         }
2692         ifaces->num = i;
2693         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2694                 i*sizeof(struct ctdb_control_iface_info);
2695
2696         outdata->dsize = len;
2697         outdata->dptr  = (uint8_t *)ifaces;
2698
2699         return 0;
2700 }
2701
2702 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2703                                     struct ctdb_req_control *c,
2704                                     TDB_DATA indata)
2705 {
2706         struct ctdb_control_iface_info *info;
2707         struct ctdb_iface *iface;
2708         bool link_up = false;
2709
2710         info = (struct ctdb_control_iface_info *)indata.dptr;
2711
2712         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2713                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2714                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2715                                   len, len, info->name));
2716                 return -1;
2717         }
2718
2719         switch (info->link_state) {
2720         case 0:
2721                 link_up = false;
2722                 break;
2723         case 1:
2724                 link_up = true;
2725                 break;
2726         default:
2727                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2728                                   (unsigned int)info->link_state));
2729                 return -1;
2730         }
2731
2732         if (info->references != 0) {
2733                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2734                                   (unsigned int)info->references));
2735                 return -1;
2736         }
2737
2738         iface = ctdb_find_iface(ctdb, info->name);
2739         if (iface == NULL) {
2740                 return -1;
2741         }
2742
2743         if (link_up == iface->link_up) {
2744                 return 0;
2745         }
2746
2747         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2748               ("iface[%s] has changed it's link status %s => %s\n",
2749                iface->name,
2750                iface->link_up?"up":"down",
2751                link_up?"up":"down"));
2752
2753         iface->link_up = link_up;
2754         return 0;
2755 }
2756
2757
2758 /* 
2759    structure containing the listening socket and the list of tcp connections
2760    that the ctdb daemon is to kill
2761 */
2762 struct ctdb_kill_tcp {
2763         struct ctdb_vnn *vnn; /* public address this state hangs off; vnn->killtcp points back here */
2764         struct ctdb_context *ctdb; /* daemon context, used to reach the event context */
2765         int capture_fd; /* capture socket, -1 until ctdb_sys_open_capture_socket() succeeded */
2766         struct fd_event *fde; /* read event on capture_fd, NULL until registered */
2767         trbt_tree_t *connections; /* ctdb_killtcp_con entries, keyed by killtcp_key() */
2768         void *private_data; /* opaque pointer filled in when the capture socket is opened and handed to ctdb_sys_read_tcp_packet() */
2769 };
2770
2771 /*
2772   a tcp connection that is to be killed
2773  */
2774 struct ctdb_killtcp_con {
2775         ctdb_sock_addr src_addr; /* source of the connection, after ctdb_canonicalize_ip() */
2776         ctdb_sock_addr dst_addr; /* destination of the connection, after ctdb_canonicalize_ip() */
2777         int count; /* tickles sent from the periodic timer; the entry is dropped once this reaches 5 */
2778         struct ctdb_kill_tcp *killtcp; /* the per-vnn state this entry belongs to */
2779 };
2780
2781 /* this function is used to create a key to represent this socketpair
2782    in the killtcp tree.
2783    this key is used to insert and lookup matching socketpairs that are
2784    to be tickled and RST
2785 */
2786 #define KILLTCP_KEYLEN  10
2787 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2788 {
2789         static uint32_t key[KILLTCP_KEYLEN];
2790
2791         bzero(key, sizeof(key));
2792
2793         if (src->sa.sa_family != dst->sa.sa_family) {
2794                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2795                 return key;
2796         }
2797         
2798         switch (src->sa.sa_family) {
2799         case AF_INET:
2800                 key[0]  = dst->ip.sin_addr.s_addr;
2801                 key[1]  = src->ip.sin_addr.s_addr;
2802                 key[2]  = dst->ip.sin_port;
2803                 key[3]  = src->ip.sin_port;
2804                 break;
2805         case AF_INET6: {
2806                 uint32_t *dst6_addr32 =
2807                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2808                 uint32_t *src6_addr32 =
2809                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2810                 key[0]  = dst6_addr32[3];
2811                 key[1]  = src6_addr32[3];
2812                 key[2]  = dst6_addr32[2];
2813                 key[3]  = src6_addr32[2];
2814                 key[4]  = dst6_addr32[1];
2815                 key[5]  = src6_addr32[1];
2816                 key[6]  = dst6_addr32[0];
2817                 key[7]  = src6_addr32[0];
2818                 key[8]  = dst->ip6.sin6_port;
2819                 key[9]  = src->ip6.sin6_port;
2820                 break;
2821         }
2822         default:
2823                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2824                 return key;
2825         }
2826
2827         return key;
2828 }
2829
2830 /*
2831   called when we get a read event on the raw socket
2832  */
2833 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2834                                 uint16_t flags, void *private_data)
2835 {
2836         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2837         struct ctdb_killtcp_con *con;
2838         ctdb_sock_addr src, dst;
2839         uint32_t ack_seq, seq;
2840
2841         if (!(flags & EVENT_FD_READ)) {
2842                 return;
2843         }
2844
2845         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2846                                 killtcp->private_data,
2847                                 &src, &dst,
2848                                 &ack_seq, &seq) != 0) {
2849                 /* probably a non-tcp ACK packet */
2850                 return;
2851         }
2852
2853         /* check if we have this guy in our list of connections
2854            to kill
2855         */
2856         con = trbt_lookuparray32(killtcp->connections, 
2857                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2858         if (con == NULL) {
2859                 /* no this was some other packet we can just ignore */
2860                 return;
2861         }
2862
2863         /* This one has been tickled !
2864            now reset him and remove him from the list.
2865          */
2866         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2867                 ntohs(con->dst_addr.ip.sin_port),
2868                 ctdb_addr_to_str(&con->src_addr),
2869                 ntohs(con->src_addr.ip.sin_port)));
2870
2871         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2872         talloc_free(con);
2873 }
2874
2875
2876 /* when traversing the list of all tcp connections to send tickle acks to
2877    (so that we can capture the ack coming back and kill the connection
2878     by a RST)
2879    this callback is called for each connection we are currently trying to kill
2880 */
2881 static int tickle_connection_traverse(void *param, void *data)
2882 {
2883         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2884
2885         /* have tried too many times, just give up */
2886         if (con->count >= 5) {
2887                 /* can't delete in traverse: reparent to delete_cons */
2888                 talloc_steal(param, con);
2889                 return 0;
2890         }
2891
2892         /* othervise, try tickling it again */
2893         con->count++;
2894         ctdb_sys_send_tcp(
2895                 (ctdb_sock_addr *)&con->dst_addr,
2896                 (ctdb_sock_addr *)&con->src_addr,
2897                 0, 0, 0);
2898         return 0;
2899 }
2900
2901
2902 /* 
2903    called every second until all sentenced connections have been reset
2904  */
2905 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2906                                               struct timeval t, void *private_data)
2907 {
2908         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2909         void *delete_cons = talloc_new(NULL);
2910
2911         /* loop over all connections sending tickle ACKs */
2912         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2913
2914         /* now we've finished traverse, it's safe to do deletion. */
2915         talloc_free(delete_cons);
2916
2917         /* If there are no more connections to kill we can remove the
2918            entire killtcp structure
2919          */
2920         if ( (killtcp->connections == NULL) || 
2921              (killtcp->connections->root == NULL) ) {
2922                 talloc_free(killtcp);
2923                 return;
2924         }
2925
2926         /* try tickling them again in a seconds time
2927          */
2928         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2929                         ctdb_tickle_sentenced_connections, killtcp);
2930 }
2931
2932 /*
2933   destroy the killtcp structure
2934  */
2935 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2936 {
2937         if (killtcp->vnn) {
2938                 killtcp->vnn->killtcp = NULL;
2939         }
2940         return 0;
2941 }
2942
2943
/* insert callback for the killtcp tree: the new entry (parm) always
   replaces whatever was stored before (data).

   The previous entry is intentionally not freed here; per the original
   author's note it stays attached to the same tree node and goes away
   together with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
2955
2956 /*
2957   add a tcp socket to the list of connections we want to RST
2958  */
2959 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2960                                        ctdb_sock_addr *s,
2961                                        ctdb_sock_addr *d)
2962 {
2963         ctdb_sock_addr src, dst;
2964         struct ctdb_kill_tcp *killtcp;
2965         struct ctdb_killtcp_con *con;
2966         struct ctdb_vnn *vnn;
2967
2968         ctdb_canonicalize_ip(s, &src);
2969         ctdb_canonicalize_ip(d, &dst);
2970
2971         vnn = find_public_ip_vnn(ctdb, &dst);
2972         if (vnn == NULL) {
2973                 vnn = find_public_ip_vnn(ctdb, &src);
2974         }
2975         if (vnn == NULL) {
2976                 /* if it is not a public ip   it could be our 'single ip' */
2977                 if (ctdb->single_ip_vnn) {
2978                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2979                                 vnn = ctdb->single_ip_vnn;
2980                         }
2981                 }
2982         }
2983         if (vnn == NULL) {
2984                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2985                 return -1;
2986         }
2987
2988         killtcp = vnn->killtcp;
2989         
2990         /* If this is the first connection to kill we must allocate
2991            a new structure
2992          */
2993         if (killtcp == NULL) {
2994                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2995                 CTDB_NO_MEMORY(ctdb, killtcp);
2996
2997                 killtcp->vnn         = vnn;
2998                 killtcp->ctdb        = ctdb;
2999                 killtcp->capture_fd  = -1;
3000                 killtcp->connections = trbt_create(killtcp, 0);
3001
3002                 vnn->killtcp         = killtcp;
3003                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3004         }
3005
3006
3007
3008         /* create a structure that describes this connection we want to
3009            RST and store it in killtcp->connections
3010         */
3011         con = talloc(killtcp, struct ctdb_killtcp_con);
3012         CTDB_NO_MEMORY(ctdb, con);
3013         con->src_addr = src;
3014         con->dst_addr = dst;
3015         con->count    = 0;
3016         con->killtcp  = killtcp;
3017
3018
3019         trbt_insertarray32_callback(killtcp->connections,
3020                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3021                         add_killtcp_callback, con);
3022
3023         /* 
3024            If we dont have a socket to listen on yet we must create it
3025          */
3026         if (killtcp->capture_fd == -1) {
3027                 const char *iface = ctdb_vnn_iface_string(vnn);
3028                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3029                 if (killtcp->capture_fd == -1) {
3030                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3031                                           "socket on iface '%s' for killtcp (%s)\n",
3032                                           iface, strerror(errno)));
3033                         goto failed;
3034                 }
3035         }
3036
3037
3038         if (killtcp->fde == NULL) {
3039                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3040                                             EVENT_FD_READ,
3041                                             capture_tcp_handler, killtcp);
3042                 tevent_fd_set_auto_close(killtcp->fde);
3043
3044                 /* We also need to set up some events to tickle all these connections
3045                    until they are all reset
3046                 */
3047                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3048                                 ctdb_tickle_sentenced_connections, killtcp);
3049         }
3050
3051         /* tickle him once now */
3052         ctdb_sys_send_tcp(
3053                 &con->dst_addr,
3054                 &con->src_addr,
3055                 0, 0, 0);
3056
3057         return 0;
3058
3059 failed:
3060         talloc_free(vnn->killtcp);
3061         vnn->killtcp = NULL;
3062         return -1;
3063 }
3064
3065 /*
3066   kill a TCP connection.
3067  */
3068 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3069 {
3070         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3071
3072         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3073 }
3074
3075 /*
3076   called by a daemon to inform us of the entire list of TCP tickles for
3077   a particular public address.
3078   this control should only be sent by the node that is currently serving
3079   that public address.
3080  */
3081 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3082 {
3083         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3084         struct ctdb_tcp_array *tcparray;
3085         struct ctdb_vnn *vnn;
3086
3087         /* We must at least have tickles.num or else we cant verify the size
3088            of the received data blob
3089          */
3090         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3091                                         tickles.connections)) {
3092                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3093                 return -1;
3094         }
3095
3096         /* verify that the size of data matches what we expect */
3097         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3098                                 tickles.connections)
3099                          + sizeof(struct ctdb_tcp_connection)
3100                                  * list->tickles.num) {
3101                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3102                 return -1;
3103         }       
3104
3105         vnn = find_public_ip_vnn(ctdb, &list->addr);
3106         if (vnn == NULL) {
3107                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3108                         ctdb_addr_to_str(&list->addr)));
3109
3110                 return 1;
3111         }
3112
3113         /* remove any old ticklelist we might have */
3114         talloc_free(vnn->tcp_array);
3115         vnn->tcp_array = NULL;
3116
3117         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3118         CTDB_NO_MEMORY(ctdb, tcparray);
3119
3120         tcparray->num = list->tickles.num;
3121
3122         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3123         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3124
3125         memcpy(tcparray->connections, &list->tickles.connections[0], 
3126                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3127
3128         /* We now have a new fresh tickle list array for this vnn */
3129         vnn->tcp_array = talloc_steal(vnn, tcparray);
3130         
3131         return 0;
3132 }
3133
3134 /*
3135   called to return the full list of tickles for the puclic address associated 
3136   with the provided vnn
3137  */
3138 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3139 {
3140         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3141         struct ctdb_control_tcp_tickle_list *list;
3142         struct ctdb_tcp_array *tcparray;
3143         int num;
3144         struct ctdb_vnn *vnn;
3145
3146         vnn = find_public_ip_vnn(ctdb, addr);
3147         if (vnn == NULL) {
3148                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3149                         ctdb_addr_to_str(addr)));
3150
3151                 return 1;
3152         }
3153
3154         tcparray = vnn->tcp_array;
3155         if (tcparray) {
3156                 num = tcparray->num;
3157         } else {
3158                 num = 0;
3159         }
3160
3161         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3162                                 tickles.connections)
3163                         + sizeof(struct ctdb_tcp_connection) * num;
3164
3165         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3166         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3167         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3168
3169         list->addr = *addr;
3170         list->tickles.num = num;
3171         if (num) {
3172                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3173                         sizeof(struct ctdb_tcp_connection) * num);
3174         }
3175
3176         return 0;
3177 }
3178
3179
3180 /*
3181   set the list of all tcp tickles for a public address
3182  */
3183 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3184                               struct timeval timeout, uint32_t destnode, 
3185                               ctdb_sock_addr *addr,
3186                               struct ctdb_tcp_array *tcparray)
3187 {
3188         int ret, num;
3189         TDB_DATA data;
3190         struct ctdb_control_tcp_tickle_list *list;
3191
3192         if (tcparray) {
3193                 num = tcparray->num;
3194         } else {
3195                 num = 0;
3196         }
3197
3198         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3199                                 tickles.connections) +
3200                         sizeof(struct ctdb_tcp_connection) * num;
3201         data.dptr = talloc_size(ctdb, data.dsize);
3202         CTDB_NO_MEMORY(ctdb, data.dptr);
3203
3204         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3205         list->addr = *addr;
3206         list->tickles.num = num;
3207         if (tcparray) {
3208                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3209         }
3210
3211         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3212                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3213                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3214         if (ret != 0) {
3215                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3216                 return -1;
3217         }
3218
3219         talloc_free(data.dptr);
3220
3221         return ret;
3222 }
3223
3224
3225 /*
3226   perform tickle updates if required
3227  */
3228 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3229                                 struct timed_event *te, 
3230                                 struct timeval t, void *private_data)
3231 {
3232         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3233         int ret;
3234         struct ctdb_vnn *vnn;
3235
3236         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3237                 /* we only send out updates for public addresses that 
3238                    we have taken over
3239                  */
3240                 if (ctdb->pnn != vnn->pnn) {
3241                         continue;
3242                 }
3243                 /* We only send out the updates if we need to */
3244                 if (!vnn->tcp_update_needed) {
3245                         continue;
3246                 }
3247                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3248                                 TAKEOVER_TIMEOUT(),
3249                                 CTDB_BROADCAST_CONNECTED,
3250                                 &vnn->public_address,
3251                                 vnn->tcp_array);
3252                 if (ret != 0) {
3253                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3254                                 ctdb_addr_to_str(&vnn->public_address)));
3255                 }
3256         }
3257
3258         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3259                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3260                              ctdb_update_tcp_tickles, ctdb);
3261 }               
3262         
3263
3264 /*
3265   start periodic update of tcp tickles
3266  */
3267 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3268 {
3269         ctdb->tickle_update_context = talloc_new(ctdb);
3270
3271         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3272                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3273                              ctdb_update_tcp_tickles, ctdb);
3274 }
3275
3276
3277
3278
/*
  State for one series of gratuitous ARP transmissions.  An instance is
  created by ctdb_control_send_gratious_arp() and handed to
  send_gratious_arp() as its private data.
 */
struct control_gratious_arp {
        struct ctdb_context *ctdb;      /* daemon context; its event context is used to schedule the sends */
        ctdb_sock_addr addr;            /* address passed to ctdb_sys_send_arp() */
        const char *iface;              /* interface name passed to ctdb_sys_send_arp() */
        int count;                      /* sends performed so far; the series stops at CTDB_ARP_REPEAT */
};
3285
3286 /*
3287   send a control_gratuitous arp
3288  */
3289 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3290                                   struct timeval t, void *private_data)
3291 {
3292         int ret;
3293         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3294                                                         struct control_gratious_arp);
3295
3296         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3297         if (ret != 0) {
3298                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3299                                  arp->iface, strerror(errno)));
3300         }
3301
3302
3303         arp->count++;
3304         if (arp->count == CTDB_ARP_REPEAT) {
3305                 talloc_free(arp);
3306                 return;
3307         }
3308
3309         event_add_timed(arp->ctdb->ev, arp, 
3310                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3311                         send_gratious_arp, arp);
3312 }
3313
3314
3315 /*
3316   send a gratious arp 
3317  */
3318 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3319 {
3320         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3321         struct control_gratious_arp *arp;
3322
3323         /* verify the size of indata */
3324         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3325                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3326                                  (unsigned)indata.dsize, 
3327                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3328                 return -1;
3329         }
3330         if (indata.dsize != 
3331                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3332                 + gratious_arp->len ) ){
3333
3334                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3335                         "but should be %u bytes\n", 
3336                          (unsigned)indata.dsize, 
3337                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3338                 return -1;
3339         }
3340
3341
3342         arp = talloc(ctdb, struct control_gratious_arp);
3343         CTDB_NO_MEMORY(ctdb, arp);
3344
3345         arp->ctdb  = ctdb;
3346         arp->addr   = gratious_arp->addr;
3347         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3348         CTDB_NO_MEMORY(ctdb, arp->iface);
3349         arp->count = 0;
3350         
3351         event_add_timed(arp->ctdb->ev, arp, 
3352                         timeval_zero(), send_gratious_arp, arp);
3353
3354         return 0;
3355 }
3356
3357 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3358 {
3359         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3360         int ret;
3361
3362         /* verify the size of indata */
3363         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3364                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3365                 return -1;
3366         }
3367         if (indata.dsize != 
3368                 ( offsetof(struct ctdb_control_ip_iface, iface)
3369                 + pub->len ) ){
3370
3371                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3372                         "but should be %u bytes\n", 
3373                          (unsigned)indata.dsize, 
3374                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3375                 return -1;
3376         }
3377
3378         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
3379
3380         if (ret != 0) {
3381                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3382                 return -1;
3383         }
3384
3385         return 0;
3386 }
3387
/*
  Completion callback for the releaseip event started by
  ctdb_control_del_public_address().  private_data is the talloc
  context passed in when the event was started; it is freed here.
  ctdb and status are not used.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        (void)ctdb;
        (void)status;

        talloc_free(private_data);
}
3396
3397 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3398 {
3399         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3400         struct ctdb_vnn *vnn;
3401         int ret;
3402
3403         /* verify the size of indata */
3404         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3405                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3406                 return -1;
3407         }
3408         if (indata.dsize != 
3409                 ( offsetof(struct ctdb_control_ip_iface, iface)
3410                 + pub->len ) ){
3411
3412                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3413                         "but should be %u bytes\n", 
3414                          (unsigned)indata.dsize, 
3415                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3416                 return -1;
3417         }
3418
3419         /* walk over all public addresses until we find a match */
3420         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3421                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3422                         TALLOC_CTX *mem_ctx;
3423
3424                         DLIST_REMOVE(ctdb->vnn, vnn);
3425                         if (vnn->pnn != ctdb->pnn) {
3426                                 if (vnn->iface != NULL) {
3427                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3428                                 }
3429                                 talloc_free(vnn);
3430                                 return 0;
3431                         }
3432                         vnn->pnn = -1;
3433
3434                         mem_ctx = talloc_new(ctdb);
3435                         talloc_steal(mem_ctx, vnn);
3436                         ret = ctdb_event_script_callback(ctdb, 
3437                                          mem_ctx, delete_ip_callback, mem_ctx,
3438                                          false,
3439                                          CTDB_EVENT_RELEASE_IP,
3440                                          "%s %s %u",
3441                                          ctdb_vnn_iface_string(vnn),
3442                                          ctdb_addr_to_str(&vnn->public_address),
3443                                          vnn->public_netmask_bits);
3444                         if (vnn->iface != NULL) {
3445                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3446                         }
3447                         if (ret != 0) {
3448                                 return -1;
3449                         }
3450                         return 0;
3451                 }
3452         }
3453
3454         return -1;
3455 }
3456
3457 /* This function is called from the recovery daemon to verify that a remote
3458    node has the expected ip allocation.
3459    This is verified against ctdb->ip_tree
3460 */
3461 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3462 {
3463         struct ctdb_public_ip_list *tmp_ip; 
3464         int i;
3465
3466         if (ctdb->ip_tree == NULL) {
3467                 /* dont know the expected allocation yet, assume remote node
3468                    is correct. */
3469                 return 0;
3470         }
3471
3472         if (ips == NULL) {
3473                 return 0;
3474         }
3475
3476         for (i=0; i<ips->num; i++) {
3477                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3478                 if (tmp_ip == NULL) {
3479                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3480                         return -1;
3481                 }
3482
3483                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3484                         continue;
3485                 }
3486
3487                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3488                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3489                         return -1;
3490                 }
3491         }
3492
3493         return 0;
3494 }
3495
3496 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3497 {
3498         struct ctdb_public_ip_list *tmp_ip; 
3499
3500         if (ctdb->ip_tree == NULL) {
3501                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3502                 return -1;
3503         }
3504
3505         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3506         if (tmp_ip == NULL) {
3507                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3508                 return -1;
3509         }
3510
3511         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3512         tmp_ip->pnn = ip->pnn;
3513
3514         return 0;
3515 }