new version 1.0.112-42
[sahlberg/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* timeval built from the takeover_timeout tunable; expects a variable
   named 'ctdb' to be in scope at the point of use */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* first argument handed to timeval_current_ofs() when the arp timer is re-armed */
#define CTDB_ARP_INTERVAL 1
/* number of timer rounds after which the arp state is freed */
#define CTDB_ARP_REPEAT   3
35
36 struct ctdb_takeover_arp {
37         struct ctdb_context *ctdb;
38         uint32_t count;
39         ctdb_sock_addr addr;
40         struct ctdb_tcp_array *tcparray;
41         struct ctdb_vnn *vnn;
42 };
43
44
45 /*
46   lists of tcp endpoints
47  */
48 struct ctdb_tcp_list {
49         struct ctdb_tcp_list *prev, *next;
50         struct ctdb_tcp_connection connection;
51 };
52
53 /*
54   list of clients to kill on IP release
55  */
56 struct ctdb_client_ip {
57         struct ctdb_client_ip *prev, *next;
58         struct ctdb_context *ctdb;
59         ctdb_sock_addr addr;
60         uint32_t client_id;
61 };
62
63
64 /*
65   send a gratuitous arp
66  */
67 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
68                                   struct timeval t, void *private_data)
69 {
70         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
71                                                         struct ctdb_takeover_arp);
72         int i, ret;
73         struct ctdb_tcp_array *tcparray;
74
75         ret = ctdb_sys_send_arp(&arp->addr, arp->vnn->iface);
76         if (ret != 0) {
77                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed (%s)\n", strerror(errno)));
78         }
79
80         tcparray = arp->tcparray;
81         if (tcparray) {
82                 for (i=0;i<tcparray->num;i++) {
83                         struct ctdb_tcp_connection *tcon;
84
85                         tcon = &tcparray->connections[i];
86                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
87                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
88                                 ctdb_addr_to_str(&tcon->src_addr),
89                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
90                         ret = ctdb_sys_send_tcp(
91                                 &tcon->src_addr, 
92                                 &tcon->dst_addr,
93                                 0, 0, 0);
94                         if (ret != 0) {
95                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
96                                         ctdb_addr_to_str(&tcon->src_addr)));
97                         }
98                 }
99         }
100
101         arp->count++;
102
103         if (arp->count == CTDB_ARP_REPEAT) {
104                 talloc_free(arp);
105                 return;
106         }
107
108         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
109                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
110                         ctdb_control_send_arp, arp);
111 }
112
113 struct takeover_callback_state {
114         struct ctdb_req_control *c;
115         ctdb_sock_addr *addr;
116         struct ctdb_vnn *vnn;
117 };
118
119 /*
120   called when takeip event finishes
121  */
122 static void takeover_ip_callback(struct ctdb_context *ctdb, int status, 
123                                  void *private_data)
124 {
125         struct takeover_callback_state *state = 
126                 talloc_get_type(private_data, struct takeover_callback_state);
127         struct ctdb_takeover_arp *arp;
128         struct ctdb_tcp_array *tcparray;
129
130         if (status != 0) {
131                 if (status == -ETIME) {
132                         ctdb_ban_self(ctdb);
133                 }
134                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
135                         ctdb_addr_to_str(state->addr),
136                         state->vnn->iface));
137                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
138                 talloc_free(state);
139                 return;
140         }
141
142         if (!state->vnn->takeover_ctx) {
143                 state->vnn->takeover_ctx = talloc_new(state->vnn);
144                 if (!state->vnn->takeover_ctx) {
145                         goto failed;
146                 }
147         }
148
149         arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
150         if (!arp) goto failed;
151         
152         arp->ctdb = ctdb;
153         arp->addr = *state->addr;
154         arp->vnn  = state->vnn;
155
156         tcparray = state->vnn->tcp_array;
157         if (tcparray) {
158                 /* add all of the known tcp connections for this IP to the
159                    list of tcp connections to send tickle acks for */
160                 arp->tcparray = talloc_steal(arp, tcparray);
161
162                 state->vnn->tcp_array = NULL;
163                 state->vnn->tcp_update_needed = true;
164         }
165
166         event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx, 
167                         timeval_zero(), ctdb_control_send_arp, arp);
168
169         /* the control succeeded */
170         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
171         talloc_free(state);
172         return;
173
174 failed:
175         ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
176         talloc_free(state);
177         return;
178 }
179
180 /*
181   Find the vnn of the node that has a public ip address
182   returns -1 if the address is not known as a public address
183  */
184 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
185 {
186         struct ctdb_vnn *vnn;
187
188         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
189                 if (ctdb_same_ip(&vnn->public_address, addr)) {
190                         return vnn;
191                 }
192         }
193
194         return NULL;
195 }
196
197
198 /*
199   take over an ip address
200  */
201 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
202                                  struct ctdb_req_control *c,
203                                  TDB_DATA indata, 
204                                  bool *async_reply)
205 {
206         int ret;
207         struct takeover_callback_state *state;
208         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
209         struct ctdb_vnn *vnn;
210
211         /* update out vnn list */
212         vnn = find_public_ip_vnn(ctdb, &pip->addr);
213         if (vnn == NULL) {
214                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n", 
215                         ctdb_addr_to_str(&pip->addr)));
216                 return 0;
217         }
218         vnn->pnn = pip->pnn;
219
220         /* if our kernel already has this IP, do nothing */
221         if (ctdb_sys_have_ip(&pip->addr)) {
222                 return 0;
223         }
224
225         state = talloc(vnn, struct takeover_callback_state);
226         CTDB_NO_MEMORY(ctdb, state);
227
228         state->c = talloc_steal(ctdb, c);
229         state->addr = talloc(ctdb, ctdb_sock_addr);
230         CTDB_NO_MEMORY(ctdb, state->addr);
231
232         *state->addr = pip->addr;
233         state->vnn   = vnn;
234
235         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n", 
236                 ctdb_addr_to_str(&pip->addr),
237                 vnn->public_netmask_bits, 
238                 vnn->iface));
239
240         ret = ctdb_event_script_callback(ctdb, 
241                                          state, takeover_ip_callback, state,
242                                          false,
243                                          CTDB_EVENT_TAKE_IP,
244                                          "%s %s %u",
245                                          vnn->iface, 
246                                          talloc_strdup(state, ctdb_addr_to_str(&pip->addr)),
247                                          vnn->public_netmask_bits);
248
249         if (ret != 0) {
250                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
251                         ctdb_addr_to_str(&pip->addr),
252                         vnn->iface));
253                 talloc_free(state);
254                 return -1;
255         }
256
257         /* tell ctdb_control.c that we will be replying asynchronously */
258         *async_reply = true;
259
260         return 0;
261 }
262
263 /*
264   takeover an ip address old v4 style
265  */
266 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
267                                 struct ctdb_req_control *c,
268                                 TDB_DATA indata, 
269                                 bool *async_reply)
270 {
271         TDB_DATA data;
272         
273         data.dsize = sizeof(struct ctdb_public_ip);
274         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
275         CTDB_NO_MEMORY(ctdb, data.dptr);
276         
277         memcpy(data.dptr, indata.dptr, indata.dsize);
278         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
279 }
280
281 /*
282   kill any clients that are registered with a IP that is being released
283  */
284 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
285 {
286         struct ctdb_client_ip *ip;
287
288         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
289                 ctdb_addr_to_str(addr)));
290
291         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
292                 ctdb_sock_addr tmp_addr;
293
294                 tmp_addr = ip->addr;
295                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
296                         ip->client_id,
297                         ctdb_addr_to_str(&ip->addr)));
298
299                 if (ctdb_same_ip(&tmp_addr, addr)) {
300                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
301                                                                      ip->client_id, 
302                                                                      struct ctdb_client);
303                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
304                                 ip->client_id,
305                                 ctdb_addr_to_str(&ip->addr),
306                                 client->pid));
307
308                         if (client->pid != 0) {
309                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
310                                         (unsigned)client->pid,
311                                         ctdb_addr_to_str(addr),
312                                         ip->client_id));
313                                 kill(client->pid, SIGKILL);
314                         }
315                 }
316         }
317 }
318
319 /*
320   called when releaseip event finishes
321  */
322 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
323                                 void *private_data)
324 {
325         struct takeover_callback_state *state = 
326                 talloc_get_type(private_data, struct takeover_callback_state);
327         TDB_DATA data;
328
329         if (status == -ETIME) {
330                 ctdb_ban_self(ctdb);
331         }
332
333         /* send a message to all clients of this node telling them
334            that the cluster has been reconfigured and they should
335            release any sockets on this IP */
336         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
337         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
338         data.dsize = strlen((char *)data.dptr)+1;
339
340         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
341
342         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
343
344         /* kill clients that have registered with this IP */
345         release_kill_clients(ctdb, state->addr);
346         
347         /* the control succeeded */
348         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
349         talloc_free(state);
350 }
351
352 /*
353   release an ip address
354  */
355 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
356                                 struct ctdb_req_control *c,
357                                 TDB_DATA indata, 
358                                 bool *async_reply)
359 {
360         int ret;
361         struct takeover_callback_state *state;
362         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
363         struct ctdb_vnn *vnn;
364
365         /* update our vnn list */
366         vnn = find_public_ip_vnn(ctdb, &pip->addr);
367         if (vnn == NULL) {
368                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
369                         ctdb_addr_to_str(&pip->addr)));
370                 return 0;
371         }
372         vnn->pnn = pip->pnn;
373
374         /* stop any previous arps */
375         talloc_free(vnn->takeover_ctx);
376         vnn->takeover_ctx = NULL;
377
378         if (!ctdb_sys_have_ip(&pip->addr)) {
379                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
380                         ctdb_addr_to_str(&pip->addr),
381                         vnn->public_netmask_bits, 
382                         vnn->iface));
383                 return 0;
384         }
385
386         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%u\n", 
387                 ctdb_addr_to_str(&pip->addr),
388                 vnn->public_netmask_bits, 
389                 vnn->iface,
390                 pip->pnn));
391
392         state = talloc(ctdb, struct takeover_callback_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(state, c);
396         state->addr = talloc(state, ctdb_sock_addr);       
397         CTDB_NO_MEMORY(ctdb, state->addr);
398         *state->addr = pip->addr;
399         state->vnn   = vnn;
400
401         ret = ctdb_event_script_callback(ctdb, 
402                                          state, release_ip_callback, state,
403                                          false,
404                                          CTDB_EVENT_RELEASE_IP,
405                                          "%s %s %u",
406                                          vnn->iface, 
407                                          talloc_strdup(state, ctdb_addr_to_str(&pip->addr)),
408                                          vnn->public_netmask_bits);
409         if (ret != 0) {
410                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
411                         ctdb_addr_to_str(&pip->addr),
412                         vnn->iface));
413                 talloc_free(state);
414                 return -1;
415         }
416
417         /* tell the control that we will be reply asynchronously */
418         *async_reply = true;
419         return 0;
420 }
421
422 /*
423   release an ip address old v4 style
424  */
425 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
426                                 struct ctdb_req_control *c,
427                                 TDB_DATA indata, 
428                                 bool *async_reply)
429 {
430         TDB_DATA data;
431         
432         data.dsize = sizeof(struct ctdb_public_ip);
433         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
434         CTDB_NO_MEMORY(ctdb, data.dptr);
435         
436         memcpy(data.dptr, indata.dptr, indata.dsize);
437         return ctdb_control_release_ip(ctdb, c, data, async_reply);
438 }
439
440
441 static int ctdb_add_public_address(struct ctdb_context *ctdb, ctdb_sock_addr *addr, unsigned mask, const char *iface)
442 {
443         struct ctdb_vnn      *vnn;
444
445         /* Verify that we dont have an entry for this ip yet */
446         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
447                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
448                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
449                                 ctdb_addr_to_str(addr)));
450                         return -1;
451                 }               
452         }
453
454         /* create a new vnn structure for this ip address */
455         vnn = talloc_zero(ctdb, struct ctdb_vnn);
456         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
457         vnn->iface = talloc_strdup(vnn, iface);
458         CTDB_NO_MEMORY(ctdb, vnn->iface);
459         vnn->public_address      = *addr;
460         vnn->public_netmask_bits = mask;
461         vnn->pnn                 = -1;
462         
463         DLIST_ADD(ctdb->vnn, vnn);
464
465         return 0;
466 }
467
468
469 /*
470   setup the event script directory
471 */
472 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
473 {
474         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
475         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
476         return 0;
477 }
478
479 /*
480   setup the public address lists from a file
481 */
482 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
483 {
484         char **lines;
485         int nlines;
486         int i;
487
488         lines = file_lines_load(alist, &nlines, ctdb);
489         if (lines == NULL) {
490                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
491                 return -1;
492         }
493         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
494                 nlines--;
495         }
496
497         for (i=0;i<nlines;i++) {
498                 unsigned mask;
499                 ctdb_sock_addr addr;
500                 const char *addrstr;
501                 const char *iface;
502                 char *tok, *line;
503
504                 line = lines[i];
505                 while ((*line == ' ') || (*line == '\t')) {
506                         line++;
507                 }
508                 if (*line == '#') {
509                         continue;
510                 }
511                 if (strcmp(line, "") == 0) {
512                         continue;
513                 }
514                 tok = strtok(line, " \t");
515                 addrstr = tok;
516                 tok = strtok(NULL, " \t");
517                 if (tok == NULL) {
518                         if (NULL == ctdb->default_public_interface) {
519                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
520                                          i+1));
521                                 talloc_free(lines);
522                                 return -1;
523                         }
524                         iface = ctdb->default_public_interface;
525                 } else {
526                         iface = tok;
527                 }
528
529                 if (!addrstr || !parse_ip_mask(addrstr, iface, &addr, &mask)) {
530                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
531                         talloc_free(lines);
532                         return -1;
533                 }
534                 if (ctdb_add_public_address(ctdb, &addr, mask, iface)) {
535                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
536                         talloc_free(lines);
537                         return -1;
538                 }
539         }
540
541         talloc_free(lines);
542         return 0;
543 }
544
545
546
547
548 struct ctdb_public_ip_list {
549         struct ctdb_public_ip_list *next;
550         uint32_t pnn;
551         ctdb_sock_addr addr;
552 };
553
554
555 /* Given a physical node, return the number of
556    public addresses that is currently assigned to this node.
557 */
558 static int node_ip_coverage(struct ctdb_context *ctdb, 
559         int32_t pnn,
560         struct ctdb_public_ip_list *ips)
561 {
562         int num=0;
563
564         for (;ips;ips=ips->next) {
565                 if (ips->pnn == pnn) {
566                         num++;
567                 }
568         }
569         return num;
570 }
571
572
573 /* Check if this is a public ip known to the node, i.e. can that
574    node takeover this ip ?
575 */
576 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
577                 struct ctdb_public_ip_list *ip)
578 {
579         struct ctdb_all_public_ips *public_ips;
580         int i;
581
582         public_ips = ctdb->nodes[pnn]->public_ips;
583
584         if (public_ips == NULL) {
585                 return -1;
586         }
587
588         for (i=0;i<public_ips->num;i++) {
589                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
590                         /* yes, this node can serve this public ip */
591                         return 0;
592                 }
593         }
594
595         return -1;
596 }
597
598
599 /* search the node lists list for a node to takeover this ip.
600    pick the node that currently are serving the least number of ips
601    so that the ips get spread out evenly.
602 */
603 static int find_takeover_node(struct ctdb_context *ctdb, 
604                 struct ctdb_node_map *nodemap, uint32_t mask, 
605                 struct ctdb_public_ip_list *ip,
606                 struct ctdb_public_ip_list *all_ips)
607 {
608         int pnn, min=0, num;
609         int i;
610
611         pnn    = -1;
612         for (i=0;i<nodemap->num;i++) {
613                 if (nodemap->nodes[i].flags & mask) {
614                         /* This node is not healty and can not be used to serve
615                            a public address 
616                         */
617                         continue;
618                 }
619
620                 /* verify that this node can serve this ip */
621                 if (can_node_serve_ip(ctdb, i, ip)) {
622                         /* no it couldnt   so skip to the next node */
623                         continue;
624                 }
625
626                 num = node_ip_coverage(ctdb, i, all_ips);
627                 /* was this the first node we checked ? */
628                 if (pnn == -1) {
629                         pnn = i;
630                         min  = num;
631                 } else {
632                         if (num < min) {
633                                 pnn = i;
634                                 min  = num;
635                         }
636                 }
637         }       
638         if (pnn == -1) {
639                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
640                         ctdb_addr_to_str(&ip->addr)));
641
642                 return -1;
643         }
644
645         ip->pnn = pnn;
646         return 0;
647 }
648
649 #define IP_KEYLEN       4
650 static uint32_t *ip_key(ctdb_sock_addr *ip)
651 {
652         static uint32_t key[IP_KEYLEN];
653
654         bzero(key, sizeof(key));
655
656         switch (ip->sa.sa_family) {
657         case AF_INET:
658                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
659                 break;
660         case AF_INET6:
661                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
662                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
663                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
664                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
665                 break;
666         default:
667                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
668                 return key;
669         }
670
671         return key;
672 }
673
/*
  insert callback for the ip tree: always hands back the new entry
  (parm); the second argument is ignored
 */
static void *add_ip_callback(void *parm, void *data)
{
	void *chosen = parm;

	(void)data;
	return chosen;
}
678
679 void getips_count_callback(void *param, void *data)
680 {
681         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
682         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
683
684         new_ip->next = *ip_list;
685         *ip_list     = new_ip;
686 }
687
688 static struct ctdb_public_ip_list *
689 create_merged_ip_list(struct ctdb_context *ctdb)
690 {
691         int i, j;
692         struct ctdb_public_ip_list *ip_list;
693         struct ctdb_all_public_ips *public_ips;
694
695         if (ctdb->ip_tree != NULL) {
696                 talloc_free(ctdb->ip_tree);
697                 ctdb->ip_tree = NULL;
698         }
699         ctdb->ip_tree = trbt_create(ctdb, 0);
700
701         for (i=0;i<ctdb->num_nodes;i++) {
702                 public_ips = ctdb->nodes[i]->public_ips;
703
704                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
705                         continue;
706                 }
707
708                 /* there were no public ips for this node */
709                 if (public_ips == NULL) {
710                         continue;
711                 }               
712
713                 for (j=0;j<public_ips->num;j++) {
714                         struct ctdb_public_ip_list *tmp_ip; 
715
716                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
717                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
718                         tmp_ip->pnn  = public_ips->ips[j].pnn;
719                         tmp_ip->addr = public_ips->ips[j].addr;
720                         tmp_ip->next = NULL;
721
722                         trbt_insertarray32_callback(ctdb->ip_tree,
723                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
724                                 add_ip_callback,
725                                 tmp_ip);
726                 }
727         }
728
729         ip_list = NULL;
730         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
731
732         return ip_list;
733 }
734
735 /*
736   make any IP alias changes for public addresses that are necessary 
737  */
738 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
739 {
740         int i, num_healthy, retries, num_ips;
741         struct ctdb_public_ip ip;
742         struct ctdb_public_ipv4 ipv4;
743         uint32_t mask, *nodes;
744         struct ctdb_public_ip_list *all_ips, *tmp_ip;
745         int maxnode, maxnum=0, minnode, minnum=0, num;
746         TDB_DATA data;
747         struct timeval timeout;
748         struct client_async_data *async_data;
749         struct ctdb_client_control_state *state;
750         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
751
752         ZERO_STRUCT(ip);
753
754         /* Count how many completely healthy nodes we have */
755         num_healthy = 0;
756         for (i=0;i<nodemap->num;i++) {
757                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
758                         num_healthy++;
759                 }
760         }
761
762         if (num_healthy > 0) {
763                 /* We have healthy nodes, so only consider them for 
764                    serving public addresses
765                 */
766                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
767         } else {
768                 /* We didnt have any completely healthy nodes so
769                    use "disabled" nodes as a fallback
770                 */
771                 mask = NODE_FLAGS_INACTIVE;
772         }
773
774         /* since nodes only know about those public addresses that
775            can be served by that particular node, no single node has
776            a full list of all public addresses that exist in the cluster.
777            Walk over all node structures and create a merged list of
778            all public addresses that exist in the cluster.
779
780            keep the tree of ips around as ctdb->ip_tree
781         */
782         all_ips = create_merged_ip_list(ctdb);
783
784         /* Count how many ips we have */
785         num_ips = 0;
786         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
787                 num_ips++;
788         }
789
790         /* If we want deterministic ip allocations, i.e. that the ip addresses
791            will always be allocated the same way for a specific set of
792            available/unavailable nodes.
793         */
794         if (1 == ctdb->tunable.deterministic_public_ips) {              
795                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
796                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
797                         tmp_ip->pnn = i%nodemap->num;
798                 }
799         }
800
801
802         /* mark all public addresses with a masked node as being served by
803            node -1
804         */
805         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
806                 if (tmp_ip->pnn == -1) {
807                         continue;
808                 }
809                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
810                         tmp_ip->pnn = -1;
811                 }
812         }
813
814         /* verify that the assigned nodes can serve that public ip
815            and set it to -1 if not
816         */
817         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
818                 if (tmp_ip->pnn == -1) {
819                         continue;
820                 }
821                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
822                         /* this node can not serve this ip. */
823                         tmp_ip->pnn = -1;
824                 }
825         }
826
827
828         /* now we must redistribute all public addresses with takeover node
829            -1 among the nodes available
830         */
831         retries = 0;
832 try_again:
833         /* loop over all ip's and find a physical node to cover for 
834            each unassigned ip.
835         */
836         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
837                 if (tmp_ip->pnn == -1) {
838                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
839                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
840                                         ctdb_addr_to_str(&tmp_ip->addr)));
841                         }
842                 }
843         }
844
845         /* If we don't want ips to fail back after a node becomes healthy
846            again, we won't even try to reallocate the ip addresses so that
847            they are evenly spread out.
848            This can NOT be used at the same time as DeterministicIPs !
849         */
850         if (1 == ctdb->tunable.no_ip_failback) {
851                 if (1 == ctdb->tunable.deterministic_public_ips) {
852                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
853                 }
854                 goto finished;
855         }
856
857
858         /* now, try to make sure the ip addresses are evenly distributed
859            across the nodes.
860            for each ip address, loop over all nodes that can serve this
861            ip and make sure that the difference between the node
862            serving the most and the node serving the least ips is not greater
863            than 1.
864         */
865         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
866                 if (tmp_ip->pnn == -1) {
867                         continue;
868                 }
869
870                 /* Get the highest and lowest number of ips's served by any 
871                    valid node which can serve this ip.
872                 */
873                 maxnode = -1;
874                 minnode = -1;
875                 for (i=0;i<nodemap->num;i++) {
876                         if (nodemap->nodes[i].flags & mask) {
877                                 continue;
878                         }
879
880                         /* only check nodes that can actually serve this ip */
881                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
882                                 /* no it couldnt   so skip to the next node */
883                                 continue;
884                         }
885
886                         num = node_ip_coverage(ctdb, i, all_ips);
887                         if (maxnode == -1) {
888                                 maxnode = i;
889                                 maxnum  = num;
890                         } else {
891                                 if (num > maxnum) {
892                                         maxnode = i;
893                                         maxnum  = num;
894                                 }
895                         }
896                         if (minnode == -1) {
897                                 minnode = i;
898                                 minnum  = num;
899                         } else {
900                                 if (num < minnum) {
901                                         minnode = i;
902                                         minnum  = num;
903                                 }
904                         }
905                 }
906                 if (maxnode == -1) {
907                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
908                                 ctdb_addr_to_str(&tmp_ip->addr)));
909
910                         continue;
911                 }
912
913                 /* If we want deterministic IPs then dont try to reallocate 
914                    them to spread out the load.
915                 */
916                 if (1 == ctdb->tunable.deterministic_public_ips) {
917                         continue;
918                 }
919
920                 /* if the spread between the smallest and largest coverage by
921                    a node is >=2 we steal one of the ips from the node with
922                    most coverage to even things out a bit.
923                    try to do this a limited number of times since we dont
924                    want to spend too much time balancing the ip coverage.
925                 */
926                 if ( (maxnum > minnum+1)
927                   && (retries < (num_ips + 5)) ){
928                         struct ctdb_public_ip_list *tmp;
929
930                         /* mark one of maxnode's vnn's as unassigned and try
931                            again
932                         */
933                         for (tmp=all_ips;tmp;tmp=tmp->next) {
934                                 if (tmp->pnn == maxnode) {
935                                         tmp->pnn = -1;
936                                         retries++;
937                                         goto try_again;
938                                 }
939                         }
940                 }
941         }
942
943
944         /* finished distributing the public addresses, now just send the 
945            info out to the nodes
946         */
947 finished:
948
949         /* at this point ->pnn is the node which will own each IP
950            or -1 if there is no node that can cover this ip
951         */
952
953         /* now tell all nodes to delete any alias that they should not
954            have.  This will be a NOOP on nodes that don't currently
955            hold the given alias */
956         async_data = talloc_zero(tmp_ctx, struct client_async_data);
957         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
958
959         for (i=0;i<nodemap->num;i++) {
960                 /* don't talk to unconnected nodes, but do talk to banned nodes */
961                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
962                         continue;
963                 }
964
965                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
966                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
967                                 /* This node should be serving this
968                                    vnn so dont tell it to release the ip
969                                 */
970                                 continue;
971                         }
972                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
973                                 ipv4.pnn = tmp_ip->pnn;
974                                 ipv4.sin = tmp_ip->addr.ip;
975
976                                 timeout = TAKEOVER_TIMEOUT();
977                                 data.dsize = sizeof(ipv4);
978                                 data.dptr  = (uint8_t *)&ipv4;
979                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
980                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
981                                                 data, async_data,
982                                                 &timeout, NULL);
983                         } else {
984                                 ip.pnn  = tmp_ip->pnn;
985                                 ip.addr = tmp_ip->addr;
986
987                                 timeout = TAKEOVER_TIMEOUT();
988                                 data.dsize = sizeof(ip);
989                                 data.dptr  = (uint8_t *)&ip;
990                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
991                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
992                                                 data, async_data,
993                                                 &timeout, NULL);
994                         }
995
996                         if (state == NULL) {
997                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
998                                 talloc_free(tmp_ctx);
999                                 return -1;
1000                         }
1001                 
1002                         ctdb_client_async_add(async_data, state);
1003                 }
1004         }
1005         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1006                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1007                 talloc_free(tmp_ctx);
1008                 return -1;
1009         }
1010         talloc_free(async_data);
1011
1012
1013         /* tell all nodes to get their own IPs */
1014         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1015         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1016         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1017                 if (tmp_ip->pnn == -1) {
1018                         /* this IP won't be taken over */
1019                         continue;
1020                 }
1021
1022                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1023                         ipv4.pnn = tmp_ip->pnn;
1024                         ipv4.sin = tmp_ip->addr.ip;
1025
1026                         timeout = TAKEOVER_TIMEOUT();
1027                         data.dsize = sizeof(ipv4);
1028                         data.dptr  = (uint8_t *)&ipv4;
1029                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1030                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1031                                         data, async_data,
1032                                         &timeout, NULL);
1033                 } else {
1034                         ip.pnn  = tmp_ip->pnn;
1035                         ip.addr = tmp_ip->addr;
1036
1037                         timeout = TAKEOVER_TIMEOUT();
1038                         data.dsize = sizeof(ip);
1039                         data.dptr  = (uint8_t *)&ip;
1040                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1041                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1042                                         data, async_data,
1043                                         &timeout, NULL);
1044                 }
1045                 if (state == NULL) {
1046                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1047                         talloc_free(tmp_ctx);
1048                         return -1;
1049                 }
1050                 
1051                 ctdb_client_async_add(async_data, state);
1052         }
1053         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1054                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1055                 talloc_free(tmp_ctx);
1056                 return -1;
1057         }
1058
1059
1060         /* tell all nodes to update natgw */
1061         /* send the flags update natgw on all connected nodes */
1062         data.dptr  = discard_const("ipreallocated");
1063         data.dsize = strlen((char *)data.dptr) + 1; 
1064         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1065         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
1066                                       nodes, 0, TAKEOVER_TIMEOUT(),
1067                                       false, data,
1068                                       NULL, NULL,
1069                                       NULL) != 0) {
1070                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
1071         }
1072
1073         talloc_free(tmp_ctx);
1074         return 0;
1075 }
1076
1077
1078 /*
1079   destroy a ctdb_client_ip structure
1080  */
1081 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1082 {
1083         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1084                 ctdb_addr_to_str(&ip->addr),
1085                 ntohs(ip->addr.ip.sin_port),
1086                 ip->client_id));
1087
1088         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1089         return 0;
1090 }
1091
1092 /*
1093   called by a client to inform us of a TCP connection that it is managing
1094   that should tickled with an ACK when IP takeover is done
1095   we handle both the old ipv4 style of packets as well as the new ipv4/6
1096   pdus.
1097  */
1098 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1099                                 TDB_DATA indata)
1100 {
1101         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1102         struct ctdb_control_tcp *old_addr = NULL;
1103         struct ctdb_control_tcp_addr new_addr;
1104         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1105         struct ctdb_tcp_list *tcp;
1106         struct ctdb_control_tcp_vnn t;
1107         int ret;
1108         TDB_DATA data;
1109         struct ctdb_client_ip *ip;
1110         struct ctdb_vnn *vnn;
1111         ctdb_sock_addr addr;
1112
1113         switch (indata.dsize) {
1114         case sizeof(struct ctdb_control_tcp):
1115                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1116                 ZERO_STRUCT(new_addr);
1117                 tcp_sock = &new_addr;
1118                 tcp_sock->src.ip  = old_addr->src;
1119                 tcp_sock->dest.ip = old_addr->dest;
1120                 break;
1121         case sizeof(struct ctdb_control_tcp_addr):
1122                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1123                 break;
1124         default:
1125                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1126                                  "to ctdb_control_tcp_client. size was %d but "
1127                                  "only allowed sizes are %lu and %lu\n",
1128                                  (int)indata.dsize,
1129                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1130                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1131                 return -1;
1132         }
1133
1134         addr = tcp_sock->src;
1135         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1136         addr = tcp_sock->dest;
1137         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1138
1139         ZERO_STRUCT(addr);
1140         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1141         vnn = find_public_ip_vnn(ctdb, &addr);
1142         if (vnn == NULL) {
1143                 switch (addr.sa.sa_family) {
1144                 case AF_INET:
1145                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1146                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1147                                         ctdb_addr_to_str(&addr)));
1148                         }
1149                         break;
1150                 case AF_INET6:
1151                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1152                                 ctdb_addr_to_str(&addr)));
1153                         break;
1154                 default:
1155                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1156                 }
1157
1158                 return 0;
1159         }
1160
1161         if (vnn->pnn != ctdb->pnn) {
1162                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1163                         ctdb_addr_to_str(&addr),
1164                         client_id, client->pid));
1165                 /* failing this call will tell smbd to die */
1166                 return -1;
1167         }
1168
1169         ip = talloc(client, struct ctdb_client_ip);
1170         CTDB_NO_MEMORY(ctdb, ip);
1171
1172         ip->ctdb      = ctdb;
1173         ip->addr      = addr;
1174         ip->client_id = client_id;
1175         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1176         DLIST_ADD(ctdb->client_ip_list, ip);
1177
1178         tcp = talloc(client, struct ctdb_tcp_list);
1179         CTDB_NO_MEMORY(ctdb, tcp);
1180
1181         tcp->connection.src_addr = tcp_sock->src;
1182         tcp->connection.dst_addr = tcp_sock->dest;
1183
1184         DLIST_ADD(client->tcp_list, tcp);
1185
1186         t.src  = tcp_sock->src;
1187         t.dest = tcp_sock->dest;
1188
1189         data.dptr = (uint8_t *)&t;
1190         data.dsize = sizeof(t);
1191
1192         switch (addr.sa.sa_family) {
1193         case AF_INET:
1194                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1195                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1196                         ctdb_addr_to_str(&tcp_sock->src),
1197                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1198                 break;
1199         case AF_INET6:
1200                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1201                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1202                         ctdb_addr_to_str(&tcp_sock->src),
1203                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1204                 break;
1205         default:
1206                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1207         }
1208
1209
1210         /* tell all nodes about this tcp connection */
1211         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1212                                        CTDB_CONTROL_TCP_ADD,
1213                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1214         if (ret != 0) {
1215                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1216                 return -1;
1217         }
1218
1219         return 0;
1220 }
1221
1222 /*
1223   find a tcp address on a list
1224  */
1225 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1226                                            struct ctdb_tcp_connection *tcp)
1227 {
1228         int i;
1229
1230         if (array == NULL) {
1231                 return NULL;
1232         }
1233
1234         for (i=0;i<array->num;i++) {
1235                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1236                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1237                         return &array->connections[i];
1238                 }
1239         }
1240         return NULL;
1241 }
1242
1243 /*
1244   called by a daemon to inform us of a TCP connection that one of its
1245   clients managing that should tickled with an ACK when IP takeover is
1246   done
1247  */
1248 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
1249 {
1250         struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
1251         struct ctdb_tcp_array *tcparray;
1252         struct ctdb_tcp_connection tcp;
1253         struct ctdb_vnn *vnn;
1254
1255         vnn = find_public_ip_vnn(ctdb, &p->dest);
1256         if (vnn == NULL) {
1257                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1258                         ctdb_addr_to_str(&p->dest)));
1259
1260                 return -1;
1261         }
1262
1263
1264         tcparray = vnn->tcp_array;
1265
1266         /* If this is the first tickle */
1267         if (tcparray == NULL) {
1268                 tcparray = talloc_size(ctdb->nodes, 
1269                         offsetof(struct ctdb_tcp_array, connections) +
1270                         sizeof(struct ctdb_tcp_connection) * 1);
1271                 CTDB_NO_MEMORY(ctdb, tcparray);
1272                 vnn->tcp_array = tcparray;
1273
1274                 tcparray->num = 0;
1275                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1276                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1277
1278                 tcparray->connections[tcparray->num].src_addr = p->src;
1279                 tcparray->connections[tcparray->num].dst_addr = p->dest;
1280                 tcparray->num++;
1281                 return 0;
1282         }
1283
1284
1285         /* Do we already have this tickle ?*/
1286         tcp.src_addr = p->src;
1287         tcp.dst_addr = p->dest;
1288         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1289                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1290                         ctdb_addr_to_str(&tcp.dst_addr),
1291                         ntohs(tcp.dst_addr.ip.sin_port),
1292                         vnn->pnn));
1293                 return 0;
1294         }
1295
1296         /* A new tickle, we must add it to the array */
1297         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1298                                         struct ctdb_tcp_connection,
1299                                         tcparray->num+1);
1300         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1301
1302         vnn->tcp_array = tcparray;
1303         tcparray->connections[tcparray->num].src_addr = p->src;
1304         tcparray->connections[tcparray->num].dst_addr = p->dest;
1305         tcparray->num++;
1306                                 
1307         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1308                 ctdb_addr_to_str(&tcp.dst_addr),
1309                 ntohs(tcp.dst_addr.ip.sin_port),
1310                 vnn->pnn));
1311
1312         return 0;
1313 }
1314
1315
1316 /*
1317   called by a daemon to inform us of a TCP connection that one of its
1318   clients managing that should tickled with an ACK when IP takeover is
1319   done
1320  */
1321 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1322 {
1323         struct ctdb_tcp_connection *tcpp;
1324         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1325
1326         if (vnn == NULL) {
1327                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1328                         ctdb_addr_to_str(&conn->dst_addr)));
1329                 return;
1330         }
1331
1332         /* if the array is empty we cant remove it
1333            and we dont need to do anything
1334          */
1335         if (vnn->tcp_array == NULL) {
1336                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1337                         ctdb_addr_to_str(&conn->dst_addr),
1338                         ntohs(conn->dst_addr.ip.sin_port)));
1339                 return;
1340         }
1341
1342
1343         /* See if we know this connection
1344            if we dont know this connection  then we dont need to do anything
1345          */
1346         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1347         if (tcpp == NULL) {
1348                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1349                         ctdb_addr_to_str(&conn->dst_addr),
1350                         ntohs(conn->dst_addr.ip.sin_port)));
1351                 return;
1352         }
1353
1354
1355         /* We need to remove this entry from the array.
1356            Instead of allocating a new array and copying data to it
1357            we cheat and just copy the last entry in the existing array
1358            to the entry that is to be removed and just shring the 
1359            ->num field
1360          */
1361         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1362         vnn->tcp_array->num--;
1363
1364         /* If we deleted the last entry we also need to remove the entire array
1365          */
1366         if (vnn->tcp_array->num == 0) {
1367                 talloc_free(vnn->tcp_array);
1368                 vnn->tcp_array = NULL;
1369         }               
1370
1371         vnn->tcp_update_needed = true;
1372
1373         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1374                 ctdb_addr_to_str(&conn->src_addr),
1375                 ntohs(conn->src_addr.ip.sin_port)));
1376 }
1377
1378
/*
  called when a daemon restarts.
  the intent is to send all tickles for the public addresses we are
  serving to the new node right away; that is not implemented yet, so
  this is currently a no-op that reports success.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        int32_t status = 0;

        /* XXX here we should send all tickles we are serving to the new node */
        return status;
}
1388
1389
1390 /*
1391   called when a client structure goes away - hook to remove
1392   elements from the tcp_list in all daemons
1393  */
1394 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1395 {
1396         while (client->tcp_list) {
1397                 struct ctdb_tcp_list *tcp = client->tcp_list;
1398                 DLIST_REMOVE(client->tcp_list, tcp);
1399                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1400         }
1401 }
1402
1403
1404 /*
1405   release all IPs on shutdown
1406  */
1407 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1408 {
1409         struct ctdb_vnn *vnn;
1410
1411         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1412                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1413                         continue;
1414                 }
1415                 if (vnn->pnn == ctdb->pnn) {
1416                         vnn->pnn = -1;
1417                 }
1418                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1419                                   vnn->iface, 
1420                                   talloc_strdup(ctdb, ctdb_addr_to_str(&vnn->public_address)),
1421                                   vnn->public_netmask_bits);
1422                 release_kill_clients(ctdb, &vnn->public_address);
1423         }
1424 }
1425
1426
1427 /*
1428   get list of public IPs
1429  */
1430 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1431                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1432 {
1433         int i, num, len;
1434         struct ctdb_all_public_ips *ips;
1435         struct ctdb_vnn *vnn;
1436
1437         /* count how many public ip structures we have */
1438         num = 0;
1439         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1440                 num++;
1441         }
1442
1443         len = offsetof(struct ctdb_all_public_ips, ips) + 
1444                 num*sizeof(struct ctdb_public_ip);
1445         ips = talloc_zero_size(outdata, len);
1446         CTDB_NO_MEMORY(ctdb, ips);
1447
1448         outdata->dsize = len;
1449         outdata->dptr  = (uint8_t *)ips;
1450
1451         ips->num = num;
1452         i = 0;
1453         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1454                 ips->ips[i].pnn  = vnn->pnn;
1455                 ips->ips[i].addr = vnn->public_address;
1456                 i++;
1457         }
1458
1459         return 0;
1460 }
1461
1462
1463 /*
1464   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
1465  */
1466 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
1467                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1468 {
1469         int i, num, len;
1470         struct ctdb_all_public_ipsv4 *ips;
1471         struct ctdb_vnn *vnn;
1472
1473         /* count how many public ip structures we have */
1474         num = 0;
1475         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1476                 if (vnn->public_address.sa.sa_family != AF_INET) {
1477                         continue;
1478                 }
1479                 num++;
1480         }
1481
1482         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
1483                 num*sizeof(struct ctdb_public_ipv4);
1484         ips = talloc_zero_size(outdata, len);
1485         CTDB_NO_MEMORY(ctdb, ips);
1486
1487         outdata->dsize = len;
1488         outdata->dptr  = (uint8_t *)ips;
1489
1490         ips->num = num;
1491         i = 0;
1492         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1493                 if (vnn->public_address.sa.sa_family != AF_INET) {
1494                         continue;
1495                 }
1496                 ips->ips[i].pnn = vnn->pnn;
1497                 ips->ips[i].sin = vnn->public_address.ip;
1498                 i++;
1499         }
1500
1501         return 0;
1502 }
1503
1504
1505 /* 
1506    structure containing the listening socket and the list of tcp connections
1507    that the ctdb daemon is to kill
1508 */
1509 struct ctdb_kill_tcp {
1510         struct ctdb_vnn *vnn;
1511         struct ctdb_context *ctdb;
1512         int capture_fd;
1513         struct fd_event *fde;
1514         trbt_tree_t *connections;
1515         void *private_data;
1516 };
1517
1518 /*
1519   a tcp connection that is to be killed
1520  */
1521 struct ctdb_killtcp_con {
1522         ctdb_sock_addr src_addr;
1523         ctdb_sock_addr dst_addr;
1524         int count;
1525         struct ctdb_kill_tcp *killtcp;
1526 };
1527
1528 /* this function is used to create a key to represent this socketpair
1529    in the killtcp tree.
1530    this key is used to insert and lookup matching socketpairs that are
1531    to be tickled and RST
1532 */
1533 #define KILLTCP_KEYLEN  10
1534 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
1535 {
1536         static uint32_t key[KILLTCP_KEYLEN];
1537
1538         bzero(key, sizeof(key));
1539
1540         if (src->sa.sa_family != dst->sa.sa_family) {
1541                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
1542                 return key;
1543         }
1544         
1545         switch (src->sa.sa_family) {
1546         case AF_INET:
1547                 key[0]  = dst->ip.sin_addr.s_addr;
1548                 key[1]  = src->ip.sin_addr.s_addr;
1549                 key[2]  = dst->ip.sin_port;
1550                 key[3]  = src->ip.sin_port;
1551                 break;
1552         case AF_INET6:
1553                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
1554                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
1555                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
1556                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
1557                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
1558                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
1559                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
1560                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
1561                 key[8]  = dst->ip6.sin6_port;
1562                 key[9]  = src->ip6.sin6_port;
1563                 break;
1564         default:
1565                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
1566                 return key;
1567         }
1568
1569         return key;
1570 }
1571
1572 /*
1573   called when we get a read event on the raw socket
1574  */
1575 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
1576                                 uint16_t flags, void *private_data)
1577 {
1578         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1579         struct ctdb_killtcp_con *con;
1580         ctdb_sock_addr src, dst;
1581         uint32_t ack_seq, seq;
1582
1583         if (!(flags & EVENT_FD_READ)) {
1584                 return;
1585         }
1586
1587         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1588                                 killtcp->private_data,
1589                                 &src, &dst,
1590                                 &ack_seq, &seq) != 0) {
1591                 /* probably a non-tcp ACK packet */
1592                 return;
1593         }
1594
1595         /* check if we have this guy in our list of connections
1596            to kill
1597         */
1598         con = trbt_lookuparray32(killtcp->connections, 
1599                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1600         if (con == NULL) {
1601                 /* no this was some other packet we can just ignore */
1602                 return;
1603         }
1604
1605         /* This one has been tickled !
1606            now reset him and remove him from the list.
1607          */
1608         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
1609                 ntohs(con->dst_addr.ip.sin_port),
1610                 ctdb_addr_to_str(&con->src_addr),
1611                 ntohs(con->src_addr.ip.sin_port)));
1612
1613         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
1614         talloc_free(con);
1615 }
1616
1617
1618 /* when traversing the list of all tcp connections to send tickle acks to
1619    (so that we can capture the ack coming back and kill the connection
1620     by a RST)
1621    this callback is called for each connection we are currently trying to kill
1622 */
1623 static void tickle_connection_traverse(void *param, void *data)
1624 {
1625         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1626
1627         /* have tried too many times, just give up */
1628         if (con->count >= 5) {
1629                 /* can't delete in traverse: reparent to delete_cons */
1630                 talloc_steal(param, con);
1631                 return;
1632         }
1633
1634         /* othervise, try tickling it again */
1635         con->count++;
1636         ctdb_sys_send_tcp(
1637                 (ctdb_sock_addr *)&con->dst_addr,
1638                 (ctdb_sock_addr *)&con->src_addr,
1639                 0, 0, 0);
1640 }
1641
1642
1643 /* 
1644    called every second until all sentenced connections have been reset
1645  */
1646 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
1647                                               struct timeval t, void *private_data)
1648 {
1649         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1650         void *delete_cons = talloc_new(NULL);
1651
1652         /* loop over all connections sending tickle ACKs */
1653         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
1654
1655         /* now we've finished traverse, it's safe to do deletion. */
1656         talloc_free(delete_cons);
1657
1658         /* If there are no more connections to kill we can remove the
1659            entire killtcp structure
1660          */
1661         if ( (killtcp->connections == NULL) || 
1662              (killtcp->connections->root == NULL) ) {
1663                 talloc_free(killtcp);
1664                 return;
1665         }
1666
1667         /* try tickling them again in a seconds time
1668          */
1669         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
1670                         ctdb_tickle_sentenced_connections, killtcp);
1671 }
1672
1673 /*
1674   destroy the killtcp structure
1675  */
1676 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
1677 {
1678         if (killtcp->vnn) {
1679                 killtcp->vnn->killtcp = NULL;
1680         }
1681         return 0;
1682 }
1683
1684
/* Insert callback for the tree of connections to kill.

   Nothing fancy: the new connection structure ('parm') unconditionally
   replaces whatever was stored under the key before ('data').

   The old entry is deliberately not freed here; it has been talloc_stolen
   by the same tree node and goes away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
1696
1697 /*
1698   add a tcp socket to the list of connections we want to RST
1699  */
1700 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
1701                                        ctdb_sock_addr *s,
1702                                        ctdb_sock_addr *d)
1703 {
1704         ctdb_sock_addr src, dst;
1705         struct ctdb_kill_tcp *killtcp;
1706         struct ctdb_killtcp_con *con;
1707         struct ctdb_vnn *vnn;
1708
1709         ctdb_canonicalize_ip(s, &src);
1710         ctdb_canonicalize_ip(d, &dst);
1711
1712         vnn = find_public_ip_vnn(ctdb, &dst);
1713         if (vnn == NULL) {
1714                 vnn = find_public_ip_vnn(ctdb, &src);
1715         }
1716         if (vnn == NULL) {
1717                 /* if it is not a public ip   it could be our 'single ip' */
1718                 if (ctdb->single_ip_vnn) {
1719                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
1720                                 vnn = ctdb->single_ip_vnn;
1721                         }
1722                 }
1723         }
1724         if (vnn == NULL) {
1725                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
1726                 return -1;
1727         }
1728
1729         killtcp = vnn->killtcp;
1730         
1731         /* If this is the first connection to kill we must allocate
1732            a new structure
1733          */
1734         if (killtcp == NULL) {
1735                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
1736                 CTDB_NO_MEMORY(ctdb, killtcp);
1737
1738                 killtcp->vnn         = vnn;
1739                 killtcp->ctdb        = ctdb;
1740                 killtcp->capture_fd  = -1;
1741                 killtcp->connections = trbt_create(killtcp, 0);
1742
1743                 vnn->killtcp         = killtcp;
1744                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
1745         }
1746
1747
1748
1749         /* create a structure that describes this connection we want to
1750            RST and store it in killtcp->connections
1751         */
1752         con = talloc(killtcp, struct ctdb_killtcp_con);
1753         CTDB_NO_MEMORY(ctdb, con);
1754         con->src_addr = src;
1755         con->dst_addr = dst;
1756         con->count    = 0;
1757         con->killtcp  = killtcp;
1758
1759
1760         trbt_insertarray32_callback(killtcp->connections,
1761                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
1762                         add_killtcp_callback, con);
1763
1764         /* 
1765            If we dont have a socket to listen on yet we must create it
1766          */
1767         if (killtcp->capture_fd == -1) {
1768                 killtcp->capture_fd = ctdb_sys_open_capture_socket(vnn->iface, &killtcp->private_data);
1769                 if (killtcp->capture_fd == -1) {
1770                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing socket for killtcp\n"));
1771                         goto failed;
1772                 }
1773         }
1774
1775
1776         if (killtcp->fde == NULL) {
1777                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
1778                                             EVENT_FD_READ | EVENT_FD_AUTOCLOSE, 
1779                                             capture_tcp_handler, killtcp);
1780
1781                 /* We also need to set up some events to tickle all these connections
1782                    until they are all reset
1783                 */
1784                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
1785                                 ctdb_tickle_sentenced_connections, killtcp);
1786         }
1787
1788         /* tickle him once now */
1789         ctdb_sys_send_tcp(
1790                 &con->dst_addr,
1791                 &con->src_addr,
1792                 0, 0, 0);
1793
1794         return 0;
1795
1796 failed:
1797         talloc_free(vnn->killtcp);
1798         vnn->killtcp = NULL;
1799         return -1;
1800 }
1801
1802 /*
1803   kill a TCP connection.
1804  */
1805 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
1806 {
1807         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
1808
1809         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
1810 }
1811
1812 /*
1813   called by a daemon to inform us of the entire list of TCP tickles for
1814   a particular public address.
1815   this control should only be sent by the node that is currently serving
1816   that public address.
1817  */
1818 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
1819 {
1820         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
1821         struct ctdb_tcp_array *tcparray;
1822         struct ctdb_vnn *vnn;
1823
1824         /* We must at least have tickles.num or else we cant verify the size
1825            of the received data blob
1826          */
1827         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
1828                                         tickles.connections)) {
1829                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
1830                 return -1;
1831         }
1832
1833         /* verify that the size of data matches what we expect */
1834         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
1835                                 tickles.connections)
1836                          + sizeof(struct ctdb_tcp_connection)
1837                                  * list->tickles.num) {
1838                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
1839                 return -1;
1840         }       
1841
1842         vnn = find_public_ip_vnn(ctdb, &list->addr);
1843         if (vnn == NULL) {
1844                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
1845                         ctdb_addr_to_str(&list->addr)));
1846
1847                 return 1;
1848         }
1849
1850         /* remove any old ticklelist we might have */
1851         talloc_free(vnn->tcp_array);
1852         vnn->tcp_array = NULL;
1853
1854         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
1855         CTDB_NO_MEMORY(ctdb, tcparray);
1856
1857         tcparray->num = list->tickles.num;
1858
1859         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
1860         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1861
1862         memcpy(tcparray->connections, &list->tickles.connections[0], 
1863                sizeof(struct ctdb_tcp_connection)*tcparray->num);
1864
1865         /* We now have a new fresh tickle list array for this vnn */
1866         vnn->tcp_array = talloc_steal(vnn, tcparray);
1867         
1868         return 0;
1869 }
1870
1871 /*
1872   called to return the full list of tickles for the puclic address associated 
1873   with the provided vnn
1874  */
1875 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1876 {
1877         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
1878         struct ctdb_control_tcp_tickle_list *list;
1879         struct ctdb_tcp_array *tcparray;
1880         int num;
1881         struct ctdb_vnn *vnn;
1882
1883         vnn = find_public_ip_vnn(ctdb, addr);
1884         if (vnn == NULL) {
1885                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
1886                         ctdb_addr_to_str(addr)));
1887
1888                 return 1;
1889         }
1890
1891         tcparray = vnn->tcp_array;
1892         if (tcparray) {
1893                 num = tcparray->num;
1894         } else {
1895                 num = 0;
1896         }
1897
1898         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
1899                                 tickles.connections)
1900                         + sizeof(struct ctdb_tcp_connection) * num;
1901
1902         outdata->dptr  = talloc_size(outdata, outdata->dsize);
1903         CTDB_NO_MEMORY(ctdb, outdata->dptr);
1904         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
1905
1906         list->addr = *addr;
1907         list->tickles.num = num;
1908         if (num) {
1909                 memcpy(&list->tickles.connections[0], tcparray->connections, 
1910                         sizeof(struct ctdb_tcp_connection) * num);
1911         }
1912
1913         return 0;
1914 }
1915
1916
1917 /*
1918   set the list of all tcp tickles for a public address
1919  */
1920 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
1921                               struct timeval timeout, uint32_t destnode, 
1922                               ctdb_sock_addr *addr,
1923                               struct ctdb_tcp_array *tcparray)
1924 {
1925         int ret, num;
1926         TDB_DATA data;
1927         struct ctdb_control_tcp_tickle_list *list;
1928
1929         if (tcparray) {
1930                 num = tcparray->num;
1931         } else {
1932                 num = 0;
1933         }
1934
1935         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
1936                                 tickles.connections) +
1937                         sizeof(struct ctdb_tcp_connection) * num;
1938         data.dptr = talloc_size(ctdb, data.dsize);
1939         CTDB_NO_MEMORY(ctdb, data.dptr);
1940
1941         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
1942         list->addr = *addr;
1943         list->tickles.num = num;
1944         if (tcparray) {
1945                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
1946         }
1947
1948         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1949                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
1950                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1951         if (ret != 0) {
1952                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
1953                 return -1;
1954         }
1955
1956         talloc_free(data.dptr);
1957
1958         return ret;
1959 }
1960
1961
1962 /*
1963   perform tickle updates if required
1964  */
1965 static void ctdb_update_tcp_tickles(struct event_context *ev, 
1966                                 struct timed_event *te, 
1967                                 struct timeval t, void *private_data)
1968 {
1969         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1970         int ret;
1971         struct ctdb_vnn *vnn;
1972
1973         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1974                 /* we only send out updates for public addresses that 
1975                    we have taken over
1976                  */
1977                 if (ctdb->pnn != vnn->pnn) {
1978                         continue;
1979                 }
1980                 /* We only send out the updates if we need to */
1981                 if (!vnn->tcp_update_needed) {
1982                         continue;
1983                 }
1984                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
1985                                 TAKEOVER_TIMEOUT(),
1986                                 CTDB_BROADCAST_CONNECTED,
1987                                 &vnn->public_address,
1988                                 vnn->tcp_array);
1989                 if (ret != 0) {
1990                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
1991                                 ctdb_addr_to_str(&vnn->public_address)));
1992                 }
1993         }
1994
1995         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
1996                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
1997                              ctdb_update_tcp_tickles, ctdb);
1998 }               
1999         
2000
2001 /*
2002   start periodic update of tcp tickles
2003  */
2004 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2005 {
2006         ctdb->tickle_update_context = talloc_new(ctdb);
2007
2008         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2009                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2010                              ctdb_update_tcp_tickles, ctdb);
2011 }
2012
2013
2014
2015
2016 struct control_gratious_arp {
2017         struct ctdb_context *ctdb;
2018         ctdb_sock_addr addr;
2019         const char *iface;
2020         int count;
2021 };
2022
2023 /*
2024   send a control_gratuitous arp
2025  */
2026 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2027                                   struct timeval t, void *private_data)
2028 {
2029         int ret;
2030         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2031                                                         struct control_gratious_arp);
2032
2033         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2034         if (ret != 0) {
2035                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp failed (%s)\n", strerror(errno)));
2036         }
2037
2038
2039         arp->count++;
2040         if (arp->count == CTDB_ARP_REPEAT) {
2041                 talloc_free(arp);
2042                 return;
2043         }
2044
2045         event_add_timed(arp->ctdb->ev, arp, 
2046                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2047                         send_gratious_arp, arp);
2048 }
2049
2050
2051 /*
2052   send a gratious arp 
2053  */
2054 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2055 {
2056         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2057         struct control_gratious_arp *arp;
2058
2059         /* verify the size of indata */
2060         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2061                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2062                                  (unsigned)indata.dsize, 
2063                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2064                 return -1;
2065         }
2066         if (indata.dsize != 
2067                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2068                 + gratious_arp->len ) ){
2069
2070                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2071                         "but should be %u bytes\n", 
2072                          (unsigned)indata.dsize, 
2073                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2074                 return -1;
2075         }
2076
2077
2078         arp = talloc(ctdb, struct control_gratious_arp);
2079         CTDB_NO_MEMORY(ctdb, arp);
2080
2081         arp->ctdb  = ctdb;
2082         arp->addr   = gratious_arp->addr;
2083         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2084         CTDB_NO_MEMORY(ctdb, arp->iface);
2085         arp->count = 0;
2086         
2087         event_add_timed(arp->ctdb->ev, arp, 
2088                         timeval_zero(), send_gratious_arp, arp);
2089
2090         return 0;
2091 }
2092
2093 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2094 {
2095         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2096         int ret;
2097
2098         /* verify the size of indata */
2099         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2100                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2101                 return -1;
2102         }
2103         if (indata.dsize != 
2104                 ( offsetof(struct ctdb_control_ip_iface, iface)
2105                 + pub->len ) ){
2106
2107                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2108                         "but should be %u bytes\n", 
2109                          (unsigned)indata.dsize, 
2110                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2111                 return -1;
2112         }
2113
2114         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2115
2116         if (ret != 0) {
2117                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2118                 return -1;
2119         }
2120
2121         return 0;
2122 }
2123
/*
  completion callback for the releaseip event started by
  del_public_address: all that is left to do is to free the context that
  was handed in as private_data
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        void *event_ctx = private_data;

        talloc_free(event_ctx);
}
2132
2133 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2134 {
2135         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2136         struct ctdb_vnn *vnn;
2137         int ret;
2138
2139         /* verify the size of indata */
2140         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2141                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2142                 return -1;
2143         }
2144         if (indata.dsize != 
2145                 ( offsetof(struct ctdb_control_ip_iface, iface)
2146                 + pub->len ) ){
2147
2148                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2149                         "but should be %u bytes\n", 
2150                          (unsigned)indata.dsize, 
2151                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2152                 return -1;
2153         }
2154
2155         /* walk over all public addresses until we find a match */
2156         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2157                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2158                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2159
2160                         DLIST_REMOVE(ctdb->vnn, vnn);
2161
2162                         ret = ctdb_event_script_callback(ctdb, 
2163                                          mem_ctx, delete_ip_callback, mem_ctx,
2164                                          false,
2165                                          CTDB_EVENT_RELEASE_IP,
2166                                          "%s %s %u",
2167                                          vnn->iface, 
2168                                          talloc_strdup(mem_ctx, ctdb_addr_to_str(&vnn->public_address)),
2169                                          vnn->public_netmask_bits);
2170                         if (vnn->killtcp) {
2171                                 vnn->killtcp->vnn = NULL;
2172                         }
2173                         talloc_free(vnn);
2174                         if (ret != 0) {
2175                                 return -1;
2176                         }
2177                         return 0;
2178                 }
2179         }
2180
2181         return -1;
2182 }
2183
2184 /* This function is called from the recovery daemon to verify that a remote
2185    node has the expected ip allocation.
2186    This is verified against ctdb->ip_tree
2187 */
2188 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
2189 {
2190         struct ctdb_public_ip_list *tmp_ip; 
2191         int i;
2192
2193         if (ctdb->ip_tree == NULL) {
2194                 /* dont know the expected allocation yet, assume remote node
2195                    is correct. */
2196                 return 0;
2197         }
2198
2199         if (ips == NULL) {
2200                 return 0;
2201         }
2202
2203         for (i=0; i<ips->num; i++) {
2204                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
2205                 if (tmp_ip == NULL) {
2206                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
2207                         return -1;
2208                 }
2209
2210                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
2211                         continue;
2212                 }
2213
2214                 if (tmp_ip->pnn != ips->ips[i].pnn) {
2215                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
2216                         return -1;
2217                 }
2218         }
2219
2220         return 0;
2221 }
2222
2223 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
2224 {
2225         struct ctdb_public_ip_list *tmp_ip; 
2226
2227         if (ctdb->ip_tree == NULL) {
2228                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
2229                 return -1;
2230         }
2231
2232         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
2233         if (tmp_ip == NULL) {
2234                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
2235                 return -1;
2236         }
2237
2238         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
2239         tmp_ip->pnn = ip->pnn;
2240
2241         return 0;
2242 }