4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
/* Timeout argument for the release/takeover controls sent from
   ctdb_takeover_run: timeval_current_ofs() applied to the
   takeover_timeout tunable.  Relies on a variable named ctdb being in
   scope wherever the macro is expanded. */
30 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
/* Offset given to timeval_current_ofs() when ctdb_control_send_arp
   re-arms its timer (presumably seconds - confirm). */
32 #define CTDB_ARP_INTERVAL 1
/* Value that arp->count is compared against in ctdb_control_send_arp. */
33 #define CTDB_ARP_REPEAT 3
/* State handed to ctdb_control_send_arp as its private data; allocated
   by takeover_ip_callback under takeover.last_ctx.
   NOTE(review): ctdb_control_send_arp also reads arp->count, which is
   not among the members listed here - confirm its declaration. */
35 struct ctdb_takeover_arp {
/* daemon context; used to reach the event context and takeover settings */
36 struct ctdb_context *ctdb;
/* address passed to ctdb_sys_send_arp() */
38 struct sockaddr_in sin;
/* connections that get a ctdb_sys_send_tcp() call on every timer run */
39 struct ctdb_tcp_list *tcp_list;
43 lists of tcp endpoints
/* One tracked TCP connection, kept on lists managed with the DLIST_*
   macros (ctdb->tcp_list, client->tcp_list, arp->tcp_list).
   NOTE(review): other code reads and compares tcp->vnn, which is not
   among the members listed here - confirm its declaration. */
45 struct ctdb_tcp_list {
46 struct ctdb_tcp_list *prev, *next;
/* first address argument of ctdb_sys_send_tcp() */
48 struct sockaddr_in saddr;
/* copied from the dest field of the control payload; its sin_addr is
   matched against the public address being taken over or released */
49 struct sockaddr_in daddr;
54 list of clients to kill on IP release
/* Entry on ctdb->client_ip_list, created per call of
   ctdb_control_tcp_client and walked by release_kill_clients.
   NOTE(review): ctdb_control_tcp_client assigns ip->client_id, which is
   not among the members listed here - confirm its declaration. */
56 struct ctdb_client_ip {
57 struct ctdb_client_ip *prev, *next;
/* used by ctdb_client_ip_destructor to find the list head */
58 struct ctdb_context *ctdb;
/* address compared against the one being released */
59 struct sockaddr_in ip;
/*
  Timed-event handler for one ctdb_takeover_arp (passed as private_data).

  Each run calls ctdb_sys_send_arp() for arp->sin on the takeover
  interface, then ctdb_sys_send_tcp() once per entry of arp->tcp_list,
  and finally schedules itself again, CTDB_ARP_INTERVAL later, on
  takeover.last_ctx.  The DEBUG lines report failed sends.
  ev, te and t are not referenced in the lines shown.
  NOTE(review): the statements around the arp->count test (where count
  is advanced and what happens once it equals CTDB_ARP_REPEAT) are not
  shown here - presumably the handler stops rescheduling; confirm.
 */
67 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
68 struct timeval t, void *private_data)
70 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
71 struct ctdb_takeover_arp);
73 struct ctdb_tcp_list *tcp;
/* announce the address on the takeover interface */
75 ret = ctdb_sys_send_arp(&arp->sin, arp->ctdb->takeover.interface);
77 DEBUG(0,(__location__ " sending of arp failed (%s)\n", strerror(errno)));
/* one ctdb_sys_send_tcp() call per connection attached to this arp state */
80 for (tcp=arp->tcp_list;tcp;tcp=tcp->next) {
81 DEBUG(2,("sending tcp tickle ack for %u->%s:%u\n",
82 (unsigned)ntohs(tcp->daddr.sin_port),
83 inet_ntoa(tcp->saddr.sin_addr),
84 (unsigned)ntohs(tcp->saddr.sin_port)));
85 ret = ctdb_sys_send_tcp(&tcp->saddr, &tcp->daddr, 0, 0, 0);
87 DEBUG(0,(__location__ " Failed to send tcp tickle ack for %s\n",
88 inet_ntoa(tcp->saddr.sin_addr)));
94 if (arp->count == CTDB_ARP_REPEAT) {
/* re-arm: same handler, same private data */
99 event_add_timed(arp->ctdb->ev, arp->ctdb->takeover.last_ctx,
100 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
101 ctdb_control_send_arp, arp);
/* Context carried from ctdb_control_takeover_ip / ctdb_control_release_ip
   to their event-script completion callbacks. */
104 struct takeover_callback_state {
/* the control request answered later via ctdb_request_control_reply() */
105 struct ctdb_req_control *c;
/* copy of the public address being taken or released */
106 struct sockaddr_in *sin;
110 called when takeip event finishes
/*
  Completion callback for the event script started by
  ctdb_control_takeover_ip; private_data is the takeover_callback_state.

  Restarts monitoring, then builds a ctdb_takeover_arp under
  takeover.last_ctx holding the taken address plus one list entry for
  every connection in ctdb->tcp_list whose destination address equals
  it, and schedules ctdb_control_send_arp to run at timeval_zero().
  The control request state->c is answered with 0 after scheduling; the
  lines shown also contain replies carrying status and -1.
  NOTE(review): the condition guarding the status reply, the contents
  copied into each new list entry, and the failed label targeted by the
  gotos are not shown here - confirm.
 */
112 static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
115 struct takeover_callback_state *state =
116 talloc_get_type(private_data, struct takeover_callback_state);
117 struct ctdb_takeover_arp *arp;
118 char *ip = inet_ntoa(state->sin->sin_addr);
119 struct ctdb_tcp_list *tcp;
121 ctdb_start_monitoring(ctdb);
124 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
125 ip, ctdb->takeover.interface));
126 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
/* last_ctx is created on demand; ctdb_control_release_ip frees it and
   sets it back to NULL */
131 if (!ctdb->takeover.last_ctx) {
132 ctdb->takeover.last_ctx = talloc_new(ctdb);
133 if (!ctdb->takeover.last_ctx) goto failed;
136 arp = talloc_zero(ctdb->takeover.last_ctx, struct ctdb_takeover_arp);
137 if (!arp) goto failed;
/* NOTE(review): arp->ctdb is read when scheduling below, but its
   assignment is not shown here - confirm it is set */
140 arp->sin = *state->sin;
142 /* add all of the known tcp connections for this IP to the
143 list of tcp connections to send tickle acks for */
144 for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) {
145 if (state->sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
146 struct ctdb_tcp_list *t2 = talloc(arp, struct ctdb_tcp_list);
147 if (t2 == NULL) goto failed;
149 DLIST_ADD(arp->tcp_list, t2);
/* first run is scheduled with a zero timeval */
153 event_add_timed(arp->ctdb->ev, arp->ctdb->takeover.last_ctx,
154 timeval_zero(), ctdb_control_send_arp, arp);
156 /* the control succeeded */
157 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
162 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
168 take over an ip address
/*
  Control handler asking this node to take over the public address in
  indata (interpreted as a struct ctdb_public_ip).

  Records pip->takeover_vnn in the node table, checks
  ctdb_sys_have_ip(), and otherwise stops monitoring and starts an event
  script through ctdb_event_script_callback() with takeover_ip_callback
  as completion handler; the reply to c is then sent from that callback.
  NOTE(review): indata.dptr is cast without a visible size check and
  pip->vnn indexes ctdb->nodes without a visible range check - confirm
  the caller validates both.
  NOTE(review): return values and the remaining parameters are not
  shown here.
 */
170 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
171 struct ctdb_req_control *c,
176 struct takeover_callback_state *state;
177 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
178 char *ip = inet_ntoa(pip->sin.sin_addr);
181 /* update our node table */
182 ctdb->nodes[pip->vnn]->takeover_vnn = pip->takeover_vnn;
184 /* if our kernel already has this IP, do nothing */
185 if (ctdb_sys_have_ip(ip)) {
189 state = talloc(ctdb, struct takeover_callback_state);
190 CTDB_NO_MEMORY(ctdb, state);
/* NOTE(review): c and sin are parented to ctdb here, whereas
   ctdb_control_release_ip parents them to state; with this parenting,
   freeing state would not free them - confirm which is intended */
192 state->c = talloc_steal(ctdb, c);
193 state->sin = talloc(ctdb, struct sockaddr_in);
194 CTDB_NO_MEMORY(ctdb, state->sin);
195 *state->sin = pip->sin;
197 DEBUG(0,("Takover of IP %s/%u on interface %s\n",
198 ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits,
199 ctdb->takeover.interface));
/* monitoring is restarted by takeover_ip_callback */
201 ctdb_stop_monitoring(ctdb);
203 ret = ctdb_event_script_callback(ctdb,
204 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
205 state, takeover_ip_callback, state,
207 ctdb->takeover.interface,
209 ctdb->nodes[ctdb->vnn]->public_netmask_bits);
211 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
212 ip, ctdb->takeover.interface));
217 /* tell ctdb_control.c that we will be replying asynchronously */
224 kill any clients that are registered with an IP that is being released
/*
  Walk ctdb->client_ip_list and, for every entry whose address equals
  in, look up the owning client with ctdb_reqid_find() and send SIGKILL
  to client->pid when that pid is non-zero.
 */
226 static void release_kill_clients(struct ctdb_context *ctdb, struct in_addr in)
228 struct ctdb_client_ip *ip;
230 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
231 if (ip->ip.sin_addr.s_addr == in.s_addr) {
/* NOTE(review): client is dereferenced right after the lookup and no
   NULL check is visible - confirm ctdb_reqid_find() cannot fail here */
232 struct ctdb_client *client = ctdb_reqid_find(ctdb,
235 if (client->pid != 0) {
236 DEBUG(0,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
237 (unsigned)client->pid, inet_ntoa(in),
239 kill(client->pid, SIGKILL);
246 called when releaseip event finishes
/*
  Completion callback for the event script started by
  ctdb_control_release_ip; private_data is the takeover_callback_state.

  Restarts monitoring, sends the released address as a NUL-terminated
  string to srvid CTDB_SRVID_RELEASE_IP on this node, kills clients
  registered for the address, broadcasts CTDB_CONTROL_TCP_ADD for every
  locally owned connection to that address, and answers the control
  request with 0.
  NOTE(review): the lines shown never examine status - confirm whether
  a failed script should still produce a success reply.
 */
248 static void release_ip_callback(struct ctdb_context *ctdb, int status,
251 struct takeover_callback_state *state =
252 talloc_get_type(private_data, struct takeover_callback_state);
/* result of inet_ntoa(); used below as the message payload */
253 char *ip = inet_ntoa(state->sin->sin_addr);
255 struct ctdb_tcp_list *tcp;
257 ctdb_start_monitoring(ctdb);
259 /* send a message to all clients of this node telling them
260 that the cluster has been reconfigured and they should
261 release any sockets on this IP */
262 data.dptr = (uint8_t *)ip;
263 data.dsize = strlen(ip)+1;
265 ctdb_daemon_send_message(ctdb, ctdb->vnn, CTDB_SRVID_RELEASE_IP, data);
267 /* kill clients that have registered with this IP */
268 release_kill_clients(ctdb, state->sin->sin_addr);
271 /* tell other nodes about any tcp connections we were holding with this IP */
272 for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) {
273 if (tcp->vnn == ctdb->vnn &&
274 state->sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
/* NOTE(review): the assignments filling t are not shown here */
275 struct ctdb_control_tcp_vnn t;
281 data.dptr = (uint8_t *)&t;
282 data.dsize = sizeof(t);
/* sent with CTDB_CTRL_FLAG_NOREPLY and no callback */
284 ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
285 CTDB_CONTROL_TCP_ADD,
286 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
290 /* the control succeeded */
291 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
297 release an ip address
/*
  Control handler asking this node to release the public address in
  indata (interpreted as a struct ctdb_public_ip).

  Records pip->takeover_vnn in the node table, checks
  ctdb_sys_have_ip(), frees takeover.last_ctx (which parents the pending
  ARP timers), then stops monitoring and starts the releaseip event
  script with release_ip_callback as completion handler; the reply to c
  is sent from that callback.
  NOTE(review): indata.dptr is cast without a visible size check and
  pip->vnn indexes ctdb->nodes without a visible range check - confirm
  the caller validates both.
  NOTE(review): return values and the remaining parameters are not
  shown here.
 */
299 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
300 struct ctdb_req_control *c,
305 struct takeover_callback_state *state;
306 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
307 char *ip = inet_ntoa(pip->sin.sin_addr);
309 /* update our node table */
310 ctdb->nodes[pip->vnn]->takeover_vnn = pip->takeover_vnn;
312 if (!ctdb_sys_have_ip(ip)) {
316 DEBUG(0,("Release of IP %s/%u on interface %s\n",
317 ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits,
318 ctdb->takeover.interface));
320 /* stop any previous arps */
321 talloc_free(ctdb->takeover.last_ctx);
322 ctdb->takeover.last_ctx = NULL;
324 state = talloc(ctdb, struct takeover_callback_state);
325 CTDB_NO_MEMORY(ctdb, state);
/* the request and the address copy are both made children of state */
327 state->c = talloc_steal(state, c);
328 state->sin = talloc(state, struct sockaddr_in);
329 CTDB_NO_MEMORY(ctdb, state->sin);
330 *state->sin = pip->sin;
/* monitoring is restarted by release_ip_callback */
332 ctdb_stop_monitoring(ctdb);
334 ret = ctdb_event_script_callback(ctdb,
335 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
336 state, release_ip_callback, state,
337 "releaseip %s %s %u",
338 ctdb->takeover.interface,
340 ctdb->nodes[ctdb->vnn]->public_netmask_bits);
342 DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
343 ip, ctdb->takeover.interface));
348 /* tell the control that we will be replying asynchronously */
356 setup the event script
/*
  Store a talloc copy of script, owned by ctdb, in
  ctdb->takeover.event_script.  CTDB_NO_MEMORY handles a failed copy.
  NOTE(review): the return statement is not shown here.
 */
358 int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script)
360 ctdb->takeover.event_script = talloc_strdup(ctdb, script);
361 CTDB_NO_MEMORY(ctdb, ctdb->takeover.event_script);
366 setup the public address list from a file
/*
  Load the public address list from the file named by alist.

  The file is read with file_lines_load(); trailing empty lines are
  examined by the while loop, and the remaining line count has to equal
  ctdb->num_nodes.  Line i becomes ctdb->nodes[i]->public_address, its
  takeover_vnn is reset to -1, the text after the slash is parsed with
  atoi() into public_netmask_bits (values above 32 are reported as
  illegal), and the address is checked with inet_aton().
  NOTE(review): the error returns, the body of the trailing-line loop
  and the statement between finding the slash and calling atoi() are not
  shown here; since public_address is later passed to inet_aton(), the
  slash is presumably cut off in place - confirm.
 */
368 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
374 lines = file_lines_load(alist, &nlines, ctdb);
376 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
379 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
/* one address line is expected per node */
383 if (nlines != ctdb->num_nodes) {
384 DEBUG(0,("Number of lines in %s does not match number of nodes!\n", alist));
389 for (i=0;i<nlines;i++) {
393 ctdb->nodes[i]->public_address = talloc_strdup(ctdb->nodes[i], lines[i]);
394 CTDB_NO_MEMORY(ctdb, ctdb->nodes[i]->public_address);
395 ctdb->nodes[i]->takeover_vnn = -1;
397 /* see if they supplied a netmask length */
398 p = strchr(ctdb->nodes[i]->public_address, '/');
400 DEBUG(0,("You must supply a netmask for public address %s\n",
401 ctdb->nodes[i]->public_address));
/* atoi() reports no errors; only the upper bound is checked below */
405 ctdb->nodes[i]->public_netmask_bits = atoi(p+1);
407 if (ctdb->nodes[i]->public_netmask_bits > 32) {
408 DEBUG(0, ("Illegal netmask for IP %s\n", ctdb->nodes[i]->public_address));
412 if (inet_aton(ctdb->nodes[i]->public_address, &in) == 0) {
413 DEBUG(0,("Badly formed IP '%s' in public address list\n", ctdb->nodes[i]->public_address));
423 see if two IPs are on the same subnet
/*
  Compare the network parts of two dotted IPv4 strings.

  Both strings are parsed with inet_aton(), converted to host byte order
  and masked with a mask whose low (32 - netmask_bits) bits are cleared;
  the comparison below detects differing network parts.
  NOTE(review): the inet_aton() results are ignored, so in1/in2 would be
  used unset if parsing fails - confirm callers only pass validated
  addresses.
  NOTE(review): the shift count is 32 - netmask_bits with no range check
  here; ctdb_set_public_addresses reports values above 32 as illegal.
  NOTE(review): the declaration of mask and the return statements are
  not shown here.
 */
425 static bool ctdb_same_subnet(const char *ip1, const char *ip2, uint8_t netmask_bits)
427 struct in_addr in1, in2;
430 inet_aton(ip1, &in1);
431 inet_aton(ip2, &in2);
/* 1LL keeps the shift in a 64-bit type, so netmask_bits == 0 (shift by 32) is valid */
433 mask = ~((1LL<<(32-netmask_bits))-1);
435 if ((ntohl(in1.s_addr) & mask) != (ntohl(in2.s_addr) & mask)) {
444 try to find an available node to take over a given node's IP, one that meets the
445 criterion given by the flags
/*
  Search for a node that can serve the public address of start_node.

  Candidates are visited in index order starting at start_node+1 and
  wrapping modulo nodemap->num.  A candidate qualifies when none of the
  bits in mask_flags is set in its nodemap flags and its public address
  is on the same subnet as the address of start_node (using the netmask
  of the candidate).  On a match, takeover_vnn of start_node is set to
  the vnn of the candidate.
  NOTE(review): the loop condition and what follows the assignment are
  not shown here - presumably the search stops at the first match and
  when it wraps back to start_node; confirm.
  NOTE(review): nodemap->num is used as a modulus without a visible
  check for zero.
 */
447 static void ctdb_takeover_find_node(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
448 int start_node, uint32_t mask_flags)
451 for (j=(start_node+1)%nodemap->num;
453 j=(j+1)%nodemap->num) {
/* flag filter first, then the subnet comparison */
454 if (!(nodemap->nodes[j].flags & mask_flags) &&
455 ctdb_same_subnet(ctdb->nodes[j]->public_address,
456 ctdb->nodes[start_node]->public_address,
457 ctdb->nodes[j]->public_netmask_bits)) {
458 ctdb->nodes[start_node]->takeover_vnn = nodemap->nodes[j].vnn;
466 make any IP alias changes for public addresses that are necessary
/*
  Recompute which node serves each public address and push the result
  to the cluster.

  First loop: a node that is neither INACTIVE nor DISABLED keeps its own
  address; for the others ctdb_takeover_find_node() looks for a
  replacement, first excluding inactive and disabled nodes and then
  excluding only inactive ones.  Second loop: nodes flagged DISCONNECTED
  are tested for and, for the others, ctdb_ctrl_release_ip() is sent for
  each address whose takeover_vnn is a different node.  Third loop:
  ctdb_ctrl_takeover_ip() is sent to the takeover_vnn of every address
  that has one.
  NOTE(review): the handling after a failed control (the DEBUG lines
  below) and the return value are not shown here - confirm.
  NOTE(review): the unassigned marker is written as (uint32_t)-1 in the
  first loop and as plain -1 in the last loop; confirm both are meant to
  be the same sentinel.
  NOTE(review): the results of the inet_aton() calls below are ignored.
 */
468 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
472 struct ctdb_public_ip ip;
476 /* Work out which node will look after each public IP.
477 * takeover_node cycles over the nodes and is incremented each time a
478 * node has been assigned to take over for another node.
479 * This spreads the failed nodes out across the remaining
482 for (i=0;i<nodemap->num;i++) {
483 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
484 ctdb->nodes[i]->takeover_vnn = nodemap->nodes[i].vnn;
486 ctdb->nodes[i]->takeover_vnn = (uint32_t)-1;
488 ctdb_takeover_find_node(ctdb, nodemap, i, NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED);
490 /* if no enabled node can take it, then we
491 might as well use any active node. It
492 probably means that some subsystem (such as
493 NFS) is sick on all nodes. Best we can do
494 is to keep the other services up. */
495 if (ctdb->nodes[i]->takeover_vnn == (uint32_t)-1) {
496 ctdb_takeover_find_node(ctdb, nodemap, i, NODE_FLAGS_INACTIVE);
499 if (ctdb->nodes[i]->takeover_vnn == (uint32_t)-1) {
500 DEBUG(0,(__location__ " No node available on same network to take %s\n",
501 ctdb->nodes[i]->public_address));
506 /* at this point ctdb->nodes[i]->takeover_vnn is the vnn which will own each IP */
508 /* now tell all nodes to delete any alias that they should not
509 have. This will be a NOOP on nodes that don't currently
510 hold the given alias */
511 for (i=0;i<nodemap->num;i++) {
512 /* don't talk to unconnected nodes, but do talk to banned nodes */
513 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
517 /* tell this node to delete all of the aliases that it should not have */
518 for (j=0;j<nodemap->num;j++) {
519 if (ctdb->nodes[j]->takeover_vnn != nodemap->nodes[i].vnn) {
521 ip.takeover_vnn = ctdb->nodes[j]->takeover_vnn;
522 ip.sin.sin_family = AF_INET;
523 inet_aton(ctdb->nodes[j]->public_address, &ip.sin.sin_addr);
525 ret = ctdb_ctrl_release_ip(ctdb, TAKEOVER_TIMEOUT(),
526 nodemap->nodes[i].vnn,
529 DEBUG(0,("Failed to tell vnn %u to release IP %s\n",
530 nodemap->nodes[i].vnn,
531 ctdb->nodes[j]->public_address));
538 /* tell all nodes to get their own IPs */
539 for (i=0;i<nodemap->num;i++) {
540 if (ctdb->nodes[i]->takeover_vnn == -1) {
541 /* this IP won't be taken over */
545 ip.takeover_vnn = ctdb->nodes[i]->takeover_vnn;
546 ip.sin.sin_family = AF_INET;
547 inet_aton(ctdb->nodes[i]->public_address, &ip.sin.sin_addr);
549 ret = ctdb_ctrl_takeover_ip(ctdb, TAKEOVER_TIMEOUT(),
550 ctdb->nodes[i]->takeover_vnn,
553 DEBUG(0,("Failed asking vnn %u to take over IP %s\n",
554 ctdb->nodes[i]->takeover_vnn,
555 ctdb->nodes[i]->public_address));
565 destroy a ctdb_client_ip structure
/*
  Destructor installed with talloc_set_destructor() in
  ctdb_control_tcp_client: unlinks the entry from
  ctdb->client_ip_list.
  NOTE(review): the return statement is not shown here.
 */
567 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
569 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
574 called by a client to inform us of a TCP connection that it is managing
575 and that should be tickled with an ACK when IP takeover is done
/*
  Control handler with which a local client registers a TCP connection
  (indata is interpreted as a struct ctdb_control_tcp).

  Allocates a ctdb_client_ip and a ctdb_tcp_list entry as talloc
  children of the client, links them onto ctdb->client_ip_list and
  client->tcp_list, and broadcasts CTDB_CONTROL_TCP_ADD carrying a
  struct ctdb_control_tcp_vnn to all connected nodes.
  NOTE(review): client comes from ctdb_reqid_find() and client->tcp_list
  is used with no visible NULL check - confirm the lookup cannot fail.
  NOTE(review): indata.dptr is cast without a visible size check; the
  assignments filling ip, tcp->saddr, tcp->vnn and t, and the return
  values, are not shown here.
 */
577 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id, uint32_t vnn,
580 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
581 struct ctdb_control_tcp *p = (struct ctdb_control_tcp *)indata.dptr;
582 struct ctdb_tcp_list *tcp;
583 struct ctdb_control_tcp_vnn t;
586 struct ctdb_client_ip *ip;
/* parented to the client, so it is freed together with the client */
588 ip = talloc(client, struct ctdb_client_ip);
589 CTDB_NO_MEMORY(ctdb, ip);
593 ip->client_id = client_id;
/* the destructor unlinks the entry from ctdb->client_ip_list */
594 talloc_set_destructor(ip, ctdb_client_ip_destructor);
595 DLIST_ADD(ctdb->client_ip_list, ip);
597 tcp = talloc(client, struct ctdb_tcp_list);
598 CTDB_NO_MEMORY(ctdb, tcp);
602 tcp->daddr = p->dest;
/* per-client list, drained by ctdb_takeover_client_destructor_hook */
604 DLIST_ADD(client->tcp_list, tcp);
610 data.dptr = (uint8_t *)&t;
611 data.dsize = sizeof(t);
613 /* tell all nodes about this tcp connection */
614 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
615 CTDB_CONTROL_TCP_ADD,
616 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
618 DEBUG(0,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
626 see if two sockaddr_in are the same
/*
  Field-wise equality test for two IPv4 socket addresses.

  Only the address family, the port and the IPv4 address are compared,
  so the padding bytes of struct sockaddr_in never influence the result.
  Neither argument is modified.  Returns true when all three fields
  match, false otherwise.
 */
static bool same_sockaddr_in(const struct sockaddr_in *in1, const struct sockaddr_in *in2)
{
	return in1->sin_family == in2->sin_family &&
		in1->sin_port == in2->sin_port &&
		in1->sin_addr.s_addr == in2->sin_addr.s_addr;
}
636 find a tcp address on a list
/*
  Look for an entry on list that describes the same connection as tcp,
  meaning both saddr and daddr compare equal under same_sockaddr_in().
  NOTE(review): the loop over the list and the return statements are not
  shown here - callers treat NULL as not found; confirm.
 */
638 static struct ctdb_tcp_list *ctdb_tcp_find(struct ctdb_tcp_list *list,
639 struct ctdb_tcp_list *tcp)
642 if (same_sockaddr_in(&list->saddr, &tcp->saddr) &&
643 same_sockaddr_in(&list->daddr, &tcp->daddr)) {
652 called by a daemon to inform us of a TCP connection that one of its
653 clients is managing and that should be tickled with an ACK when IP takeover is done
/*
  Control handler recording a connection announced by a daemon (indata
  is interpreted as a struct ctdb_control_tcp_vnn).

  A new ctdb_tcp_list entry is allocated on ctdb and filled from the
  payload; it is linked onto ctdb->tcp_list only when ctdb_tcp_find()
  reports no existing entry for the same connection.
  NOTE(review): what happens to the freshly allocated entry on the
  already-known path is not shown here - confirm it is freed.
  NOTE(review): indata.dptr is cast without a visible size check; the
  assignments of tcp->vnn and tcp->saddr and the return value are not
  shown here.
 */
656 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
658 struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
659 struct ctdb_tcp_list *tcp;
661 tcp = talloc(ctdb, struct ctdb_tcp_list);
662 CTDB_NO_MEMORY(ctdb, tcp);
666 tcp->daddr = p->dest;
/* duplicates are detected by comparing both addresses */
668 if (NULL == ctdb_tcp_find(ctdb->tcp_list, tcp)) {
669 DLIST_ADD(ctdb->tcp_list, tcp);
670 DEBUG(2,("Added tickle info for %s:%u from vnn %u\n",
671 inet_ntoa(tcp->daddr.sin_addr), ntohs(tcp->daddr.sin_port),
674 DEBUG(4,("Already had tickle info for %s:%u from vnn %u\n",
675 inet_ntoa(tcp->daddr.sin_addr), ntohs(tcp->daddr.sin_port),
683 called by a daemon to inform us that a TCP connection one of its
684 clients was managing should no longer be tickled with an ACK when IP takeover is done
/*
  Control handler dropping a connection previously recorded on
  ctdb->tcp_list (indata is interpreted as a struct
  ctdb_control_tcp_vnn).

  A stack entry t serves as the search key for ctdb_tcp_find(); the
  matching list entry is unlinked with DLIST_REMOVE.
  NOTE(review): the assignments filling t, the check of the lookup
  result, the release of the unlinked entry and the return value are not
  shown here - confirm.
 */
687 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
689 struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
690 struct ctdb_tcp_list t, *tcp;
696 tcp = ctdb_tcp_find(ctdb->tcp_list, &t);
698 DEBUG(2,("Removed tickle info for %s:%u from vnn %u\n",
699 inet_ntoa(tcp->daddr.sin_addr), ntohs(tcp->daddr.sin_port),
701 DLIST_REMOVE(ctdb->tcp_list, tcp);
710 called when a daemon restarts - wipes all tcp entries from that vnn
/*
  Control handler run when the daemon with the given vnn has
  (re)started.

  Walks ctdb->tcp_list using a saved next pointer so entries can be
  unlinked during the walk: entries recorded for vnn are removed from
  the list, and for entries owned by this node (tcp->vnn == ctdb->vnn) a
  CTDB_CONTROL_TCP_ADD control is sent to vnn without waiting for a
  reply.
  NOTE(review): the assignment of next, the release of unlinked
  entries, the assignments filling t and the return value are not shown
  here - confirm.
 */
712 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
714 struct ctdb_tcp_list *tcp, *next;
715 for (tcp=ctdb->tcp_list;tcp;tcp=next) {
717 if (tcp->vnn == vnn) {
718 DLIST_REMOVE(ctdb->tcp_list, tcp);
722 /* and tell the new guy about any that he should have
724 if (tcp->vnn == ctdb->vnn) {
725 struct ctdb_control_tcp_vnn t;
732 data.dptr = (uint8_t *)&t;
733 data.dsize = sizeof(t);
735 ctdb_daemon_send_control(ctdb, vnn, 0,
736 CTDB_CONTROL_TCP_ADD,
737 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
745 called when a client structure goes away - hook to remove
746 elements from the tcp_list in all daemons
/*
  Drain client->tcp_list: each entry is unlinked and a
  CTDB_CONTROL_TCP_REMOVE control carrying a struct ctdb_control_tcp_vnn
  is broadcast to all connected nodes without waiting for a reply.
  NOTE(review): the assignments filling p and any release of the
  unlinked entry are not shown here - confirm.
 */
748 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
750 while (client->tcp_list) {
752 struct ctdb_control_tcp_vnn p;
/* always take the current head; the loop ends when the list is empty */
753 struct ctdb_tcp_list *tcp = client->tcp_list;
754 DLIST_REMOVE(client->tcp_list, tcp);
758 data.dptr = (uint8_t *)&p;
759 data.dsize = sizeof(p);
760 ctdb_daemon_send_control(client->ctdb, CTDB_BROADCAST_CONNECTED, 0,
761 CTDB_CONTROL_TCP_REMOVE,
762 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
769 release all IPs on shutdown
/*
  For every node entry whose public address ctdb_sys_have_ip() reports
  as held by this host, run the releaseip event script through
  ctdb_event_script() and, when the address parses with inet_aton(),
  kill the clients registered for it.
  The takeover.enabled flag is tested first.
  NOTE(review): the body of the takeover.enabled test is not shown here
  - presumably an early return; confirm.
  NOTE(review): the result of ctdb_event_script() is not used in the
  lines shown.
 */
771 void ctdb_release_all_ips(struct ctdb_context *ctdb)
775 if (!ctdb->takeover.enabled) {
779 for (i=0;i<ctdb->num_nodes;i++) {
780 struct ctdb_node *node = ctdb->nodes[i];
781 if (ctdb_sys_have_ip(node->public_address)) {
783 ctdb_event_script(ctdb, "releaseip %s %s %u",
784 ctdb->takeover.interface,
785 node->public_address,
786 node->public_netmask_bits);
/* clients are only killed when the stored address string parses */
787 if (inet_aton(node->public_address, &in) != 0) {
788 release_kill_clients(ctdb, in);
796 get list of public IPs
798 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, TDB_DATA *outdata)
801 struct ctdb_all_public_ips *ips;
803 len = offsetof(struct ctdb_all_public_ips, ips) + ctdb->num_nodes*sizeof(struct ctdb_public_ip);
805 ips = talloc_zero_size(outdata, len);
806 CTDB_NO_MEMORY(ctdb, ips);
808 outdata->dsize = len;
809 outdata->dptr = (uint8_t *)ips;
811 ips->num = ctdb->num_nodes;
812 for(i=0;i<ctdb->num_nodes;i++){
814 ips->ips[i].takeover_vnn = ctdb->nodes[i]->takeover_vnn;
815 ips->ips[i].sin.sin_family = AF_INET;
816 if (ctdb->nodes[i]->public_address) {
817 inet_aton(ctdb->nodes[i]->public_address, &ips->ips[i].sin.sin_addr);