4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT 3
37 struct ctdb_iface *prev, *next;
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
46 return vnn->iface->name;
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
56 /* Verify that we don't have an entry for this interface yet */
57 for (i=ctdb->ifaces;i;i=i->next) {
58 if (strcmp(i->name, iface) == 0) {
63 /* create a new structure for this interface */
64 i = talloc_zero(ctdb, struct ctdb_iface);
65 CTDB_NO_MEMORY_FATAL(ctdb, i);
66 i->name = talloc_strdup(i, iface);
67 CTDB_NO_MEMORY(ctdb, i->name);
70 DLIST_ADD(ctdb->ifaces, i);
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
80 /* Look for an existing entry for this interface */
81 for (i=ctdb->ifaces;i;i=i->next) {
82 if (strcmp(i->name, iface) == 0) {
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
94 struct ctdb_iface *cur = NULL;
95 struct ctdb_iface *best = NULL;
97 for (i=0; vnn->ifaces[i]; i++) {
99 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
113 if (cur->references < best->references) {
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123 struct ctdb_vnn *vnn)
125 struct ctdb_iface *best = NULL;
128 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129 "still assigned to iface '%s'\n",
130 ctdb_addr_to_str(&vnn->public_address),
131 ctdb_vnn_iface_string(vnn)));
135 best = ctdb_vnn_best_iface(ctdb, vnn);
137 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138 "cannot assign to iface any iface\n",
139 ctdb_addr_to_str(&vnn->public_address)));
145 vnn->pnn = ctdb->pnn;
147 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148 "now assigned to iface '%s' refs[%d]\n",
149 ctdb_addr_to_str(&vnn->public_address),
150 ctdb_vnn_iface_string(vnn),
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156 struct ctdb_vnn *vnn)
158 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159 "now unassigned (old iface '%s' refs[%d])\n",
160 ctdb_addr_to_str(&vnn->public_address),
161 ctdb_vnn_iface_string(vnn),
162 vnn->iface?vnn->iface->references:0));
164 vnn->iface->references--;
167 if (vnn->pnn == ctdb->pnn) {
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173 struct ctdb_vnn *vnn)
177 if (vnn->iface && vnn->iface->link_up) {
181 for (i=0; vnn->ifaces[i]; i++) {
182 struct ctdb_iface *cur;
184 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
197 struct ctdb_takeover_arp {
198 struct ctdb_context *ctdb;
201 struct ctdb_tcp_array *tcparray;
202 struct ctdb_vnn *vnn;
207 lists of tcp endpoints
209 struct ctdb_tcp_list {
210 struct ctdb_tcp_list *prev, *next;
211 struct ctdb_tcp_connection connection;
215 list of clients to kill on IP release
217 struct ctdb_client_ip {
218 struct ctdb_client_ip *prev, *next;
219 struct ctdb_context *ctdb;
226 send a gratuitous arp
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
229 struct timeval t, void *private_data)
231 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
232 struct ctdb_takeover_arp);
234 struct ctdb_tcp_array *tcparray;
235 const char *iface = ctdb_vnn_iface_string(arp->vnn);
237 ret = ctdb_sys_send_arp(&arp->addr, iface);
239 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240 iface, strerror(errno)));
243 tcparray = arp->tcparray;
245 for (i=0;i<tcparray->num;i++) {
246 struct ctdb_tcp_connection *tcon;
248 tcon = &tcparray->connections[i];
249 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
251 ctdb_addr_to_str(&tcon->src_addr),
252 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253 ret = ctdb_sys_send_tcp(
258 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259 ctdb_addr_to_str(&tcon->src_addr)));
266 if (arp->count == CTDB_ARP_REPEAT) {
271 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
272 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
273 ctdb_control_send_arp, arp);
276 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
277 struct ctdb_vnn *vnn)
279 struct ctdb_takeover_arp *arp;
280 struct ctdb_tcp_array *tcparray;
282 if (!vnn->takeover_ctx) {
283 vnn->takeover_ctx = talloc_new(vnn);
284 if (!vnn->takeover_ctx) {
289 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
295 arp->addr = vnn->public_address;
298 tcparray = vnn->tcp_array;
300 /* add all of the known tcp connections for this IP to the
301 list of tcp connections to send tickle acks for */
302 arp->tcparray = talloc_steal(arp, tcparray);
304 vnn->tcp_array = NULL;
305 vnn->tcp_update_needed = true;
308 event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
309 timeval_zero(), ctdb_control_send_arp, arp);
314 struct takeover_callback_state {
315 struct ctdb_req_control *c;
316 ctdb_sock_addr *addr;
317 struct ctdb_vnn *vnn;
320 struct ctdb_do_takeip_state {
321 struct ctdb_req_control *c;
322 struct ctdb_vnn *vnn;
326 called when takeip event finishes
328 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
331 struct ctdb_do_takeip_state *state =
332 talloc_get_type(private_data, struct ctdb_do_takeip_state);
337 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339 if (status == -ETIME) {
342 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
343 ctdb_addr_to_str(&state->vnn->public_address),
344 ctdb_vnn_iface_string(state->vnn)));
345 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347 node->flags |= NODE_FLAGS_UNHEALTHY;
352 if (ctdb->do_checkpublicip) {
354 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
356 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
363 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
364 data.dsize = strlen((char *)data.dptr) + 1;
365 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
367 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
370 /* the control succeeded */
371 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
377 take over an ip address
379 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
380 struct ctdb_req_control *c,
381 struct ctdb_vnn *vnn)
384 struct ctdb_do_takeip_state *state;
386 ret = ctdb_vnn_assign_iface(ctdb, vnn);
388 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
389 "assign a usable interface\n",
390 ctdb_addr_to_str(&vnn->public_address),
391 vnn->public_netmask_bits));
395 state = talloc(vnn, struct ctdb_do_takeip_state);
396 CTDB_NO_MEMORY(ctdb, state);
398 state->c = talloc_steal(ctdb, c);
401 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
402 ctdb_addr_to_str(&vnn->public_address),
403 vnn->public_netmask_bits,
404 ctdb_vnn_iface_string(vnn)));
406 ret = ctdb_event_script_callback(ctdb,
408 ctdb_do_takeip_callback,
413 ctdb_vnn_iface_string(vnn),
414 ctdb_addr_to_str(&vnn->public_address),
415 vnn->public_netmask_bits);
418 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
419 ctdb_addr_to_str(&vnn->public_address),
420 ctdb_vnn_iface_string(vnn)));
428 struct ctdb_do_updateip_state {
429 struct ctdb_req_control *c;
430 struct ctdb_iface *old;
431 struct ctdb_vnn *vnn;
435 called when updateip event finishes
437 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
440 struct ctdb_do_updateip_state *state =
441 talloc_get_type(private_data, struct ctdb_do_updateip_state);
445 if (status == -ETIME) {
448 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
449 ctdb_addr_to_str(&state->vnn->public_address),
451 ctdb_vnn_iface_string(state->vnn)));
454 * All we can do is reset the old interface
455 * and let the next run fix it
457 ctdb_vnn_unassign_iface(ctdb, state->vnn);
458 state->vnn->iface = state->old;
459 state->vnn->iface->references++;
461 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
466 if (ctdb->do_checkpublicip) {
468 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
470 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
477 /* the control succeeded */
478 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
484 update (move) an ip address
486 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
487 struct ctdb_req_control *c,
488 struct ctdb_vnn *vnn)
491 struct ctdb_do_updateip_state *state;
492 struct ctdb_iface *old = vnn->iface;
493 const char *new_name;
495 ctdb_vnn_unassign_iface(ctdb, vnn);
496 ret = ctdb_vnn_assign_iface(ctdb, vnn);
498 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
499 "assin a usable interface (old iface '%s')\n",
500 ctdb_addr_to_str(&vnn->public_address),
501 vnn->public_netmask_bits,
506 new_name = ctdb_vnn_iface_string(vnn);
507 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
508 /* A benign update from one interface onto itself.
509 * no need to run the eventscripts in this case, just return
512 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
516 state = talloc(vnn, struct ctdb_do_updateip_state);
517 CTDB_NO_MEMORY(ctdb, state);
519 state->c = talloc_steal(ctdb, c);
523 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
524 "interface %s to %s\n",
525 ctdb_addr_to_str(&vnn->public_address),
526 vnn->public_netmask_bits,
530 ret = ctdb_event_script_callback(ctdb,
532 ctdb_do_updateip_callback,
535 CTDB_EVENT_UPDATE_IP,
539 ctdb_addr_to_str(&vnn->public_address),
540 vnn->public_netmask_bits);
542 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
543 ctdb_addr_to_str(&vnn->public_address),
544 old->name, new_name));
553 Find the vnn of the node that has a public ip address
554 returns NULL if the address is not known as a public address
556 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
558 struct ctdb_vnn *vnn;
560 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
561 if (ctdb_same_ip(&vnn->public_address, addr)) {
570 take over an ip address
572 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
573 struct ctdb_req_control *c,
578 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
579 struct ctdb_vnn *vnn;
580 bool have_ip = false;
581 bool do_updateip = false;
582 bool do_takeip = false;
583 struct ctdb_iface *best_iface = NULL;
585 if (pip->pnn != ctdb->pnn) {
586 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
587 "with pnn %d, but we're node %d\n",
588 ctdb_addr_to_str(&pip->addr),
589 pip->pnn, ctdb->pnn));
593 /* update our vnn list */
594 vnn = find_public_ip_vnn(ctdb, &pip->addr);
596 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
597 ctdb_addr_to_str(&pip->addr)));
601 if (ctdb->do_checkpublicip) {
602 have_ip = ctdb_sys_have_ip(&pip->addr);
604 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
605 if (best_iface == NULL) {
606 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
607 "a usable interface (old %s, have_ip %d)\n",
608 ctdb_addr_to_str(&vnn->public_address),
609 vnn->public_netmask_bits,
610 ctdb_vnn_iface_string(vnn),
615 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
616 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
621 if (vnn->iface == NULL && have_ip) {
622 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
623 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
624 ctdb_addr_to_str(&vnn->public_address)));
628 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
629 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
630 "and we have it on iface[%s], but it was assigned to node %d"
631 "and we are node %d, banning ourself\n",
632 ctdb_addr_to_str(&vnn->public_address),
633 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
638 if (vnn->pnn == -1 && have_ip) {
639 vnn->pnn = ctdb->pnn;
640 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
641 "and we already have it on iface[%s], update local daemon\n",
642 ctdb_addr_to_str(&vnn->public_address),
643 ctdb_vnn_iface_string(vnn)));
648 if (vnn->iface->link_up) {
649 /* only move when the rebalance gains something */
650 if (vnn->iface->references > (best_iface->references + 1)) {
653 } else if (vnn->iface != best_iface) {
660 ctdb_vnn_unassign_iface(ctdb, vnn);
667 ret = ctdb_do_takeip(ctdb, c, vnn);
671 } else if (do_updateip) {
672 ret = ctdb_do_updateip(ctdb, c, vnn);
678 * The interface is up and the kernel knows the ip
681 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
682 ctdb_addr_to_str(&pip->addr),
683 vnn->public_netmask_bits,
684 ctdb_vnn_iface_string(vnn)));
688 /* tell ctdb_control.c that we will be replying asynchronously */
695 takeover an ip address old v4 style
697 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb,
698 struct ctdb_req_control *c,
704 data.dsize = sizeof(struct ctdb_public_ip);
705 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
706 CTDB_NO_MEMORY(ctdb, data.dptr);
708 memcpy(data.dptr, indata.dptr, indata.dsize);
709 return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
713 kill any clients that are registered with an IP that is being released
715 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
717 struct ctdb_client_ip *ip;
719 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
720 ctdb_addr_to_str(addr)));
722 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
723 ctdb_sock_addr tmp_addr;
726 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
728 ctdb_addr_to_str(&ip->addr)));
730 if (ctdb_same_ip(&tmp_addr, addr)) {
731 struct ctdb_client *client = ctdb_reqid_find(ctdb,
734 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
736 ctdb_addr_to_str(&ip->addr),
739 if (client->pid != 0) {
740 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
741 (unsigned)client->pid,
742 ctdb_addr_to_str(addr),
744 kill(client->pid, SIGKILL);
751 called when releaseip event finishes
753 static void release_ip_callback(struct ctdb_context *ctdb, int status,
756 struct takeover_callback_state *state =
757 talloc_get_type(private_data, struct takeover_callback_state);
760 if (status == -ETIME) {
764 /* send a message to all clients of this node telling them
765 that the cluster has been reconfigured and they should
766 release any sockets on this IP */
767 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
768 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
769 data.dsize = strlen((char *)data.dptr)+1;
771 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
773 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
775 /* kill clients that have registered with this IP */
776 release_kill_clients(ctdb, state->addr);
778 ctdb_vnn_unassign_iface(ctdb, state->vnn);
780 /* the control succeeded */
781 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
786 release an ip address
788 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
789 struct ctdb_req_control *c,
794 struct takeover_callback_state *state;
795 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
796 struct ctdb_vnn *vnn;
798 /* update our vnn list */
799 vnn = find_public_ip_vnn(ctdb, &pip->addr);
801 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
802 ctdb_addr_to_str(&pip->addr)));
807 /* stop any previous arps */
808 talloc_free(vnn->takeover_ctx);
809 vnn->takeover_ctx = NULL;
811 if (ctdb->do_checkpublicip) {
813 if (!ctdb_sys_have_ip(&pip->addr)) {
814 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
815 ctdb_addr_to_str(&pip->addr),
816 vnn->public_netmask_bits,
817 ctdb_vnn_iface_string(vnn)));
818 ctdb_vnn_unassign_iface(ctdb, vnn);
822 if (vnn->iface == NULL) {
823 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
824 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
825 ctdb_addr_to_str(&vnn->public_address)));
829 } else if (vnn->iface == NULL) {
830 DEBUG(DEBUG_ERR, ("No interface found for IP %s.\n",
831 ctdb_addr_to_str(&vnn->public_address)));
835 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
836 ctdb_addr_to_str(&pip->addr),
837 vnn->public_netmask_bits,
838 ctdb_vnn_iface_string(vnn),
841 state = talloc(ctdb, struct takeover_callback_state);
842 CTDB_NO_MEMORY(ctdb, state);
844 state->c = talloc_steal(state, c);
845 state->addr = talloc(state, ctdb_sock_addr);
846 CTDB_NO_MEMORY(ctdb, state->addr);
847 *state->addr = pip->addr;
850 ret = ctdb_event_script_callback(ctdb,
851 state, release_ip_callback, state,
853 CTDB_EVENT_RELEASE_IP,
855 ctdb_vnn_iface_string(vnn),
856 ctdb_addr_to_str(&pip->addr),
857 vnn->public_netmask_bits);
859 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
860 ctdb_addr_to_str(&pip->addr),
861 ctdb_vnn_iface_string(vnn)));
866 /* tell the control that we will be replying asynchronously */
872 release an ip address old v4 style
874 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb,
875 struct ctdb_req_control *c,
881 data.dsize = sizeof(struct ctdb_public_ip);
882 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
883 CTDB_NO_MEMORY(ctdb, data.dptr);
885 memcpy(data.dptr, indata.dptr, indata.dsize);
886 return ctdb_control_release_ip(ctdb, c, data, async_reply);
890 static int ctdb_add_public_address(struct ctdb_context *ctdb,
891 ctdb_sock_addr *addr,
892 unsigned mask, const char *ifaces,
895 struct ctdb_vnn *vnn;
902 tmp = strdup(ifaces);
903 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
904 if (!ctdb_sys_check_iface_exists(iface)) {
905 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
912 /* Verify that we don't have an entry for this ip yet */
913 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
914 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
915 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
916 ctdb_addr_to_str(addr)));
921 /* create a new vnn structure for this ip address */
922 vnn = talloc_zero(ctdb, struct ctdb_vnn);
923 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
924 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
925 tmp = talloc_strdup(vnn, ifaces);
926 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
927 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
928 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
929 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
930 vnn->ifaces[num] = talloc_strdup(vnn, iface);
931 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
935 vnn->ifaces[num] = NULL;
936 vnn->public_address = *addr;
937 vnn->public_netmask_bits = mask;
940 if (ctdb_sys_have_ip(addr)) {
941 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
942 vnn->pnn = ctdb->pnn;
946 for (i=0; vnn->ifaces[i]; i++) {
947 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
949 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
950 "for public_address[%s]\n",
951 vnn->ifaces[i], ctdb_addr_to_str(addr)));
956 vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
960 DLIST_ADD(ctdb->vnn, vnn);
966 setup the event script directory
968 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
970 ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
971 CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
975 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te,
976 struct timeval t, void *private_data)
978 struct ctdb_context *ctdb = talloc_get_type(private_data,
979 struct ctdb_context);
980 struct ctdb_vnn *vnn;
982 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
985 for (i=0; vnn->ifaces[i] != NULL; i++) {
986 if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
987 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
989 ctdb_addr_to_str(&vnn->public_address)));
994 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
995 timeval_current_ofs(30, 0),
996 ctdb_check_interfaces_event, ctdb);
1000 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1002 if (ctdb->check_public_ifaces_ctx != NULL) {
1003 talloc_free(ctdb->check_public_ifaces_ctx);
1004 ctdb->check_public_ifaces_ctx = NULL;
1007 ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1008 if (ctdb->check_public_ifaces_ctx == NULL) {
1009 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1012 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
1013 timeval_current_ofs(30, 0),
1014 ctdb_check_interfaces_event, ctdb);
1021 setup the public address lists from a file
1023 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1029 lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1030 if (lines == NULL) {
1031 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1034 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1038 for (i=0;i<nlines;i++) {
1040 ctdb_sock_addr addr;
1041 const char *addrstr;
1046 while ((*line == ' ') || (*line == '\t')) {
1052 if (strcmp(line, "") == 0) {
1055 tok = strtok(line, " \t");
1057 tok = strtok(NULL, " \t");
1059 if (NULL == ctdb->default_public_interface) {
1060 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1065 ifaces = ctdb->default_public_interface;
1070 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1071 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1075 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1076 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1087 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1091 struct ctdb_vnn *svnn;
1092 struct ctdb_iface *cur = NULL;
1096 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1097 CTDB_NO_MEMORY(ctdb, svnn);
1099 svnn->ifaces = talloc_array(svnn, const char *, 2);
1100 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1101 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1102 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1103 svnn->ifaces[1] = NULL;
1105 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1111 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1113 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1114 "for single_ip[%s]\n",
1116 ctdb_addr_to_str(&svnn->public_address)));
1121 /* assume the single public ip interface is initially "good" */
1122 cur = ctdb_find_iface(ctdb, iface);
1124 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1127 cur->link_up = true;
1129 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1135 ctdb->single_ip_vnn = svnn;
1139 /* Given a physical node, return the number of
1140 public addresses that are currently assigned to this node.
1142 static int node_ip_coverage(struct ctdb_context *ctdb,
1144 struct ctdb_public_ip_list *ips)
1148 for (;ips;ips=ips->next) {
1149 if (ips->pnn == pnn) {
1157 /* Check if this is a public ip known to the node, i.e. can that
1158 node takeover this ip ?
1160 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn,
1161 struct ctdb_public_ip_list *ip)
1163 struct ctdb_all_public_ips *public_ips;
1166 public_ips = ctdb->nodes[pnn]->available_public_ips;
1168 if (public_ips == NULL) {
1172 for (i=0;i<public_ips->num;i++) {
1173 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1174 /* yes, this node can serve this public ip */
1183 /* search the node list for a node to take over this ip.
1184 pick the node that is currently serving the least number of ips
1185 so that the ips get spread out evenly.
1187 static int find_takeover_node(struct ctdb_context *ctdb,
1188 struct ctdb_node_map *nodemap, uint32_t mask,
1189 struct ctdb_public_ip_list *ip,
1190 struct ctdb_public_ip_list *all_ips)
1192 int pnn, min=0, num;
1196 for (i=0;i<nodemap->num;i++) {
1197 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1198 /* This node is not allowed to takeover any addresses
1203 if (nodemap->nodes[i].flags & mask) {
1204 /* This node is not healthy and can not be used to serve
1210 /* verify that this node can serve this ip */
1211 if (can_node_serve_ip(ctdb, i, ip)) {
1212 /* no it couldn't so skip to the next node */
1216 num = node_ip_coverage(ctdb, i, all_ips);
1217 /* was this the first node we checked ? */
1229 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1230 ctdb_addr_to_str(&ip->addr)));
1240 static uint32_t *ip_key(ctdb_sock_addr *ip)
1242 static uint32_t key[IP_KEYLEN];
1244 bzero(key, sizeof(key));
1246 switch (ip->sa.sa_family) {
1248 key[3] = htonl(ip->ip.sin_addr.s_addr);
1251 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1252 key[0] = htonl(s6_a32[0]);
1253 key[1] = htonl(s6_a32[1]);
1254 key[2] = htonl(s6_a32[2]);
1255 key[3] = htonl(s6_a32[3]);
1259 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1266 static void *add_ip_callback(void *parm, void *data)
1268 struct ctdb_public_ip_list *this_ip = parm;
1269 struct ctdb_public_ip_list *prev_ip = data;
1271 if (prev_ip == NULL) {
1274 if (this_ip->pnn == -1) {
1275 this_ip->pnn = prev_ip->pnn;
1281 static int getips_count_callback(void *param, void *data)
1283 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1284 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1286 new_ip->next = *ip_list;
1291 static struct ctdb_public_ip_list *
1292 create_merged_ip_list(struct ctdb_context *ctdb)
1295 struct ctdb_public_ip_list *ip_list;
1296 struct ctdb_all_public_ips *public_ips;
1298 if (ctdb->ip_tree != NULL) {
1299 talloc_free(ctdb->ip_tree);
1300 ctdb->ip_tree = NULL;
1302 ctdb->ip_tree = trbt_create(ctdb, 0);
1304 for (i=0;i<ctdb->num_nodes;i++) {
1305 public_ips = ctdb->nodes[i]->known_public_ips;
1307 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1311 /* there were no public ips for this node */
1312 if (public_ips == NULL) {
1316 for (j=0;j<public_ips->num;j++) {
1317 struct ctdb_public_ip_list *tmp_ip;
1319 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1320 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1321 tmp_ip->pnn = public_ips->ips[j].pnn;
1322 tmp_ip->addr = public_ips->ips[j].addr;
1323 tmp_ip->next = NULL;
1325 trbt_insertarray32_callback(ctdb->ip_tree,
1326 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1333 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1339 * This is the length of the longest common prefix between the IPs.
1340 * It is calculated by XOR-ing the 2 IPs together and counting the
1341 * number of leading zeroes. The implementation means that all
1342 * addresses end up being 128 bits long.
1344 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1345 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1346 * lots of nodes and IP addresses?
1348 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1350 uint32_t ip1_k[IP_KEYLEN];
1355 uint32_t distance = 0;
1357 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1359 for (i=0; i<IP_KEYLEN; i++) {
1360 x = ip1_k[i] ^ t[i];
1364 /* Count number of leading zeroes.
1365 * FIXME? This could be optimised...
1367 while ((x & (1 << 31)) == 0) {
1377 /* Calculate the IP distance for the given IP relative to IPs on the
1378 given node. The ips argument is generally the all_ips variable
1379 used in the main part of the algorithm.
1381 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1382 struct ctdb_public_ip_list *ips,
1385 struct ctdb_public_ip_list *t;
1390 for (t=ips; t != NULL; t=t->next) {
1391 if (t->pnn != pnn) {
1395 /* Optimisation: We never calculate the distance
1396 * between an address and itself. This allows us to
1397 * calculate the effect of removing an address from a
1398 * node by simply calculating the distance between
1399 * that address and all of the existing addresses.
1400 * Moreover, we assume that we're only ever dealing
1401 * with addresses from all_ips so we can identify an
1402 * address via a pointer rather than doing a more
1403 * expensive address comparison. */
1404 if (&(t->addr) == ip) {
1408 d = ip_distance(ip, &(t->addr));
1409 sum += d * d; /* Cheaper than pulling in math.h :-) */
1415 /* Return the LCP2 imbalance metric for addresses currently assigned
1418 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1420 struct ctdb_public_ip_list *t;
1422 uint32_t imbalance = 0;
1424 for (t=all_ips; t!=NULL; t=t->next) {
1425 if (t->pnn != pnn) {
1428 /* Pass the rest of the IPs rather than the whole
1431 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1437 /* Allocate any unassigned IPs just by looping through the IPs and
1438 * finding the best node for each.
1440 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1441 struct ctdb_node_map *nodemap,
1443 struct ctdb_public_ip_list *all_ips)
1445 struct ctdb_public_ip_list *tmp_ip;
1447 /* loop over all ip's and find a physical node to cover for
1450 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1451 if (tmp_ip->pnn == -1) {
1452 if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1453 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1454 ctdb_addr_to_str(&tmp_ip->addr)));
1460 /* Basic non-deterministic rebalancing algorithm.
1462 static bool basic_failback(struct ctdb_context *ctdb,
1463 struct ctdb_node_map *nodemap,
1465 struct ctdb_public_ip_list *all_ips,
1470 int maxnode, maxnum=0, minnode, minnum=0, num;
1471 struct ctdb_public_ip_list *tmp_ip;
1473 /* for each ip address, loop over all nodes that can serve
1474 this ip and make sure that the difference between the node
1475 serving the most and the node serving the least ip's are
1478 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1479 if (tmp_ip->pnn == -1) {
1483 /* Get the highest and lowest number of ips served by any
1484 valid node which can serve this ip.
1488 for (i=0;i<nodemap->num;i++) {
1489 if (nodemap->nodes[i].flags & mask) {
1493 /* Only check nodes that are allowed to takeover an ip */
1494 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1498 /* only check nodes that can actually serve this ip */
1499 if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1500 /* no it couldn't so skip to the next node */
1504 num = node_ip_coverage(ctdb, i, all_ips);
1505 if (maxnode == -1) {
1514 if (minnode == -1) {
1524 if (maxnode == -1) {
1525 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1526 ctdb_addr_to_str(&tmp_ip->addr)));
1531 /* If we want deterministic IPs then dont try to reallocate
1532 them to spread out the load.
1534 if (1 == ctdb->tunable.deterministic_public_ips) {
1538 /* if the spread between the smallest and largest coverage by
1539 a node is >=2 we steal one of the ips from the node with
1540 most coverage to even things out a bit.
1541 try to do this a limited number of times since we dont
1542 want to spend too much time balancing the ip coverage.
1544 if ( (maxnum > minnum+1)
1545 && (*retries < (num_ips + 5)) ){
1546 struct ctdb_public_ip_list *tmp;
1548 /* mark one of maxnode's vnn's as unassigned and try
1551 for (tmp=all_ips;tmp;tmp=tmp->next) {
1552 if (tmp->pnn == maxnode) {
// Singly linked list of node numbers queued for a forced rebalance.
// lcp2_forcerebalance() pushes entries onto it; lcp2_init() walks the list,
// frees each entry and leaves the list empty.
// NOTE(review): both users access a `pnn` member that is declared on a line
// this listing omits.
1564 struct ctdb_rebalancenodes {
1565 struct ctdb_rebalancenodes *next;
// Head of the pending list; NULL when nothing is queued.
1568 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1571 /* set this flag to force the node to be rebalanced even if it just didnt
1572 become healthy again.
1574 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1576 struct ctdb_rebalancenodes *rebalance;
1578 for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1579 if (rebalance->pnn == pnn) {
1584 rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1585 rebalance->pnn = pnn;
1586 rebalance->next = force_rebalance_list;
1587 force_rebalance_list = rebalance;
1590 /* Do necessary LCP2 initialisation. Bury it in a function here so
1591 * that we can unit test it.
1593 static void lcp2_init(struct ctdb_context * tmp_ctx,
1594 struct ctdb_node_map * nodemap,
1596 struct ctdb_public_ip_list *all_ips,
1597 uint32_t **lcp2_imbalances,
1598 bool **newly_healthy)
1601 struct ctdb_public_ip_list *tmp_ip;
1603 *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1604 CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1605 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1606 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1608 for (i=0;i<nodemap->num;i++) {
1609 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1610 /* First step: is the node "healthy"? */
1611 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1614 /* 2nd step: if a ndoe has IPs assigned then it must have been
1615 * healthy before, so we remove it from consideration... */
1616 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1617 if (tmp_ip->pnn != -1) {
1618 (*newly_healthy)[tmp_ip->pnn] = false;
1622 /* 3rd step: if a node is forced to re-balance then
1623 we allow failback onto the node */
1624 while (force_rebalance_list != NULL) {
1625 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1627 if (force_rebalance_list->pnn <= nodemap->num) {
1628 (*newly_healthy)[force_rebalance_list->pnn] = true;
1631 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1632 talloc_free(force_rebalance_list);
1633 force_rebalance_list = next;
// lcp2_allocate_unassigned(): loops while have_unassigned and should_loop
// are both true. Each pass examines every unassigned address against every
// eligible destination node, computes dstdsum with ip_distance_2_sum(), and
// keeps the candidate with the smallest dstdsum. If a candidate was found the
// address is assigned to it and that node's entry in lcp2_imbalances is
// updated. Addresses still unassigned when the loop ends are logged.
// NOTE(review): `mask`, `dstnode` and `minnode` are used below but declared
// on lines this listing omits; should_loop is cleared at the top of each
// pass and the place where it is set again is not visible.
1637 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1638 * the IP/node combination that will cost the least.
1640 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1641 struct ctdb_node_map *nodemap,
1643 struct ctdb_public_ip_list *all_ips,
1644 uint32_t *lcp2_imbalances)
1646 struct ctdb_public_ip_list *tmp_ip;
1650 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1651 struct ctdb_public_ip_list *minip;
1653 bool should_loop = true;
1654 bool have_unassigned = true;
1656 while (have_unassigned && should_loop) {
1657 should_loop = false;
1659 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1660 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1666 /* loop over each unassigned ip. */
1667 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1668 if (tmp_ip->pnn != -1) {
// Destination filters: NODE_FLAGS_NOIPTAKEOVER, a non-zero result from
// can_node_serve_ip(), and node flags matching mask.
1672 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1673 /* Only check nodes that are allowed to takeover an ip */
1674 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1678 /* only check nodes that can actually serve this ip */
1679 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1680 /* no it couldnt so skip to the next node */
1683 if (nodemap->nodes[dstnode].flags & mask) {
// dstimbl is the node's current imbalance plus the cost of adding this address.
1687 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1688 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1689 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1690 ctdb_addr_to_str(&(tmp_ip->addr)),
1692 dstimbl - lcp2_imbalances[dstnode]));
// The first candidate, or one with a strictly smaller dstdsum, becomes the best so far.
1695 if ((minnode == -1) || (dstdsum < mindsum)) {
1705 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1707 /* If we found one then assign it to the given node. */
1708 if (minnode != -1) {
1709 minip->pnn = minnode;
1710 lcp2_imbalances[minnode] = minimbl;
1711 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1712 ctdb_addr_to_str(&(minip->addr)),
1717 /* There might be a better way but at least this is clear. */
// Recompute have_unassigned by rescanning the whole list.
1718 have_unassigned = false;
1719 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1720 if (tmp_ip->pnn == -1) {
1721 have_unassigned = true;
1726 /* We know if we have an unassigned addresses so we might as
// Every address that is still unassigned gets one warning.
1729 if (have_unassigned) {
1730 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1731 if (tmp_ip->pnn == -1) {
1732 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1733 ctdb_addr_to_str(&tmp_ip->addr)));
1739 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1740 * to move IPs from, determines the best IP/destination node
1741 * combination to move from the source node.
1743 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1744 struct ctdb_node_map *nodemap,
1745 struct ctdb_public_ip_list *all_ips,
1748 uint32_t *lcp2_imbalances,
1749 bool *newly_healthy)
1751 int dstnode, mindstnode;
1752 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1753 uint32_t minsrcimbl, mindstimbl;
1754 struct ctdb_public_ip_list *minip;
1755 struct ctdb_public_ip_list *tmp_ip;
1757 /* Find an IP and destination node that best reduces imbalance. */
1763 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1764 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1766 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1767 /* Only consider addresses on srcnode. */
1768 if (tmp_ip->pnn != srcnode) {
1772 /* What is this IP address costing the source node? */
1773 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1774 srcimbl = candimbl - srcdsum;
1776 /* Consider this IP address would cost each potential
1777 * destination node. Destination nodes are limited to
1778 * those that are newly healthy, since we don't want
1779 * to do gratuitous failover of IPs just to make minor
1780 * balance improvements.
1782 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1783 if (! newly_healthy[dstnode]) {
1787 /* Only check nodes that are allowed to takeover an ip */
1788 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1792 /* only check nodes that can actually serve this ip */
1793 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1794 /* no it couldnt so skip to the next node */
1798 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1799 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1800 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1801 srcnode, srcimbl - lcp2_imbalances[srcnode],
1802 ctdb_addr_to_str(&(tmp_ip->addr)),
1803 dstnode, dstimbl - lcp2_imbalances[dstnode]));
1805 if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1806 ((mindstnode == -1) || \
1807 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1810 minsrcimbl = srcimbl;
1811 mindstnode = dstnode;
1812 mindstimbl = dstimbl;
1816 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1818 if (mindstnode != -1) {
1819 /* We found a move that makes things better... */
1820 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1821 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1822 ctdb_addr_to_str(&(minip->addr)),
1823 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1826 lcp2_imbalances[srcnode] = srcimbl;
1827 lcp2_imbalances[mindstnode] = mindstimbl;
1828 minip->pnn = mindstnode;
// Pairing of a node with its imbalance value, used as the element type of
// the array that lcp2_failback() sorts.
// NOTE(review): the member declarations are on lines this listing omits; the
// code in this file reads an `imbalance` member.
1837 struct lcp2_imbalance_pnn {
// qsort() comparator over struct lcp2_imbalance_pnn, ordering by the
// imbalance member. The values returned by each branch are on omitted lines,
// so the sort direction cannot be confirmed from this view.
1842 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1844 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1845 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1847 if (lipa->imbalance > lipb->imbalance) {
1849 } else if (lipa->imbalance == lipb->imbalance) {
// lcp2_failback(): counts the nodes flagged in newly_healthy and tests for a
// count of zero. Otherwise it copies the per-node imbalances into a
// temporary array, sorts it with lcp2_cmp_imbalance_pnn() and offers entries
// to lcp2_failback_candidate() in sorted order.
// Returns bool; the return statements are on lines this listing omits.
1856 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1857 * node with the highest LCP2 imbalance, and then determines the best
1858 * IP/destination node combination to move from the source node.
1860 static bool lcp2_failback(struct ctdb_context *ctdb,
1861 struct ctdb_node_map *nodemap,
1863 struct ctdb_public_ip_list *all_ips,
1864 uint32_t *lcp2_imbalances,
1865 bool *newly_healthy)
1867 int i, num_newly_healthy;
1868 struct lcp2_imbalance_pnn * lips;
1871 /* It is only worth continuing if we have suitable target
1872 * nodes to transfer IPs to. This check is much cheaper than
1875 num_newly_healthy = 0;
1876 for (i = 0; i < nodemap->num; i++) {
1877 if (newly_healthy[i]) {
1878 num_newly_healthy++;
1881 if (num_newly_healthy == 0) {
1885 /* Put the imbalances and nodes into an array, sort them and
1886 * iterate through candidates. Usually the 1st one will be
1887 * used, so this doesn't cost much...
// NOTE(review): the array is parented on ctdb and no NULL check is visible
// before lips[i] is written; whether it is freed is not visible either.
1889 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
1890 for (i = 0; i < nodemap->num; i++) {
1891 lips[i].imbalance = lcp2_imbalances[i];
1894 qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
1895 lcp2_cmp_imbalance_pnn);
1898 for (i = 0; i < nodemap->num; i++) {
1899 /* This means that all nodes had 0 or 1 addresses, so
1900 * can't be imbalanced.
1902 if (lips[i].imbalance == 0) {
1906 if (lcp2_failback_candidate(ctdb,
// ctdb_takeover_run_core(): decides which node (pnn) should own each public
// address. It builds the merged address list with create_merged_ip_list(),
// hands that list back through *all_ips_p, and then adjusts the pnn fields
// using either the LCP2 helpers or the basic helpers, depending on
// ctdb->tunable.lcp2_public_ip_assignment.
// NOTE(review): `mask` is used below but declared on a line this listing
// omits, and many branch bodies are likewise not visible.
1922 /* The calculation part of the IP allocation algorithm. */
1923 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1924 struct ctdb_node_map *nodemap,
1925 struct ctdb_public_ip_list **all_ips_p)
1927 int i, num_healthy, retries, num_ips;
1929 struct ctdb_public_ip_list *all_ips, *tmp_ip;
1930 uint32_t *lcp2_imbalances;
1931 bool *newly_healthy;
// Scratch context; lcp2_init() allocates its arrays under it.
1933 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1935 /* Count how many completely healthy nodes we have */
1937 for (i=0;i<nodemap->num;i++) {
1938 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
// mask selects the flags that exclude a node: inactive or disabled when a
// healthy node exists, inactive only otherwise.
1943 if (num_healthy > 0) {
1944 /* We have healthy nodes, so only consider them for
1945 serving public addresses
1947 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1949 /* We didnt have any completely healthy nodes so
1950 use "disabled" nodes as a fallback
1952 mask = NODE_FLAGS_INACTIVE;
1955 /* since nodes only know about those public addresses that
1956 can be served by that particular node, no single node has
1957 a full list of all public addresses that exist in the cluster.
1958 Walk over all node structures and create a merged list of
1959 all public addresses that exist in the cluster.
1961 keep the tree of ips around as ctdb->ip_tree
1963 all_ips = create_merged_ip_list(ctdb);
1964 *all_ips_p = all_ips; /* minimal code changes */
1966 /* Count how many ips we have */
1968 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1972 /* If we want deterministic ip allocations, i.e. that the ip addresses
1973 will always be allocated the same way for a specific set of
1974 available/unavailable nodes.
// Deterministic mode assigns the address at list position i to node i modulo
// nodemap->num.
1976 if (1 == ctdb->tunable.deterministic_public_ips) {
1977 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1978 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1979 tmp_ip->pnn = i%nodemap->num;
1984 /* mark all public addresses with a masked node as being served by
// Tests whether the current owner carries a masked flag; the action taken is
// on an omitted line.
1987 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1988 if (tmp_ip->pnn == -1) {
1991 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1996 /* verify that the assigned nodes can serve that public ip
1997 and set it to -1 if not
1999 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2000 if (tmp_ip->pnn == -1) {
2003 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2004 /* this node can not serve this ip. */
// LCP2 state is built once, before allocation and failback.
2009 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2010 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2013 /* now we must redistribute all public addresses with takeover node
2014 -1 among the nodes available
2018 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2019 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2021 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2024 /* If we dont want ips to fail back after a node becomes healthy
2025 again, we wont even try to reallocat the ip addresses so that
2026 they are evenly spread out.
2027 This can NOT be used at the same time as DeterministicIPs !
2029 if (1 == ctdb->tunable.no_ip_failback) {
2030 if (1 == ctdb->tunable.deterministic_public_ips) {
2031 DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
2037 /* now, try to make sure the ip adresses are evenly distributed
// What happens when either failback helper returns true is on omitted lines.
2040 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2041 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
2045 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
2050 /* finished distributing the public addresses, now just send the
2051 info out to the nodes */
2053 /* at this point ->pnn is the node which will own each IP
2054 or -1 if there is no node that can cover this ip
// noiptakeover_cb(): per-node reply handler that ctdb_takeover_run() passes
// to ctdb_client_async_control() for its CTDB_CONTROL_GET_TUNABLE query.
//   pnn      - node the reply came from
//   res      - result code of the control (its test is on an omitted line)
//   outdata  - reply payload, expected to hold exactly one uint32_t
//   callback - opaque pointer, interpreted here as the struct ctdb_node_map
// A non-zero payload value sets NODE_FLAGS_NOIPTAKEOVER on that node's entry.
2060 static void noiptakeover_cb(struct ctdb_context *ctdb, uint32_t pnn, int32_t res, TDB_DATA outdata, void *callback)
2062 struct ctdb_node_map *nodemap = (struct ctdb_node_map *)callback;
2065 DEBUG(DEBUG_ERR,("Failure to read NoIPTakeover tunable from remote node %d\n", pnn));
// Payload size is validated before outdata.dptr is dereferenced below.
2069 if (outdata.dsize != sizeof(uint32_t)) {
2070 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading NoIPTakeover tunable from node %d. Expected %d bytes but received %d bytes\n", pnn, (int)sizeof(uint32_t), (int)outdata.dsize));
// pnn is range-checked before it is used as an index into nodemap->nodes.
2074 if (pnn >= nodemap->num) {
2075 DEBUG(DEBUG_ERR,("Got NoIPTakeover reply from node %d but nodemap only has %d entries\n", pnn, nodemap->num));
2079 if (*(uint32_t *)outdata.dptr != 0) {
2080 nodemap->nodes[pnn].flags |= NODE_FLAGS_NOIPTAKEOVER;
// ctdb_takeover_run(): top-level driver of an IP reallocation. Visible steps:
//   1. clear NODE_FLAGS_NOIPTAKEOVER on every node, then query the
//      "NoIPTakeover" tunable on the connected nodes (noiptakeover_cb
//      handles the replies);
//   2. run ctdb_takeover_run_core() to compute the new owners;
//   3. send a release control for each address to every connected node that
//      is not its new owner, and wait for those controls;
//   4. send a takeover control for each assigned address to its new owner,
//      and wait for those controls;
//   5. run the "ipreallocated" event script on the connected nodes.
// Returns int; the return statements are on lines this listing omits.
// NOTE(review): `i`, `data` and `nodes` are used below but declared on
// omitted lines.
2085 make any IP alias changes for public addresses that are necessary
2087 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2090 struct ctdb_public_ip ip;
2091 struct ctdb_public_ipv4 ipv4;
2092 struct ctdb_control_get_tunable *t;
2094 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2096 struct timeval timeout;
2097 struct client_async_data *async_data;
2098 struct ctdb_client_control_state *state;
// Scratch context for this run; the visible error paths free it.
2099 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2102 * ip failover is completely disabled, just send out the
2103 * ipreallocated event.
2105 if (ctdb->tunable.disable_ip_failover != 0) {
2110 /* assume all nodes do support failback */
2111 for (i=0;i<nodemap->num;i++) {
2112 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
// Build the GET_TUNABLE request: fixed header followed by the NUL-terminated
// tunable name.
// NOTE(review): no NULL check on the talloc_size() result is visible before
// t->length is written.
2114 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen("NoIPTakeover") + 1;
2115 data.dptr = talloc_size(tmp_ctx, data.dsize);
2116 t = (struct ctdb_control_get_tunable *)data.dptr;
2117 t->length = strlen("NoIPTakeover")+1;
2118 memcpy(t->name, "NoIPTakeover", t->length);
2119 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2120 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2121 nodes, 0, TAKEOVER_TIMEOUT(),
2123 noiptakeover_cb, NULL,
2125 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get noiptakeover tunable failed\n"));
2128 talloc_free(data.dptr);
2133 /* Do the IP reassignment calculations */
2134 ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2136 /* now tell all nodes to delete any alias that they should not
2137 have. This will be a NOOP on nodes that don't currently
2138 hold the given alias */
2139 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2140 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2142 for (i=0;i<nodemap->num;i++) {
2143 /* don't talk to unconnected nodes, but do talk to banned nodes */
2144 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2148 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2149 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2150 /* This node should be serving this
2151 vnn so dont tell it to release the ip
// AF_INET addresses use the IPv4-only control and payload; the other branch
// uses the generic one.
2155 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2156 ipv4.pnn = tmp_ip->pnn;
2157 ipv4.sin = tmp_ip->addr.ip;
2159 timeout = TAKEOVER_TIMEOUT();
2160 data.dsize = sizeof(ipv4);
2161 data.dptr = (uint8_t *)&ipv4;
2162 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2163 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2167 ip.pnn = tmp_ip->pnn;
2168 ip.addr = tmp_ip->addr;
2170 timeout = TAKEOVER_TIMEOUT();
2171 data.dsize = sizeof(ip);
2172 data.dptr = (uint8_t *)&ip;
2173 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2174 0, CTDB_CONTROL_RELEASE_IP, 0,
2179 if (state == NULL) {
2180 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2181 talloc_free(tmp_ctx);
2185 ctdb_client_async_add(async_data, state);
// Wait for all queued release controls before starting takeovers.
2188 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2189 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2190 talloc_free(tmp_ctx);
2193 talloc_free(async_data);
2196 /* tell all nodes to get their own IPs */
2197 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2198 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2199 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2200 if (tmp_ip->pnn == -1) {
2201 /* this IP won't be taken over */
2205 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2206 ipv4.pnn = tmp_ip->pnn;
2207 ipv4.sin = tmp_ip->addr.ip;
2209 timeout = TAKEOVER_TIMEOUT();
2210 data.dsize = sizeof(ipv4);
2211 data.dptr = (uint8_t *)&ipv4;
2212 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2213 0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2217 ip.pnn = tmp_ip->pnn;
2218 ip.addr = tmp_ip->addr;
2220 timeout = TAKEOVER_TIMEOUT();
2221 data.dsize = sizeof(ip);
2222 data.dptr = (uint8_t *)&ip;
2223 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2224 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2228 if (state == NULL) {
2229 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2230 talloc_free(tmp_ctx);
2234 ctdb_client_async_add(async_data, state);
2236 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2237 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2238 talloc_free(tmp_ctx);
2243 /* tell all nodes to update natwg */
2244 /* send the flags update natgw on all connected nodes */
// The payload is the event name including its terminating NUL.
2245 data.dptr = discard_const("ipreallocated");
2246 data.dsize = strlen((char *)data.dptr) + 1;
2247 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2248 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2249 nodes, 0, TAKEOVER_TIMEOUT(),
2253 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2256 talloc_free(tmp_ctx);
// ctdb_client_ip_destructor(): talloc destructor that
// ctdb_control_tcp_client() installs on each struct ctdb_client_ip. It logs
// the address, port and client id, then unlinks the record from
// ctdb->client_ip_list. The return value is on a line this listing omits.
2262 destroy a ctdb_client_ip structure
2264 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2266 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2267 ctdb_addr_to_str(&ip->addr),
2268 ntohs(ip->addr.ip.sin_port),
2271 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
// ctdb_control_tcp_client(): registers a TCP connection reported by a local
// client. Visible steps: decode the payload (two accepted sizes),
// canonicalize both endpoints, look up the destination with
// find_public_ip_vnn(), reject addresses this node does not hold, record a
// ctdb_client_ip and a ctdb_tcp_list entry parented on the client, and
// broadcast CTDB_CONTROL_TCP_ADD to the connected nodes.
// Returns int32_t; the return statements are on lines this listing omits.
// NOTE(review): `indata`, `data` and `ret` are used below but declared on
// omitted lines. No NULL check on `client` is visible before client->pid is
// read; confirm against the full source.
2276 called by a client to inform us of a TCP connection that it is managing
2277 that should tickled with an ACK when IP takeover is done
2278 we handle both the old ipv4 style of packets as well as the new ipv4/6
2281 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2284 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2285 struct ctdb_control_tcp *old_addr = NULL;
2286 struct ctdb_control_tcp_addr new_addr;
2287 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2288 struct ctdb_tcp_list *tcp;
2289 struct ctdb_tcp_connection t;
2292 struct ctdb_client_ip *ip;
2293 struct ctdb_vnn *vnn;
2294 ctdb_sock_addr addr;
// The payload size selects the wire format: the older struct ctdb_control_tcp
// is converted into a zeroed struct ctdb_control_tcp_addr, the newer struct
// is used in place.
2296 switch (indata.dsize) {
2297 case sizeof(struct ctdb_control_tcp):
2298 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2299 ZERO_STRUCT(new_addr);
2300 tcp_sock = &new_addr;
2301 tcp_sock->src.ip = old_addr->src;
2302 tcp_sock->dest.ip = old_addr->dest;
2304 case sizeof(struct ctdb_control_tcp_addr):
2305 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2308 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2309 "to ctdb_control_tcp_client. size was %d but "
2310 "only allowed sizes are %lu and %lu\n",
2312 (long unsigned)sizeof(struct ctdb_control_tcp),
2313 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
// Each endpoint is copied to a temporary and canonicalized back in place.
2317 addr = tcp_sock->src;
2318 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2319 addr = tcp_sock->dest;
2320 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2323 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2324 vnn = find_public_ip_vnn(ctdb, &addr);
// Diagnostics by address family; the condition guarding this switch is on an
// omitted line (presumably the lookup above failing). Loopback IPv4
// destinations are not logged.
2326 switch (addr.sa.sa_family) {
2328 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2329 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2330 ctdb_addr_to_str(&addr)));
2334 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2335 ctdb_addr_to_str(&addr)));
2338 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
// The address must currently be held by this node.
2344 if (vnn->pnn != ctdb->pnn) {
2345 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2346 ctdb_addr_to_str(&addr),
2347 client_id, client->pid));
2348 /* failing this call will tell smbd to die */
// Both records are talloc children of the client. The destructor removes the
// ip record from ctdb->client_ip_list when it is freed.
2352 ip = talloc(client, struct ctdb_client_ip);
2353 CTDB_NO_MEMORY(ctdb, ip);
2357 ip->client_id = client_id;
2358 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2359 DLIST_ADD(ctdb->client_ip_list, ip);
2361 tcp = talloc(client, struct ctdb_tcp_list);
2362 CTDB_NO_MEMORY(ctdb, tcp);
2364 tcp->connection.src_addr = tcp_sock->src;
2365 tcp->connection.dst_addr = tcp_sock->dest;
2367 DLIST_ADD(client->tcp_list, tcp);
// t is the payload broadcast to the other nodes below.
2369 t.src_addr = tcp_sock->src;
2370 t.dst_addr = tcp_sock->dest;
2372 data.dptr = (uint8_t *)&t;
2373 data.dsize = sizeof(t);
// NOTE(review): both log calls print the destination port, then the source
// address and source port; this looks swapped, confirm the intent.
2375 switch (addr.sa.sa_family) {
2377 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2378 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
2379 ctdb_addr_to_str(&tcp_sock->src),
2380 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2383 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2384 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
2385 ctdb_addr_to_str(&tcp_sock->src),
2386 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2389 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2393 /* tell all nodes about this tcp connection */
2394 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2395 CTDB_CONTROL_TCP_ADD,
2396 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2398 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
// ctdb_tcp_find(): linear search of array->connections for an entry whose
// source and destination both match tcp according to ctdb_same_sockaddr().
// Returns a pointer to the matching element inside the array. What is
// returned for a NULL array or for no match is on omitted lines; callers in
// this file test the result against NULL.
2406 find a tcp address on a list
2408 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2409 struct ctdb_tcp_connection *tcp)
2413 if (array == NULL) {
2417 for (i=0;i<array->num;i++) {
2418 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2419 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2420 return &array->connections[i];
// ctdb_control_tcp_add(): records the connection carried in indata in the
// tcp_array of the public address it targets.
//   indata            - payload, interpreted as a struct ctdb_tcp_connection
//   tcp_update_needed - when true, vnn->tcp_update_needed is set after the
//                       connection has been stored
// The first connection for an address creates the array; later ones are
// appended unless ctdb_tcp_find() already knows them.
// Returns int32_t; the return statements are on lines this listing omits.
// NOTE(review): no check of indata.dsize against the struct size is visible
// before the payload is dereferenced.
2429 called by a daemon to inform us of a TCP connection that one of its
2430 clients managing that should tickled with an ACK when IP takeover is
2433 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2435 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2436 struct ctdb_tcp_array *tcparray;
2437 struct ctdb_tcp_connection tcp;
2438 struct ctdb_vnn *vnn;
2440 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2442 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2443 ctdb_addr_to_str(&p->dst_addr)));
2449 tcparray = vnn->tcp_array;
2451 /* If this is the first tickle */
// The array header is parented on ctdb->nodes and the connection storage on
// the header, so freeing the header frees both.
2452 if (tcparray == NULL) {
2453 tcparray = talloc_size(ctdb->nodes,
2454 offsetof(struct ctdb_tcp_array, connections) +
2455 sizeof(struct ctdb_tcp_connection) * 1);
2456 CTDB_NO_MEMORY(ctdb, tcparray);
2457 vnn->tcp_array = tcparray;
2460 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2461 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2463 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2464 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2467 if (tcp_update_needed) {
2468 vnn->tcp_update_needed = true;
2474 /* Do we already have this tickle ?*/
2475 tcp.src_addr = p->src_addr;
2476 tcp.dst_addr = p->dst_addr;
2477 if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2478 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2479 ctdb_addr_to_str(&tcp.dst_addr),
2480 ntohs(tcp.dst_addr.ip.sin_port),
2485 /* A new tickle, we must add it to the array */
// NOTE(review): the realloc result is stored straight into
// tcparray->connections, so a failed realloc replaces the only pointer to
// the existing entries with NULL before CTDB_NO_MEMORY runs.
2486 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2487 struct ctdb_tcp_connection,
2489 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2491 vnn->tcp_array = tcparray;
2492 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2493 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2496 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2497 ctdb_addr_to_str(&tcp.dst_addr),
2498 ntohs(tcp.dst_addr.ip.sin_port),
2501 if (tcp_update_needed) {
2502 vnn->tcp_update_needed = true;
// ctdb_remove_tcp_connection(): removes conn from the tcp_array of the public
// address it targets. The entry is overwritten with the last element and the
// count is decremented; an array that becomes empty is freed and the pointer
// reset to NULL. vnn->tcp_update_needed is set after a removal.
// The conditions guarding the first and third log calls are on lines this
// listing omits (presumably NULL tests on vnn and tcpp).
2510 called by a daemon to inform us of a TCP connection that one of its
2511 clients managing that should tickled with an ACK when IP takeover is
2514 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2516 struct ctdb_tcp_connection *tcpp;
2517 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2520 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2521 ctdb_addr_to_str(&conn->dst_addr)));
2525 /* if the array is empty we cant remove it
2526 and we dont need to do anything
2528 if (vnn->tcp_array == NULL) {
2529 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2530 ctdb_addr_to_str(&conn->dst_addr),
2531 ntohs(conn->dst_addr.ip.sin_port)));
2536 /* See if we know this connection
2537 if we dont know this connection then we dont need to do anything
2539 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2541 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2542 ctdb_addr_to_str(&conn->dst_addr),
2543 ntohs(conn->dst_addr.ip.sin_port)));
2548 /* We need to remove this entry from the array.
2549 Instead of allocating a new array and copying data to it
2550 we cheat and just copy the last entry in the existing array
2551 to the entry that is to be removed and just shring the
// Swap-with-last removal: element order in the array is not preserved.
2554 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2555 vnn->tcp_array->num--;
2557 /* If we deleted the last entry we also need to remove the entire array
2559 if (vnn->tcp_array->num == 0) {
2560 talloc_free(vnn->tcp_array);
2561 vnn->tcp_array = NULL;
2564 vnn->tcp_update_needed = true;
2566 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2567 ctdb_addr_to_str(&conn->src_addr),
2568 ntohs(conn->src_addr.ip.sin_port)));
// ctdb_control_tcp_remove(): control entry point that interprets indata as a
// struct ctdb_tcp_connection and passes it to ctdb_remove_tcp_connection().
// The return statement is on a line this listing omits.
// NOTE(review): no check of indata.dsize is visible before the cast.
2573 called by a daemon to inform us of a TCP connection that one of its
2574 clients used are no longer needed in the tickle database
2576 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2578 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2580 ctdb_remove_tcp_connection(ctdb, conn);
// ctdb_control_startup(): handler for the control described below. The only
// visible body line is a to-do comment saying the tickles should be sent to
// the new node; whatever statements the body contains are on lines this
// listing omits.
2587 called when a daemon restarts - send all tickes for all public addresses
2588 we are serving immediately to the new node.
2590 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
2592 /*XXX here we should send all tickes we are serving to the new node */
// ctdb_takeover_client_destructor_hook(): drains client->tcp_list. Each entry
// is unlinked from the list and its connection is handed to
// ctdb_remove_tcp_connection(), using the context stored in client->ctdb.
2598 called when a client structure goes away - hook to remove
2599 elements from the tcp_list in all daemons
2601 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
// Always takes the current list head, so the loop ends once the list is empty.
2603 while (client->tcp_list) {
2604 struct ctdb_tcp_list *tcp = client->tcp_list;
2605 DLIST_REMOVE(client->tcp_list, tcp);
2606 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
// ctdb_release_all_ips(): walks ctdb->vnn. For an address that
// ctdb_sys_have_ip() reports as not present, only the interface assignment is
// dropped. The later statements run the CTDB_EVENT_RELEASE_IP event script
// with "<iface> <address> <netmask bits>", call release_kill_clients() for the
// address and unassign the interface.
// NOTE(review): the lines between the two groups of statements are omitted
// from this listing, so the exact conditions separating them are not visible.
2612 release all IPs on shutdown
2614 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2616 struct ctdb_vnn *vnn;
2618 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2619 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2620 ctdb_vnn_unassign_iface(ctdb, vnn);
2626 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2627 ctdb_vnn_iface_string(vnn),
2628 ctdb_addr_to_str(&vnn->public_address),
2629 vnn->public_netmask_bits);
2630 release_kill_clients(ctdb, &vnn->public_address);
2631 ctdb_vnn_unassign_iface(ctdb, vnn);
// ctdb_control_get_public_ips(): builds a struct ctdb_all_public_ips reply
// listing this node's public addresses and their current pnn.
//   c       - request; the CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE flag restricts
//             the reply to addresses for which ctdb_vnn_available() is true
//   outdata - receives the reply buffer (allocated as a talloc child of
//             outdata) and its length
// Returns int32_t; the return statement is on a line this listing omits.
// NOTE(review): `i`, `num` and `len` are used below but declared on omitted
// lines.
2637 get list of public IPs
2639 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
2640 struct ctdb_req_control *c, TDB_DATA *outdata)
2643 struct ctdb_all_public_ips *ips;
2644 struct ctdb_vnn *vnn;
2645 bool only_available = false;
2647 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2648 only_available = true;
2651 /* count how many public ip structures we have */
2653 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
// The buffer is sized from num; the length reported to the caller is
// recomputed below from i, the number of entries actually filled.
2657 len = offsetof(struct ctdb_all_public_ips, ips) +
2658 num*sizeof(struct ctdb_public_ip);
2659 ips = talloc_zero_size(outdata, len);
2660 CTDB_NO_MEMORY(ctdb, ips);
2663 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2664 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2667 ips->ips[i].pnn = vnn->pnn;
2668 ips->ips[i].addr = vnn->public_address;
2672 len = offsetof(struct ctdb_all_public_ips, ips) +
2673 i*sizeof(struct ctdb_public_ip);
2675 outdata->dsize = len;
2676 outdata->dptr = (uint8_t *)ips;
// ctdb_control_get_public_ipsv4(): older reply format; builds a
// struct ctdb_all_public_ipsv4 from the public addresses. Both loops test for
// a family other than AF_INET (the branch bodies are on omitted lines).
//   outdata - receives the reply buffer (a talloc child of outdata) and its
//             length; both are set before the entries are filled in
// Returns int32_t; the return statement is on a line this listing omits.
// NOTE(review): `i`, `num` and `len` are used below but declared on omitted
// lines.
2683 get list of public IPs, old ipv4 style. only returns ipv4 addresses
2685 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb,
2686 struct ctdb_req_control *c, TDB_DATA *outdata)
2689 struct ctdb_all_public_ipsv4 *ips;
2690 struct ctdb_vnn *vnn;
2692 /* count how many public ip structures we have */
2694 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2695 if (vnn->public_address.sa.sa_family != AF_INET) {
2701 len = offsetof(struct ctdb_all_public_ipsv4, ips) +
2702 num*sizeof(struct ctdb_public_ipv4);
2703 ips = talloc_zero_size(outdata, len);
2704 CTDB_NO_MEMORY(ctdb, ips);
2706 outdata->dsize = len;
2707 outdata->dptr = (uint8_t *)ips;
2711 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2712 if (vnn->public_address.sa.sa_family != AF_INET) {
2715 ips->ips[i].pnn = vnn->pnn;
2716 ips->ips[i].sin = vnn->public_address.ip;
// ctdb_control_get_public_ip_info(): builds a
// struct ctdb_control_public_ip_info reply for the address carried in indata:
// the address and its pnn, one ctdb_control_iface_info per interface listed
// in vnn->ifaces, and active_idx naming the interface currently in use
// (0xFFFFFFFF when none of them matches vnn->iface).
// The address is looked up with find_public_ip_vnn(), with
// ctdb->single_ip_vnn as a fallback.
// Returns int32_t; the return statements are on lines this listing omits.
// NOTE(review): `indata`, `outdata`, `i`, `num` and `len` are used below but
// declared on omitted lines; no check of indata.dsize is visible before the
// cast.
2723 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2724 struct ctdb_req_control *c,
2729 ctdb_sock_addr *addr;
2730 struct ctdb_control_public_ip_info *info;
2731 struct ctdb_vnn *vnn;
2733 addr = (ctdb_sock_addr *)indata.dptr;
2735 vnn = find_public_ip_vnn(ctdb, addr);
2737 /* if it is not a public ip it could be our 'single ip' */
2738 if (ctdb->single_ip_vnn) {
2739 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2740 vnn = ctdb->single_ip_vnn;
2745 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2746 "'%s'not a public address\n",
2747 ctdb_addr_to_str(addr)));
2751 /* count how many public ip structures we have */
// vnn->ifaces is walked until a NULL entry, here and in the fill loop below.
2753 for (;vnn->ifaces[num];) {
2757 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2758 num*sizeof(struct ctdb_control_iface_info);
2759 info = talloc_zero_size(outdata, len);
2760 CTDB_NO_MEMORY(ctdb, info);
2762 info->ip.addr = vnn->public_address;
2763 info->ip.pnn = vnn->pnn;
2764 info->active_idx = 0xFFFFFFFF;
2766 for (i=0; vnn->ifaces[i]; i++) {
2767 struct ctdb_iface *cur;
2769 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2771 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2775 if (vnn->iface == cur) {
2776 info->active_idx = i;
// NOTE(review): strcpy() does not bound the copy. If cur->name can be longer
// than the name field of struct ctdb_control_iface_info this overflows the
// reply buffer; confirm where interface name length is limited.
2778 strcpy(info->ifaces[i].name, cur->name);
2779 info->ifaces[i].link_state = cur->link_up;
2780 info->ifaces[i].references = cur->references;
2783 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2784 i*sizeof(struct ctdb_control_iface_info);
2786 outdata->dsize = len;
2787 outdata->dptr = (uint8_t *)info;
// Control handler returning one ctdb_control_iface_info (name, link state,
// reference count) for every entry in the ctdb->ifaces list.
// The reply is talloc'ed on outdata and returned via outdata->dptr/dsize.
2792 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2793 struct ctdb_req_control *c,
2797 struct ctdb_control_get_ifaces *ifaces;
2798 struct ctdb_iface *cur;
// first walk over the interface list, feeding the size computation below
2800 /* count how many public ip structures we have */
2802 for (cur=ctdb->ifaces;cur;cur=cur->next) {
2806 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2807 num*sizeof(struct ctdb_control_iface_info);
2808 ifaces = talloc_zero_size(outdata, len);
2809 CTDB_NO_MEMORY(ctdb, ifaces);
// second walk: copy each interface into the reply array
2812 for (cur=ctdb->ifaces;cur;cur=cur->next) {
// NOTE(review): strcpy is unbounded; confirm cur->name always fits the
// destination name field
2813 strcpy(ifaces->ifaces[i].name, cur->name);
2814 ifaces->ifaces[i].link_state = cur->link_up;
2815 ifaces->ifaces[i].references = cur->references;
// the length is recomputed from the number of entries actually written
2819 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2820 i*sizeof(struct ctdb_control_iface_info);
2822 outdata->dsize = len;
2823 outdata->dptr = (uint8_t *)ifaces;
// Control handler that updates the recorded link state of one interface.
// indata.dptr is interpreted as a ctdb_control_iface_info. The request is
// validated (name termination, link_state value, references == 0), the
// interface is looked up by name, and iface->link_up is updated and logged
// when the new state differs from the stored one.
// NOTE(review): no check of indata.dsize is visible before info is
// dereferenced; confirm the control dispatcher validates the size.
2828 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2829 struct ctdb_req_control *c,
2832 struct ctdb_control_iface_info *info;
2833 struct ctdb_iface *iface;
2834 bool link_up = false;
2836 info = (struct ctdb_control_iface_info *)indata.dptr;
// the byte at index CTDB_IFACE_SIZE must be NUL; presumably the name field
// is declared larger than CTDB_IFACE_SIZE, verify against the struct
2838 if (info->name[CTDB_IFACE_SIZE] != '\0') {
2839 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2840 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2841 len, len, info->name));
// translate the wire link_state into the local link_up flag; values not
// handled by the switch are reported as invalid
2845 switch (info->link_state) {
2853 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2854 (unsigned int)info->link_state));
// callers must not supply a reference count
2858 if (info->references != 0) {
2859 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2860 (unsigned int)info->references));
2864 iface = ctdb_find_iface(ctdb, info->name);
2865 if (iface == NULL) {
// nothing to do when the state is unchanged
2869 if (link_up == iface->link_up) {
// the log level is chosen from the OLD state: an interface that was up is
// reported at DEBUG_ERR, one that was down at DEBUG_NOTICE
2873 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2874 ("iface[%s] has changed it's link status %s => %s\n",
2876 iface->link_up?"up":"down",
2877 link_up?"up":"down"));
2879 iface->link_up = link_up;
2885 structure containing the listening socket and the list of tcp connections
2886 that the ctdb daemon is to kill
// Per-vnn state used while resetting TCP connections.
2888 struct ctdb_kill_tcp {
// vnn whose killtcp pointer the destructor clears
2889 struct ctdb_vnn *vnn;
// owning daemon context; supplies the event context for timers and fd events
2890 struct ctdb_context *ctdb;
// fd event registered on the capture socket with capture_tcp_handler
2892 struct fd_event *fde;
// tree of ctdb_killtcp_con entries, keyed by killtcp_key()
2893 trbt_tree_t *connections;
2898 a tcp connection that is to be killed
// One TCP connection queued for reset, stored in ctdb_kill_tcp.connections.
2900 struct ctdb_killtcp_con {
// the two endpoints of the connection, as stored by
// ctdb_killtcp_add_connection after canonicalization
2901 ctdb_sock_addr src_addr;
2902 ctdb_sock_addr dst_addr;
// back pointer to the owning killtcp state
2904 struct ctdb_kill_tcp *killtcp;
2907 /* this function is used to create a key to represent this socketpair
2908 in the killtcp tree.
2909 this key is used to insert and lookup matching socketpairs that are
2910 to be tickled and RST
// Number of 32-bit words in a killtcp tree key.
2912 #define KILLTCP_KEYLEN 10
// Builds the tree key for the src/dst socket pair.
// The key lives in a function-static buffer that is zeroed on every call, so
// each call overwrites the key produced by the previous one; callers must
// consume the key before calling again (presumably the buffer address is
// what gets returned, confirm).
// Both addresses must be of the same family; a mismatch or an unknown
// family is logged as an error.
2913 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2915 static uint32_t key[KILLTCP_KEYLEN];
2917 bzero(key, sizeof(key));
2919 if (src->sa.sa_family != dst->sa.sa_family) {
2920 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2924 switch (src->sa.sa_family) {
// IPv4 layout: words 0-3 hold dst addr, src addr, dst port, src port;
// the remaining words stay zero from the bzero above
2926 key[0] = dst->ip.sin_addr.s_addr;
2927 key[1] = src->ip.sin_addr.s_addr;
2928 key[2] = dst->ip.sin_port;
2929 key[3] = src->ip.sin_port;
// IPv6 layout: the 128-bit addresses are read as four 32-bit words each,
// interleaved dst/src from the last word to the first, followed by ports.
// NOTE(review): reading s6_addr through a uint32_t pointer relies on the
// address being suitably aligned and on the compiler tolerating the alias.
2932 uint32_t *dst6_addr32 =
2933 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2934 uint32_t *src6_addr32 =
2935 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2936 key[0] = dst6_addr32[3];
2937 key[1] = src6_addr32[3];
2938 key[2] = dst6_addr32[2];
2939 key[3] = src6_addr32[2];
2940 key[4] = dst6_addr32[1];
2941 key[5] = src6_addr32[1];
2942 key[6] = dst6_addr32[0];
2943 key[7] = src6_addr32[0];
2944 key[8] = dst->ip6.sin6_port;
2945 key[9] = src->ip6.sin6_port;
2949 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2957 called when we get a read event on the raw socket
// fd event callback for the capture socket of a ctdb_kill_tcp.
// private_data is the ctdb_kill_tcp. A packet is parsed with
// ctdb_sys_read_tcp_packet(); if its address pair matches an entry in
// killtcp->connections, a TCP packet is sent for that connection using the
// captured ack_seq and seq values.
2959 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
2960 uint16_t flags, void *private_data)
2962 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2963 struct ctdb_killtcp_con *con;
2964 ctdb_sock_addr src, dst;
2965 uint32_t ack_seq, seq;
// only read events are of interest
2967 if (!(flags & EVENT_FD_READ)) {
2971 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2972 killtcp->private_data,
2974 &ack_seq, &seq) != 0) {
2975 /* probably a non-tcp ACK packet */
2979 /* check if we have this guy in our list of connections
// the key is built from the addresses parsed out of the captured packet
2982 con = trbt_lookuparray32(killtcp->connections,
2983 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2985 /* no this was some other packet we can just ignore */
2989 /* This one has been tickled !
2990 now reset him and remove him from the list.
2992 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2993 ntohs(con->dst_addr.ip.sin_port),
2994 ctdb_addr_to_str(&con->src_addr),
2995 ntohs(con->src_addr.ip.sin_port)));
// final argument 1 presumably requests a RST, matching the log message above;
// confirm against ctdb_sys_send_tcp
2997 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3002 /* when traversing the list of all tcp connections to send tickle acks to
3003 (so that we can capture the ack coming back and kill the connection
3005 this callback is called for each connection we are currently trying to kill
// Tree traversal callback, invoked once per queued connection.
// param is a talloc context that collects connections to be discarded;
// data is the ctdb_killtcp_con being visited.
3007 static int tickle_connection_traverse(void *param, void *data)
3009 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3011 /* have tried too many times, just give up */
3012 if (con->count >= 5) {
3013 /* can't delete in traverse: reparent to delete_cons */
3014 talloc_steal(param, con);
3018 /* otherwise, try tickling it again */
3021 (ctdb_sock_addr *)&con->dst_addr,
3022 (ctdb_sock_addr *)&con->src_addr,
3029 called every second until all sentenced connections have been reset
// Timer callback that re-tickles every connection still queued for reset.
// private_data is the ctdb_kill_tcp. Connections that the traverse callback
// gave up on are reparented to a temporary context and freed after the
// traversal. When the tree is empty the whole killtcp structure is freed;
// a new one-second timer is registered for the case where work remains.
3031 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
3032 struct timeval t, void *private_data)
3034 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
// temporary parent for connections that must be deleted after the traversal
3035 void *delete_cons = talloc_new(NULL);
3037 /* loop over all connections sending tickle ACKs */
3038 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3040 /* now we've finished traverse, it's safe to do deletion. */
3041 talloc_free(delete_cons);
3043 /* If there are no more connections to kill we can remove the
3044 entire killtcp structure
3046 if ( (killtcp->connections == NULL) ||
3047 (killtcp->connections->root == NULL) ) {
3048 talloc_free(killtcp);
3052 /* try tickling them again in a seconds time
// the timer is parented to killtcp, so freeing killtcp also cancels it
3054 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3055 ctdb_tickle_sentenced_connections, killtcp);
3059 destroy the killtcp structure
// talloc destructor for a ctdb_kill_tcp.
// Clears the back reference vnn->killtcp, but only after checking that the
// vnn is still on the ctdb->vnn list and that it still points at this
// structure.
3061 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3063 struct ctdb_vnn *tmpvnn;
3065 /* verify that this vnn is still active */
3066 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3067 if (tmpvnn == killtcp->vnn) {
// the vnn was not found on the list
3072 if (tmpvnn == NULL) {
// the vnn refers to a different killtcp structure
3076 if (killtcp->vnn->killtcp != killtcp) {
3080 killtcp->vnn->killtcp = NULL;
3086 /* nothing fancy here, just unconditionally replace any existing
3087 connection structure with the new one.
3089 dont even free the old one if it did exist, that one is talloc_stolen
3090 by the same node in the tree anyway and will be deleted when the new data
// Callback handed to trbt_insertarray32_callback() when a connection is
// added to the killtcp tree; parm is the new connection passed by the caller,
// data presumably the entry already stored under the key (confirm).
3093 static void *add_killtcp_callback(void *parm, void *data)
3099 add a tcp socket to the list of connections we want to RST
// Queues one TCP connection for reset.
// The two addresses are canonicalized, the owning vnn is located (dst first,
// then src, then ctdb->single_ip_vnn), and a ctdb_killtcp_con is inserted
// into the per-vnn tree. On first use the per-vnn ctdb_kill_tcp, its capture
// socket, the fd event and the one-second tickle timer are created.
3101 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3105 ctdb_sock_addr src, dst;
3106 struct ctdb_kill_tcp *killtcp;
3107 struct ctdb_killtcp_con *con;
3108 struct ctdb_vnn *vnn;
3110 ctdb_canonicalize_ip(s, &src);
3111 ctdb_canonicalize_ip(d, &dst);
// either end of the connection may be the public address
3113 vnn = find_public_ip_vnn(ctdb, &dst);
3115 vnn = find_public_ip_vnn(ctdb, &src);
3118 /* if it is not a public ip it could be our 'single ip' */
3119 if (ctdb->single_ip_vnn) {
3120 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3121 vnn = ctdb->single_ip_vnn;
3126 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3130 killtcp = vnn->killtcp;
3132 /* If this is the first connection to kill we must allocate
3135 if (killtcp == NULL) {
// parented to the vnn, so it is released together with the vnn
3136 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3137 CTDB_NO_MEMORY(ctdb, killtcp);
3140 killtcp->ctdb = ctdb;
// -1 marks the capture socket as not yet opened
3141 killtcp->capture_fd = -1;
3142 killtcp->connections = trbt_create(killtcp, 0);
3144 vnn->killtcp = killtcp;
3145 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3150 /* create a structure that describes this connection we want to
3151 RST and store it in killtcp->connections
// plain talloc does not zero the object; every field read later must be
// assigned explicitly
3153 con = talloc(killtcp, struct ctdb_killtcp_con);
3154 CTDB_NO_MEMORY(ctdb, con);
3155 con->src_addr = src;
3156 con->dst_addr = dst;
3158 con->killtcp = killtcp;
// note the argument order: the key is built with dst as the first address,
// whereas capture_tcp_handler passes the packet source first
3161 trbt_insertarray32_callback(killtcp->connections,
3162 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3163 add_killtcp_callback, con);
3166 If we dont have a socket to listen on yet we must create it
3168 if (killtcp->capture_fd == -1) {
3169 const char *iface = ctdb_vnn_iface_string(vnn);
3170 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3171 if (killtcp->capture_fd == -1) {
3172 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3173 "socket on iface '%s' for killtcp (%s)\n",
3174 iface, strerror(errno)));
3180 if (killtcp->fde == NULL) {
3181 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
3183 capture_tcp_handler, killtcp);
// the capture socket is closed automatically when the fd event is freed
3184 tevent_fd_set_auto_close(killtcp->fde);
3186 /* We also need to set up some events to tickle all these connections
3187 until they are all reset
3189 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3190 ctdb_tickle_sentenced_connections, killtcp);
3193 /* tickle him once now */
// cleanup: drop the per-vnn state again
3202 talloc_free(vnn->killtcp);
3203 vnn->killtcp = NULL;
3208 kill a TCP connection.
// Control handler: interprets indata.dptr as a ctdb_control_killtcp and queues
// its src/dst pair through ctdb_killtcp_add_connection(), whose result is
// returned unchanged.
// NOTE(review): indata.dsize is not checked here; confirm the dispatcher
// validates it.
3210 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3212 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3214 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3218 called by a daemon to inform us of the entire list of TCP tickles for
3219 a particular public address.
3220 this control should only be sent by the node that is currently serving
3221 that public address.
// Control handler that replaces the stored tickle list of a public address.
// indata.dptr is interpreted as a ctdb_control_tcp_tickle_list. After the
// size checks, the vnn for list->addr is looked up, any existing
// vnn->tcp_array is freed, and a copy of the received connections is
// attached to the vnn.
3223 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3225 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3226 struct ctdb_tcp_array *tcparray;
3227 struct ctdb_vnn *vnn;
3229 /* We must at least have tickles.num or else we cant verify the size
3230 of the received data blob
3232 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3233 tickles.connections)) {
3234 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3238 /* verify that the size of data matches what we expect */
// NOTE(review): list->tickles.num comes from the received blob; confirm the
// multiplication below cannot wrap and defeat this check
3239 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3240 tickles.connections)
3241 + sizeof(struct ctdb_tcp_connection)
3242 * list->tickles.num) {
3243 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3247 vnn = find_public_ip_vnn(ctdb, &list->addr);
3249 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3250 ctdb_addr_to_str(&list->addr)));
// the old list is dropped before the new one is allocated, so the vnn is
// left without a list if the allocations below fail
3255 /* remove any old ticklelist we might have */
3256 talloc_free(vnn->tcp_array);
3257 vnn->tcp_array = NULL;
// allocated on ctdb->nodes first and reparented to the vnn at the end
3259 tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3260 CTDB_NO_MEMORY(ctdb, tcparray);
3262 tcparray->num = list->tickles.num;
// NOTE(review): if this allocation fails tcparray presumably stays parented
// to ctdb->nodes; confirm it is not leaked
3264 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3265 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3267 memcpy(tcparray->connections, &list->tickles.connections[0],
3268 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3270 /* We now have a new fresh tickle list array for this vnn */
3271 vnn->tcp_array = talloc_steal(vnn, tcparray);
3277 called to return the full list of tickles for the public address associated
3278 with the provided vnn
// Control handler returning the stored tickle list of a public address.
// indata.dptr is interpreted as the ctdb_sock_addr to look up. The reply is a
// ctdb_control_tcp_tickle_list allocated on outdata, sized for num
// connections and filled from vnn->tcp_array.
3280 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3282 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3283 struct ctdb_control_tcp_tickle_list *list;
3284 struct ctdb_tcp_array *tcparray;
3286 struct ctdb_vnn *vnn;
3288 vnn = find_public_ip_vnn(ctdb, addr);
3290 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3291 ctdb_addr_to_str(addr)));
3296 tcparray = vnn->tcp_array;
3298 num = tcparray->num;
// reply size = header up to tickles.connections plus num connection records
3303 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3304 tickles.connections)
3305 + sizeof(struct ctdb_tcp_connection) * num;
// talloc_size does not zero the buffer; fields must be set explicitly
3307 outdata->dptr = talloc_size(outdata, outdata->dsize);
3308 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3309 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3312 list->tickles.num = num;
3314 memcpy(&list->tickles.connections[0], tcparray->connections,
3315 sizeof(struct ctdb_tcp_connection) * num);
3323 set the list of all tcp tickles for a public address
// Marshals tcparray into a ctdb_control_tcp_tickle_list and sends it as a
// CTDB_CONTROL_SET_TCP_TICKLE_LIST control with CTDB_CTRL_FLAG_NOREPLY.
// The temporary buffer is allocated on ctdb and freed after the send.
// NOTE(review): in the visible lines the control goes to
// CTDB_BROADCAST_CONNECTED and neither timeout nor destnode is referenced;
// confirm that is intended.
3325 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb,
3326 struct timeval timeout, uint32_t destnode,
3327 ctdb_sock_addr *addr,
3328 struct ctdb_tcp_array *tcparray)
3332 struct ctdb_control_tcp_tickle_list *list;
3335 num = tcparray->num;
// payload size = header up to tickles.connections plus num connection records
3340 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3341 tickles.connections) +
3342 sizeof(struct ctdb_tcp_connection) * num;
3343 data.dptr = talloc_size(ctdb, data.dsize);
3344 CTDB_NO_MEMORY(ctdb, data.dptr);
3346 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3348 list->tickles.num = num;
3350 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3353 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
3354 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3355 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3357 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3361 talloc_free(data.dptr);
3368 perform tickle updates if required
// Periodic timer callback that pushes tickle lists to other nodes.
// private_data is the ctdb_context. For each vnn held by this node
// (vnn->pnn equal to ctdb->pnn) with tcp_update_needed set, the list is sent
// with ctdb_ctrl_set_tcp_tickles(). The timer then re-arms itself using the
// tickle_update_interval tunable.
3370 static void ctdb_update_tcp_tickles(struct event_context *ev,
3371 struct timed_event *te,
3372 struct timeval t, void *private_data)
3374 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3376 struct ctdb_vnn *vnn;
3378 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3379 /* we only send out updates for public addresses that
3382 if (ctdb->pnn != vnn->pnn) {
3385 /* We only send out the updates if we need to */
3386 if (!vnn->tcp_update_needed) {
3389 ret = ctdb_ctrl_set_tcp_tickles(ctdb,
3391 CTDB_BROADCAST_CONNECTED,
3392 &vnn->public_address,
3395 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3396 ctdb_addr_to_str(&vnn->public_address)));
// re-arm; the timer is parented to tickle_update_context
3400 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3401 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3402 ctdb_update_tcp_tickles, ctdb);
3407 start periodic update of tcp tickles
// Creates ctdb->tickle_update_context and schedules the first run of
// ctdb_update_tcp_tickles after tickle_update_interval seconds.
// The timer is parented to the new context, so freeing that context stops
// the periodic updates.
3409 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3411 ctdb->tickle_update_context = talloc_new(ctdb);
3413 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3414 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3415 ctdb_update_tcp_tickles, ctdb);
// State for a series of gratuitous ARP transmissions started by a control.
3421 struct control_gratious_arp {
// daemon context supplying the event context for the retry timer
3422 struct ctdb_context *ctdb;
// address announced by ctdb_sys_send_arp
3423 ctdb_sock_addr addr;
3429 send a control_gratuitous arp
// Timer callback that sends one gratuitous ARP for arp->addr on arp->iface.
// private_data is the control_gratious_arp state. A failed send is logged.
// The state is compared against CTDB_ARP_REPEAT, and the timer is re-armed
// CTDB_ARP_INTERVAL seconds ahead for the next transmission.
3431 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
3432 struct timeval t, void *private_data)
3435 struct control_gratious_arp *arp = talloc_get_type(private_data,
3436 struct control_gratious_arp);
3438 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3440 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3441 arp->iface, strerror(errno)));
// repeat limit reached
3446 if (arp->count == CTDB_ARP_REPEAT) {
// the timer is parented to arp, so freeing arp cancels further sends
3451 event_add_timed(arp->ctdb->ev, arp,
3452 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3453 send_gratious_arp, arp);
// Control handler that starts a gratuitous ARP series.
// indata.dptr is interpreted as a ctdb_control_gratious_arp; its size is
// checked against the fixed header and against header plus the embedded len.
// A control_gratious_arp state is allocated on ctdb, filled from the request
// and handed to send_gratious_arp through an immediate timer.
3460 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3462 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3463 struct control_gratious_arp *arp;
3465 /* verify the size of indata */
3466 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3467 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3468 (unsigned)indata.dsize,
3469 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
// expected total size: fixed header plus the interface name length
3473 ( offsetof(struct ctdb_control_gratious_arp, iface)
3474 + gratious_arp->len ) ){
3476 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3477 "but should be %u bytes\n",
3478 (unsigned)indata.dsize,
3479 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
// plain talloc does not zero the state; every field must be assigned
3484 arp = talloc(ctdb, struct control_gratious_arp);
3485 CTDB_NO_MEMORY(ctdb, arp);
3488 arp->addr = gratious_arp->addr;
// NOTE(review): talloc_strdup reads up to the first NUL; confirm the sender
// always terminates iface within len bytes
3489 arp->iface = talloc_strdup(arp, gratious_arp->iface);
3490 CTDB_NO_MEMORY(ctdb, arp->iface);
// a zero timeval schedules the first send as soon as the event loop runs
3493 event_add_timed(arp->ctdb->ev, arp,
3494 timeval_zero(), send_gratious_arp, arp);
// Control handler that adds a public address at runtime.
// indata.dptr is interpreted as a ctdb_control_ip_iface; after the size
// checks its addr, mask and iface string are passed to
// ctdb_add_public_address(). A failure there is logged.
3499 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3501 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3504 /* verify the size of indata */
3505 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3506 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
// expected total size: fixed header plus pub->len bytes of interface string
3510 ( offsetof(struct ctdb_control_ip_iface, iface)
3513 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3514 "but should be %u bytes\n",
3515 (unsigned)indata.dsize,
3516 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3520 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3523 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3531 called when releaseip event finishes for del_public_address
// Event-script completion callback used by ctdb_control_del_public_address.
// private_data is the talloc context passed at registration; freeing it
// releases everything parented to it.
3533 static void delete_ip_callback(struct ctdb_context *ctdb, int status,
3536 talloc_free(private_data);
// Control handler that removes a public address at runtime.
// indata.dptr is interpreted as a ctdb_control_ip_iface. The matching vnn is
// unlinked from ctdb->vnn. If another node holds the address, only the
// interface assignment is dropped; otherwise a CTDB_EVENT_RELEASE_IP event
// script is started and the vnn is handed to a temporary context that
// delete_ip_callback frees on completion.
3539 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3541 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3542 struct ctdb_vnn *vnn;
3545 /* verify the size of indata */
3546 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3547 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
// expected total size: fixed header plus pub->len bytes of interface string
3551 ( offsetof(struct ctdb_control_ip_iface, iface)
3554 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3555 "but should be %u bytes\n",
3556 (unsigned)indata.dsize,
3557 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3561 /* walk over all public addresses until we find a match */
3562 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3563 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3564 TALLOC_CTX *mem_ctx;
3566 DLIST_REMOVE(ctdb->vnn, vnn);
// address is not held by this node: no release script is started here
3567 if (vnn->pnn != ctdb->pnn) {
3568 if (vnn->iface != NULL) {
3569 ctdb_vnn_unassign_iface(ctdb, vnn);
// mem_ctx becomes the owner of vnn and is also the callback private data
3576 mem_ctx = talloc_new(ctdb);
3577 talloc_steal(mem_ctx, vnn);
3578 ret = ctdb_event_script_callback(ctdb,
3579 mem_ctx, delete_ip_callback, mem_ctx,
3581 CTDB_EVENT_RELEASE_IP,
3583 ctdb_vnn_iface_string(vnn),
3584 ctdb_addr_to_str(&vnn->public_address),
3585 vnn->public_netmask_bits);
// NOTE(review): vnn is read after the script was started; this is only safe
// if delete_ip_callback cannot have run yet, confirm
3586 if (vnn->iface != NULL) {
3587 ctdb_vnn_unassign_iface(ctdb, vnn);
3599 /* This function is called from the recovery daemon to verify that a remote
3600 node has the expected ip allocation.
3601 This is verified against ctdb->ip_tree
// Compares the address list reported by a remote node against the expected
// assignment held in ctdb->ip_tree.
// Every entry of ips is looked up by ip_key(); an address missing from the
// tree and a pnn that differs from the tree are both logged as errors.
// Entries where either side has pnn -1 are tested for separately before the
// comparison.
3603 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3605 struct ctdb_public_ip_list *tmp_ip;
3608 if (ctdb->ip_tree == NULL) {
3609 /* dont know the expected allocation yet, assume remote node
3618 for (i=0; i<ips->num; i++) {
3619 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3620 if (tmp_ip == NULL) {
3621 DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
// -1 presumably means the address is unassigned; the pnn fields look
// unsigned (printed with %u below), so the comparison relies on integer
// conversion, confirm the field types
3625 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3629 if (tmp_ip->pnn != ips->ips[i].pnn) {
3630 DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
// Records a new owner for one address in ctdb->ip_tree.
// The entry is looked up by ip_key(&ip->addr) and its pnn is overwritten with
// ip->pnn. A missing tree or a missing entry is logged as an error.
3638 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3640 struct ctdb_public_ip_list *tmp_ip;
3642 if (ctdb->ip_tree == NULL) {
3643 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3647 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3648 if (tmp_ip == NULL) {
3649 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
// the log line is emitted before the assignment so it shows old and new owner
3653 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3654 tmp_ip->pnn = ip->pnn;
// State of one in-flight reload-public-ips request.
3660 struct ctdb_reloadips_handle {
// daemon context; its reload_ips pointer refers back to this handle
3661 struct ctdb_context *ctdb;
// the pending control request, answered from the destructor
3662 struct ctdb_req_control *c;
// fd event watching the read end of the pipe to the child
3666 struct fd_event *fde;
// talloc destructor for a reloadips handle.
// Clears ctdb->reload_ips if it still refers to this handle, sends the
// deferred control reply carrying h->status, and kills the child process
// with SIGKILL.
3669 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3671 if (h == h->ctdb->reload_ips) {
3672 h->ctdb->reload_ips = NULL;
// the reply to the original request is sent here, whatever the reason the
// handle is being freed
3675 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3678 kill(h->child, SIGKILL);
// Timer callback registered by ctdb_control_reload_public_ips with a
// 120 second delay; private_data is the reloadips handle.
3682 static void ctdb_reloadips_timeout_event(struct event_context *ev,
3683 struct timed_event *te,
3684 struct timeval t, void *private_data)
3686 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
// fd event callback for the read end of the pipe to the reloadips child.
// private_data is the reloadips handle. One status byte is read; a short
// read or a non-zero byte is logged as a child failure.
3691 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
3692 uint16_t flags, void *private_data)
3694 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3699 ret = read(h->fd[0], &res, 1);
3700 if (ret < 1 || res != 0) {
3701 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
// Runs in the forked child: reconciles the addresses the daemon currently
// knows with the public addresses file.
// Steps: fetch the current list from the local node, re-read the file with
// ctdb_set_public_addresses(), send a delete control for every previously
// known address with no match in the new list, then send an add control for
// every new vnn with no match in the previous list.
3709 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
// NOTE(review): mem_ctx is freed on the two early error paths below; no free
// is visible on the later paths, confirm it is not leaked
3711 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3712 struct ctdb_all_public_ips *ips;
3713 struct ctdb_vnn *vnn;
3716 /* read the ip allocation from the local node */
3717 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
3719 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
3720 talloc_free(mem_ctx);
3724 /* re-read the public ips file */
3726 if (ctdb_set_public_addresses(ctdb, false) != 0) {
3727 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3728 talloc_free(mem_ctx);
3733 /* check the previous list of ips and scan for ips that have been
// for each previously known address, search the freshly loaded vnn list
3736 for (i = 0; i < ips->num; i++) {
3737 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3738 if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3743 /* we need to delete this ip, no longer available on this node */
3745 struct ctdb_control_ip_iface pub;
3747 DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3748 pub.addr = ips->ips[i].addr;
3752 ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3754 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3761 /* loop over all new ones and check the ones we need to add */
3762 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3763 for (i = 0; i < ips->num; i++) {
3764 if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
// i reaching ips->num means the inner search found no match
3768 if (i == ips->num) {
3769 struct ctdb_control_ip_iface pub;
3770 char *ifaces = NULL;
3773 DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
3775 pub.addr = vnn->public_address;
3776 pub.mask = vnn->public_netmask_bits;
// build a comma separated list of all interface names of this vnn; each
// round allocates a new string on vnn and the previous one is not freed
3779 ifaces = vnn->ifaces[0];
3781 while (vnn->ifaces[iface] != NULL) {
3782 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
3785 pub.len = strlen(ifaces)+1;
// NOTE(review): pub is a stack object and the copy length is the length of
// the joined string; confirm pub.iface has room for it, otherwise this
// writes past the end of pub. The talloc_asprintf result above is also
// used without a NULL check.
3786 memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
3788 ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3790 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
3799 /* This control is sent to force the node to re-read the public addresses file
3800 and drop any addresses we should no longer host, and add new addresses
3801 that we are now able to host
// Control handler that reloads the public addresses file in a forked child.
// Any reload already in flight is freed first. A handle is allocated, a pipe
// is created and a child is forked. The child switches to client mode, runs
// ctdb_reloadips_child(), writes one status byte to the pipe and then waits
// for the parent to disappear. The parent takes ownership of the request,
// watches the pipe, arms a 120 second timeout and marks the reply as
// asynchronous.
3803 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
3805 struct ctdb_reloadips_handle *h;
// remembered before the fork so the child can probe the parent later
3806 pid_t parent = getpid();
// freeing the old handle runs its destructor
3808 if (ctdb->reload_ips != NULL) {
3809 talloc_free(ctdb->reload_ips);
3810 ctdb->reload_ips = NULL;
3813 h = talloc(ctdb, struct ctdb_reloadips_handle);
3814 CTDB_NO_MEMORY(ctdb, h);
// NOTE(review): the message below names ctdb_freeze_lock although this is
// the reloadips path; it looks copied from elsewhere
3819 if (pipe(h->fd) == -1) {
3820 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3825 h->child = ctdb_fork(ctdb);
3826 if (h->child == (pid_t)-1) {
3827 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
// child side
3835 if (h->child == 0) {
3836 signed char res = 0;
3839 debug_extra = talloc_asprintf(NULL, "reloadips:");
3841 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
3842 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
3845 res = ctdb_reloadips_child(ctdb);
3847 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
// NOTE(review): the result of write is ignored; a failed write is seen by
// the parent only as a short read
3851 write(h->fd[1], &res, 1);
3852 /* make sure we die when our parent dies */
// kill with signal 0 only probes for existence; the loop ends once the probe
// fails with ESRCH
3853 while (kill(parent, 0) == 0 || errno != ESRCH) {
// parent side: the handle now owns the request so the destructor can reply
3859 h->c = talloc_steal(h, c);
3862 set_close_on_exec(h->fd[0]);
3864 talloc_set_destructor(h, ctdb_reloadips_destructor);
3867 h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
3868 EVENT_FD_READ, ctdb_reloadips_child_handler,
// the read end of the pipe is closed when the fd event is freed
3870 tevent_fd_set_auto_close(h->fde);
// give up after 120 seconds
3872 event_add_timed(ctdb->ev, h,
3873 timeval_current_ofs(120, 0),
3874 ctdb_reloadips_timeout_event, h);
3876 /* we reply later */
3877 *async_reply = True;