4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
47 #define CTDB_ARP_INTERVAL 1
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
56 struct ctdb_interface {
57 struct ctdb_interface *prev, *next;
63 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
66 return vnn->iface->name;
72 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
74 struct ctdb_interface *i;
76 /* Verify that we don't have an entry for this interface yet */
77 for (i=ctdb->ifaces;i;i=i->next) {
78 if (strcmp(i->name, iface) == 0) {
83 /* create a new structure for this interface */
84 i = talloc_zero(ctdb, struct ctdb_interface);
85 CTDB_NO_MEMORY_FATAL(ctdb, i);
86 i->name = talloc_strdup(i, iface);
87 CTDB_NO_MEMORY(ctdb, i->name);
91 DLIST_ADD(ctdb->ifaces, i);
96 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
101 for (n = 0; vnn->ifaces[n] != NULL; n++) {
102 if (strcmp(name, vnn->ifaces[n]) == 0) {
110 /* If any interfaces now have no possible IPs then delete them. This
111 * implementation is naive (i.e. simple) rather than clever
112 * (i.e. complex). Given that this is run on delip and that operation
113 * is rare, this doesn't need to be efficient - it needs to be
114 * foolproof. One alternative is reference counting, where the logic
115 * is distributed and can, therefore, be broken in multiple places.
116 * Another alternative is to build a red-black tree of interfaces that
117 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
118 * once) and then walking ctdb->ifaces once and deleting those not in
119 * the tree. Let's go to one of those if the naive implementation
120 * causes problems... :-)
122 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
123 struct ctdb_vnn *vnn)
125 struct ctdb_interface *i, *next;
127 /* For each interface, check if there's an IP using it. */
128 for (i = ctdb->ifaces; i != NULL; i = next) {
133 /* Only consider interfaces named in the given VNN. */
134 if (!vnn_has_interface_with_name(vnn, i->name)) {
138 /* Is the "single IP" on this interface? */
139 if ((ctdb->single_ip_vnn != NULL) &&
140 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
141 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
142 /* Found, next interface please... */
145 /* Search for a vnn with this interface. */
147 for (tv=ctdb->vnn; tv; tv=tv->next) {
148 if (vnn_has_interface_with_name(tv, i->name)) {
155 /* None of the VNNs are using this interface. */
156 DLIST_REMOVE(ctdb->ifaces, i);
163 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
166 struct ctdb_interface *i;
168 for (i=ctdb->ifaces;i;i=i->next) {
169 if (strcmp(i->name, iface) == 0) {
177 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178 struct ctdb_vnn *vnn)
181 struct ctdb_interface *cur = NULL;
182 struct ctdb_interface *best = NULL;
184 for (i=0; vnn->ifaces[i]; i++) {
186 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
200 if (cur->references < best->references) {
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210 struct ctdb_vnn *vnn)
212 struct ctdb_interface *best = NULL;
215 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216 "still assigned to iface '%s'\n",
217 ctdb_addr_to_str(&vnn->public_address),
218 ctdb_vnn_iface_string(vnn)));
222 best = ctdb_vnn_best_iface(ctdb, vnn);
224 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225 "cannot assign to iface any iface\n",
226 ctdb_addr_to_str(&vnn->public_address)));
232 vnn->pnn = ctdb->pnn;
234 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235 "now assigned to iface '%s' refs[%d]\n",
236 ctdb_addr_to_str(&vnn->public_address),
237 ctdb_vnn_iface_string(vnn),
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243 struct ctdb_vnn *vnn)
245 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246 "now unassigned (old iface '%s' refs[%d])\n",
247 ctdb_addr_to_str(&vnn->public_address),
248 ctdb_vnn_iface_string(vnn),
249 vnn->iface?vnn->iface->references:0));
251 vnn->iface->references--;
254 if (vnn->pnn == ctdb->pnn) {
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260 struct ctdb_vnn *vnn)
264 /* Nodes that are not RUNNING can not host IPs */
265 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
269 if (vnn->delete_pending) {
273 if (vnn->iface && vnn->iface->link_up) {
277 for (i=0; vnn->ifaces[i]; i++) {
278 struct ctdb_interface *cur;
280 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
293 struct ctdb_takeover_arp {
294 struct ctdb_context *ctdb;
297 struct ctdb_tcp_array *tcparray;
298 struct ctdb_vnn *vnn;
303 lists of tcp endpoints
305 struct ctdb_tcp_list {
306 struct ctdb_tcp_list *prev, *next;
307 struct ctdb_connection connection;
311 list of clients to kill on IP release
313 struct ctdb_client_ip {
314 struct ctdb_client_ip *prev, *next;
315 struct ctdb_context *ctdb;
322 send a gratuitous arp
324 static void ctdb_control_send_arp(struct tevent_context *ev,
325 struct tevent_timer *te,
326 struct timeval t, void *private_data)
328 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
329 struct ctdb_takeover_arp);
331 struct ctdb_tcp_array *tcparray;
332 const char *iface = ctdb_vnn_iface_string(arp->vnn);
334 ret = ctdb_sys_send_arp(&arp->addr, iface);
336 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
337 iface, strerror(errno)));
340 tcparray = arp->tcparray;
342 for (i=0;i<tcparray->num;i++) {
343 struct ctdb_connection *tcon;
345 tcon = &tcparray->connections[i];
346 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
347 (unsigned)ntohs(tcon->dst.ip.sin_port),
348 ctdb_addr_to_str(&tcon->src),
349 (unsigned)ntohs(tcon->src.ip.sin_port)));
350 ret = ctdb_sys_send_tcp(
355 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
356 ctdb_addr_to_str(&tcon->src)));
363 if (arp->count == CTDB_ARP_REPEAT) {
368 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
369 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
370 ctdb_control_send_arp, arp);
373 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
374 struct ctdb_vnn *vnn)
376 struct ctdb_takeover_arp *arp;
377 struct ctdb_tcp_array *tcparray;
379 if (!vnn->takeover_ctx) {
380 vnn->takeover_ctx = talloc_new(vnn);
381 if (!vnn->takeover_ctx) {
386 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
392 arp->addr = vnn->public_address;
395 tcparray = vnn->tcp_array;
397 /* add all of the known tcp connections for this IP to the
398 list of tcp connections to send tickle acks for */
399 arp->tcparray = talloc_steal(arp, tcparray);
401 vnn->tcp_array = NULL;
402 vnn->tcp_update_needed = true;
405 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
406 timeval_zero(), ctdb_control_send_arp, arp);
411 struct takeover_callback_state {
412 struct ctdb_req_control_old *c;
413 ctdb_sock_addr *addr;
414 struct ctdb_vnn *vnn;
417 struct ctdb_do_takeip_state {
418 struct ctdb_req_control_old *c;
419 struct ctdb_vnn *vnn;
423 called when takeip event finishes
425 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
428 struct ctdb_do_takeip_state *state =
429 talloc_get_type(private_data, struct ctdb_do_takeip_state);
434 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
436 if (status == -ETIME) {
439 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
440 ctdb_addr_to_str(&state->vnn->public_address),
441 ctdb_vnn_iface_string(state->vnn)));
442 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
444 node->flags |= NODE_FLAGS_UNHEALTHY;
449 if (ctdb->do_checkpublicip) {
451 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
453 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
460 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
461 data.dsize = strlen((char *)data.dptr) + 1;
462 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
464 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
467 /* the control succeeded */
468 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
473 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
475 state->vnn->update_in_flight = false;
480 take over an ip address
482 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
483 struct ctdb_req_control_old *c,
484 struct ctdb_vnn *vnn)
487 struct ctdb_do_takeip_state *state;
489 if (vnn->update_in_flight) {
490 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
491 "update for this IP already in flight\n",
492 ctdb_addr_to_str(&vnn->public_address),
493 vnn->public_netmask_bits));
497 ret = ctdb_vnn_assign_iface(ctdb, vnn);
499 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
500 "assign a usable interface\n",
501 ctdb_addr_to_str(&vnn->public_address),
502 vnn->public_netmask_bits));
506 state = talloc(vnn, struct ctdb_do_takeip_state);
507 CTDB_NO_MEMORY(ctdb, state);
509 state->c = talloc_steal(ctdb, c);
512 vnn->update_in_flight = true;
513 talloc_set_destructor(state, ctdb_takeip_destructor);
515 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
516 ctdb_addr_to_str(&vnn->public_address),
517 vnn->public_netmask_bits,
518 ctdb_vnn_iface_string(vnn)));
520 ret = ctdb_event_script_callback(ctdb,
522 ctdb_do_takeip_callback,
526 ctdb_vnn_iface_string(vnn),
527 ctdb_addr_to_str(&vnn->public_address),
528 vnn->public_netmask_bits);
531 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
532 ctdb_addr_to_str(&vnn->public_address),
533 ctdb_vnn_iface_string(vnn)));
541 struct ctdb_do_updateip_state {
542 struct ctdb_req_control_old *c;
543 struct ctdb_interface *old;
544 struct ctdb_vnn *vnn;
548 called when updateip event finishes
550 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
553 struct ctdb_do_updateip_state *state =
554 talloc_get_type(private_data, struct ctdb_do_updateip_state);
558 if (status == -ETIME) {
561 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
562 ctdb_addr_to_str(&state->vnn->public_address),
564 ctdb_vnn_iface_string(state->vnn)));
567 * All we can do is reset the old interface
568 * and let the next run fix it
570 ctdb_vnn_unassign_iface(ctdb, state->vnn);
571 state->vnn->iface = state->old;
572 state->vnn->iface->references++;
574 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
579 if (ctdb->do_checkpublicip) {
581 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
583 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
590 /* the control succeeded */
591 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
596 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
598 state->vnn->update_in_flight = false;
603 update (move) an ip address
605 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
606 struct ctdb_req_control_old *c,
607 struct ctdb_vnn *vnn)
610 struct ctdb_do_updateip_state *state;
611 struct ctdb_interface *old = vnn->iface;
612 const char *new_name;
614 if (vnn->update_in_flight) {
615 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
616 "update for this IP already in flight\n",
617 ctdb_addr_to_str(&vnn->public_address),
618 vnn->public_netmask_bits));
622 ctdb_vnn_unassign_iface(ctdb, vnn);
623 ret = ctdb_vnn_assign_iface(ctdb, vnn);
625 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
626 "assin a usable interface (old iface '%s')\n",
627 ctdb_addr_to_str(&vnn->public_address),
628 vnn->public_netmask_bits,
633 new_name = ctdb_vnn_iface_string(vnn);
634 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
635 /* A benign update from one interface onto itself.
636 * no need to run the eventscripts in this case, just return
639 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
643 state = talloc(vnn, struct ctdb_do_updateip_state);
644 CTDB_NO_MEMORY(ctdb, state);
646 state->c = talloc_steal(ctdb, c);
650 vnn->update_in_flight = true;
651 talloc_set_destructor(state, ctdb_updateip_destructor);
653 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
654 "interface %s to %s\n",
655 ctdb_addr_to_str(&vnn->public_address),
656 vnn->public_netmask_bits,
660 ret = ctdb_event_script_callback(ctdb,
662 ctdb_do_updateip_callback,
664 CTDB_EVENT_UPDATE_IP,
668 ctdb_addr_to_str(&vnn->public_address),
669 vnn->public_netmask_bits);
671 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
672 ctdb_addr_to_str(&vnn->public_address),
673 old->name, new_name));
682 Find the vnn of the node that has a public ip address
683 returns NULL if the address is not known as a public address
685 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
687 struct ctdb_vnn *vnn;
689 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
690 if (ctdb_same_ip(&vnn->public_address, addr)) {
699 take over an ip address
701 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
702 struct ctdb_req_control_old *c,
707 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
708 struct ctdb_vnn *vnn;
709 bool have_ip = false;
710 bool do_updateip = false;
711 bool do_takeip = false;
712 struct ctdb_interface *best_iface = NULL;
714 if (pip->pnn != ctdb->pnn) {
715 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
716 "with pnn %d, but we're node %d\n",
717 ctdb_addr_to_str(&pip->addr),
718 pip->pnn, ctdb->pnn));
722 /* update our vnn list */
723 vnn = find_public_ip_vnn(ctdb, &pip->addr);
725 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
726 ctdb_addr_to_str(&pip->addr)));
730 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
731 have_ip = ctdb_sys_have_ip(&pip->addr);
733 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
734 if (best_iface == NULL) {
735 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
736 "a usable interface (old %s, have_ip %d)\n",
737 ctdb_addr_to_str(&vnn->public_address),
738 vnn->public_netmask_bits,
739 ctdb_vnn_iface_string(vnn),
744 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
745 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
750 if (vnn->iface == NULL && have_ip) {
751 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
753 ctdb_addr_to_str(&vnn->public_address)));
757 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
758 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
759 "and we have it on iface[%s], but it was assigned to node %d"
760 "and we are node %d, banning ourself\n",
761 ctdb_addr_to_str(&vnn->public_address),
762 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
767 if (vnn->pnn == -1 && have_ip) {
768 vnn->pnn = ctdb->pnn;
769 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
770 "and we already have it on iface[%s], update local daemon\n",
771 ctdb_addr_to_str(&vnn->public_address),
772 ctdb_vnn_iface_string(vnn)));
777 if (vnn->iface != best_iface) {
778 if (!vnn->iface->link_up) {
780 } else if (vnn->iface->references > (best_iface->references + 1)) {
781 /* only move when the rebalance gains something */
789 ctdb_vnn_unassign_iface(ctdb, vnn);
796 ret = ctdb_do_takeip(ctdb, c, vnn);
800 } else if (do_updateip) {
801 ret = ctdb_do_updateip(ctdb, c, vnn);
807 * The interface is up and the kernel knows the ip
810 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
811 ctdb_addr_to_str(&pip->addr),
812 vnn->public_netmask_bits,
813 ctdb_vnn_iface_string(vnn)));
817 /* tell ctdb_control.c that we will be replying asynchronously */
824 kill any clients that are registered with an IP that is being released
826 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
828 struct ctdb_client_ip *ip;
830 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
831 ctdb_addr_to_str(addr)));
833 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
834 ctdb_sock_addr tmp_addr;
837 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
839 ctdb_addr_to_str(&ip->addr)));
841 if (ctdb_same_ip(&tmp_addr, addr)) {
842 struct ctdb_client *client = reqid_find(ctdb->idr,
845 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
847 ctdb_addr_to_str(&ip->addr),
850 if (client->pid != 0) {
851 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
852 (unsigned)client->pid,
853 ctdb_addr_to_str(addr),
855 kill(client->pid, SIGKILL);
861 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
863 DLIST_REMOVE(ctdb->vnn, vnn);
864 ctdb_vnn_unassign_iface(ctdb, vnn);
865 ctdb_remove_orphaned_ifaces(ctdb, vnn);
870 called when releaseip event finishes
872 static void release_ip_callback(struct ctdb_context *ctdb, int status,
875 struct takeover_callback_state *state =
876 talloc_get_type(private_data, struct takeover_callback_state);
879 if (status == -ETIME) {
883 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
884 if (ctdb_sys_have_ip(state->addr)) {
886 ("IP %s still hosted during release IP callback, failing\n",
887 ctdb_addr_to_str(state->addr)));
888 ctdb_request_control_reply(ctdb, state->c,
895 /* send a message to all clients of this node telling them
896 that the cluster has been reconfigured and they should
897 release any sockets on this IP */
898 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
899 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
900 data.dsize = strlen((char *)data.dptr)+1;
902 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
904 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
906 /* kill clients that have registered with this IP */
907 release_kill_clients(ctdb, state->addr);
909 ctdb_vnn_unassign_iface(ctdb, state->vnn);
911 /* Process the IP if it has been marked for deletion */
912 if (state->vnn->delete_pending) {
913 do_delete_ip(ctdb, state->vnn);
917 /* the control succeeded */
918 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
922 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
924 if (state->vnn != NULL) {
925 state->vnn->update_in_flight = false;
931 release an ip address
933 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
934 struct ctdb_req_control_old *c,
939 struct takeover_callback_state *state;
940 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
941 struct ctdb_vnn *vnn;
944 /* update our vnn list */
945 vnn = find_public_ip_vnn(ctdb, &pip->addr);
947 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
948 ctdb_addr_to_str(&pip->addr)));
953 /* stop any previous arps */
954 talloc_free(vnn->takeover_ctx);
955 vnn->takeover_ctx = NULL;
957 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
958 * lazy multicast to drop an IP from any node that isn't the
959 * intended new node. The following makes ctdbd ignore
960 * a release for any address it doesn't host.
962 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
963 if (!ctdb_sys_have_ip(&pip->addr)) {
964 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
965 ctdb_addr_to_str(&pip->addr),
966 vnn->public_netmask_bits,
967 ctdb_vnn_iface_string(vnn)));
968 ctdb_vnn_unassign_iface(ctdb, vnn);
972 if (vnn->iface == NULL) {
973 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
974 ctdb_addr_to_str(&pip->addr),
975 vnn->public_netmask_bits));
980 /* There is a potential race between take_ip and us because we
981 * update the VNN via a callback that runs when the
982 * eventscripts have been run. Avoid the race by allowing one
983 * update to be in flight at a time.
985 if (vnn->update_in_flight) {
986 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
987 "update for this IP already in flight\n",
988 ctdb_addr_to_str(&vnn->public_address),
989 vnn->public_netmask_bits));
993 iface = strdup(ctdb_vnn_iface_string(vnn));
995 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
996 ctdb_addr_to_str(&pip->addr),
997 vnn->public_netmask_bits,
1001 state = talloc(ctdb, struct takeover_callback_state);
1002 if (state == NULL) {
1003 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1004 __FILE__, __LINE__);
1009 state->c = talloc_steal(state, c);
1010 state->addr = talloc(state, ctdb_sock_addr);
1011 if (state->addr == NULL) {
1012 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1013 __FILE__, __LINE__);
1018 *state->addr = pip->addr;
1021 vnn->update_in_flight = true;
1022 talloc_set_destructor(state, ctdb_releaseip_destructor);
1024 ret = ctdb_event_script_callback(ctdb,
1025 state, release_ip_callback, state,
1026 CTDB_EVENT_RELEASE_IP,
1029 ctdb_addr_to_str(&pip->addr),
1030 vnn->public_netmask_bits);
1033 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1034 ctdb_addr_to_str(&pip->addr),
1035 ctdb_vnn_iface_string(vnn)));
1040 /* tell the control that we will be replying asynchronously */
1041 *async_reply = true;
1045 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1046 ctdb_sock_addr *addr,
1047 unsigned mask, const char *ifaces,
1050 struct ctdb_vnn *vnn;
1057 tmp = strdup(ifaces);
1058 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1059 if (!ctdb_sys_check_iface_exists(iface)) {
1060 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1067 /* Verify that we don't have an entry for this ip yet */
1068 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1069 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1070 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1071 ctdb_addr_to_str(addr)));
1076 /* create a new vnn structure for this ip address */
1077 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1078 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1079 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1080 tmp = talloc_strdup(vnn, ifaces);
1081 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1082 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1083 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1084 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1085 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1086 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1090 vnn->ifaces[num] = NULL;
1091 vnn->public_address = *addr;
1092 vnn->public_netmask_bits = mask;
1094 if (check_address) {
1095 if (ctdb_sys_have_ip(addr)) {
1096 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1097 vnn->pnn = ctdb->pnn;
1101 for (i=0; vnn->ifaces[i]; i++) {
1102 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1104 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1105 "for public_address[%s]\n",
1106 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1112 DLIST_ADD(ctdb->vnn, vnn);
1118 setup the public address lists from a file
1120 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1126 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1127 if (lines == NULL) {
1128 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1131 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1135 for (i=0;i<nlines;i++) {
1137 ctdb_sock_addr addr;
1138 const char *addrstr;
1143 while ((*line == ' ') || (*line == '\t')) {
1149 if (strcmp(line, "") == 0) {
1152 tok = strtok(line, " \t");
1154 tok = strtok(NULL, " \t");
1156 if (NULL == ctdb->default_public_interface) {
1157 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1162 ifaces = ctdb->default_public_interface;
1167 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1168 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1172 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1173 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1184 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1188 struct ctdb_vnn *svnn;
1189 struct ctdb_interface *cur = NULL;
1193 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1194 CTDB_NO_MEMORY(ctdb, svnn);
1196 svnn->ifaces = talloc_array(svnn, const char *, 2);
1197 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1198 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1199 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1200 svnn->ifaces[1] = NULL;
1202 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1208 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1210 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1211 "for single_ip[%s]\n",
1213 ctdb_addr_to_str(&svnn->public_address)));
1218 /* assume the single public ip interface is initially "good" */
1219 cur = ctdb_find_iface(ctdb, iface);
1221 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1224 cur->link_up = true;
1226 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1232 ctdb->single_ip_vnn = svnn;
1236 struct public_ip_list {
1237 struct public_ip_list *next;
1239 ctdb_sock_addr addr;
1242 /* Given a physical node, return the number of
1243 public addresses that are currently assigned to this node.
1245 static int node_ip_coverage(struct ctdb_context *ctdb, int32_t pnn,
1246 struct public_ip_list *ips)
1250 for (;ips;ips=ips->next) {
1251 if (ips->pnn == pnn) {
1259 /* Can the given node host the given IP: is the public IP known to the
1260 * node and is NOIPHOST unset?
1262 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1263 struct ctdb_ipflags ipflags,
1264 struct public_ip_list *ip)
1266 struct ctdb_public_ip_list_old *public_ips;
1269 if (ipflags.noiphost) {
1273 public_ips = ctdb->nodes[pnn]->available_public_ips;
1275 if (public_ips == NULL) {
1279 for (i=0; i<public_ips->num; i++) {
1280 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1281 /* yes, this node can serve this public ip */
1289 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1290 struct ctdb_ipflags ipflags,
1291 struct public_ip_list *ip)
1293 if (ipflags.noiptakeover) {
1297 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1300 /* search the node list for a node to take over this ip.
1301 pick the node that is currently serving the fewest ips
1302 so that the ips get spread out evenly.
1304 static int find_takeover_node(struct ctdb_context *ctdb,
1305 struct ctdb_ipflags *ipflags,
1306 struct public_ip_list *ip,
1307 struct public_ip_list *all_ips)
1309 int pnn, min=0, num;
1312 numnodes = talloc_array_length(ipflags);
1314 for (i=0; i<numnodes; i++) {
1315 /* verify that this node can serve this ip */
1316 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1317 /* no it couldn't, so skip to the next node */
1321 num = node_ip_coverage(ctdb, i, all_ips);
1322 /* was this the first node we checked ? */
1334 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1335 ctdb_addr_to_str(&ip->addr)));
1345 static uint32_t *ip_key(ctdb_sock_addr *ip)
1347 static uint32_t key[IP_KEYLEN];
1349 bzero(key, sizeof(key));
1351 switch (ip->sa.sa_family) {
1353 key[3] = htonl(ip->ip.sin_addr.s_addr);
1356 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1357 key[0] = htonl(s6_a32[0]);
1358 key[1] = htonl(s6_a32[1]);
1359 key[2] = htonl(s6_a32[2]);
1360 key[3] = htonl(s6_a32[3]);
1364 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1371 static void *add_ip_callback(void *parm, void *data)
1373 struct public_ip_list *this_ip = parm;
1374 struct public_ip_list *prev_ip = data;
1376 if (prev_ip == NULL) {
1379 if (this_ip->pnn == -1) {
1380 this_ip->pnn = prev_ip->pnn;
1386 static int getips_count_callback(void *param, void *data)
1388 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1389 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1391 new_ip->next = *ip_list;
1396 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1397 struct ctdb_public_ip_list_old *ips,
1400 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1401 struct ctdb_node_map_old *nodemap)
1406 if (ctdb->num_nodes != nodemap->num) {
1407 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1408 ctdb->num_nodes, nodemap->num));
1412 for (j=0; j<nodemap->num; j++) {
1413 /* For readability */
1414 struct ctdb_node *node = ctdb->nodes[j];
1416 /* release any existing data */
1417 TALLOC_FREE(node->known_public_ips);
1418 TALLOC_FREE(node->available_public_ips);
1420 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1424 /* Retrieve the list of known public IPs from the node */
1425 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1430 &node->known_public_ips);
1433 ("Failed to read known public IPs from node: %u\n",
1438 if (ctdb->do_checkpublicip) {
1439 verify_remote_ip_allocation(ctdb,
1440 node->known_public_ips,
1444 /* Retrieve the list of available public IPs from the node */
1445 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1449 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1450 &node->available_public_ips);
1453 ("Failed to read available public IPs from node: %u\n",
1462 static struct public_ip_list *
1463 create_merged_ip_list(struct ctdb_context *ctdb)
1466 struct public_ip_list *ip_list;
1467 struct ctdb_public_ip_list_old *public_ips;
1469 if (ctdb->ip_tree != NULL) {
1470 talloc_free(ctdb->ip_tree);
1471 ctdb->ip_tree = NULL;
1473 ctdb->ip_tree = trbt_create(ctdb, 0);
1475 for (i=0;i<ctdb->num_nodes;i++) {
1476 public_ips = ctdb->nodes[i]->known_public_ips;
1478 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1482 /* there were no public ips for this node */
1483 if (public_ips == NULL) {
1487 for (j=0;j<public_ips->num;j++) {
1488 struct public_ip_list *tmp_ip;
1490 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1491 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1492 /* Do not use information about IP addresses hosted
1493 * on other nodes, it may not be accurate */
1494 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1495 tmp_ip->pnn = public_ips->ips[j].pnn;
1499 tmp_ip->addr = public_ips->ips[j].addr;
1500 tmp_ip->next = NULL;
1502 trbt_insertarray32_callback(ctdb->ip_tree,
1503 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1510 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1516 * This is the length of the longest common prefix between the IPs.
1517 * It is calculated by XOR-ing the 2 IPs together and counting the
1518 * number of leading zeroes. The implementation means that all
1519 * addresses end up being 128 bits long.
1521 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1522 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1523 * lots of nodes and IP addresses?
1525 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1527 uint32_t ip1_k[IP_KEYLEN];
1532 uint32_t distance = 0;
1534 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1536 for (i=0; i<IP_KEYLEN; i++) {
1537 x = ip1_k[i] ^ t[i];
1541 /* Count number of leading zeroes.
1542 * FIXME? This could be optimised...
1544 while ((x & (1 << 31)) == 0) {
1554 /* Calculate the IP distance for the given IP relative to IPs on the
1555 given node. The ips argument is generally the all_ips variable
1556 used in the main part of the algorithm.
/* Sum of squared ip_distance() values between ip and the entries of
 * ips that belong to node pnn.
 *
 * Entries whose pnn differs from the requested node are filtered out,
 * and the entry whose addr is the very object ip points at is treated
 * specially (see the optimisation note below).  For each remaining
 * entry d = ip_distance(ip, entry) is computed and d * d is added to
 * the running sum.
 */
1558 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1559 struct public_ip_list *ips,
1562 struct public_ip_list *t;
1567 for (t=ips; t != NULL; t=t->next) {
1568 if (t->pnn != pnn) {
1572 /* Optimisation: We never calculate the distance
1573 * between an address and itself. This allows us to
1574 * calculate the effect of removing an address from a
1575 * node by simply calculating the distance between
1576 * that address and all of the existing addresses.
1577 * Moreover, we assume that we're only ever dealing
1578 * with addresses from all_ips so we can identify an
1579 * address via a pointer rather than doing a more
1580 * expensive address comparison. */
1581 if (&(t->addr) == ip) {
1585 d = ip_distance(ip, &(t->addr));
1586 sum += d * d; /* Cheaper than pulling in math.h :-) */
1592 /* Return the LCP2 imbalance metric for addresses currently assigned
/* Compute the LCP2 imbalance of node pnn.
 *
 * Walks all_ips and, for each address checked against pnn, adds
 * ip_distance_2_sum() of that address against the remainder of the
 * list (t->next) rather than the whole list, so a given pair of
 * addresses is only visited from its earlier member.
 */
1595 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1597 struct public_ip_list *t;
1599 uint32_t imbalance = 0;
1601 for (t=all_ips; t!=NULL; t=t->next) {
1602 if (t->pnn != pnn) {
1605 /* Pass the rest of the IPs rather than the whole
1608 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1614 /* Allocate any unassigned IPs just by looping through the IPs and
1615 * finding the best node for each.
/* Try to place every unassigned address (pnn == -1) on a node.
 *
 * For each such entry find_takeover_node() is called; a non-zero
 * result is logged as a failure to find a node to cover the address.
 */
1617 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1618 struct ctdb_ipflags *ipflags,
1619 struct public_ip_list *all_ips)
1621 struct public_ip_list *tmp_ip;
1623 /* loop over all ip's and find a physical node to cover for
1626 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1627 if (tmp_ip->pnn == -1) {
1628 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1629 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1630 ctdb_addr_to_str(&tmp_ip->addr)));
1636 /* Basic non-deterministic rebalancing algorithm.
/* Even out the number of addresses per node (non-LCP2 algorithm).
 *
 * The node count is taken from the talloc array length of ipflags.
 * For each address, every node that passes can_node_takeover_ip() is
 * examined with node_ip_coverage() to find the nodes serving the most
 * (maxnode/maxnum) and the fewest (minnode/minnum) addresses.  When
 * maxnum > minnum + 1 and retries is still below num_ips + 5, an
 * address currently on maxnode is handed to find_takeover_node() for
 * reassignment.  A missing maxnode is logged as a warning.
 */
1638 static void basic_failback(struct ctdb_context *ctdb,
1639 struct ctdb_ipflags *ipflags,
1640 struct public_ip_list *all_ips,
1644 int maxnode, maxnum, minnode, minnum, num, retries;
1645 struct public_ip_list *tmp_ip;
1647 numnodes = talloc_array_length(ipflags);
1654 /* for each ip address, loop over all nodes that can serve
1655 this ip and make sure that the difference between the node
1656 serving the most and the node serving the least ip's are
1659 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1660 if (tmp_ip->pnn == -1) {
1664 /* Get the highest and lowest number of ips's served by any
1665 valid node which can serve this ip.
1669 for (i=0; i<numnodes; i++) {
1670 /* only check nodes that can actually serve this ip */
1671 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1672 /* no it couldnt so skip to the next node */
1676 num = node_ip_coverage(ctdb, i, all_ips);
1677 if (maxnode == -1) {
1686 if (minnode == -1) {
1696 if (maxnode == -1) {
1697 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1698 ctdb_addr_to_str(&tmp_ip->addr)));
1703 /* if the spread between the smallest and largest coverage by
1704 a node is >=2 we steal one of the ips from the node with
1705 most coverage to even things out a bit.
1706 try to do this a limited number of times since we dont
1707 want to spend too much time balancing the ip coverage.
1709 if ( (maxnum > minnum+1)
1710 && (retries < (num_ips + 5)) ){
1711 struct public_ip_list *tmp;
1713 /* Reassign one of maxnode's VNNs */
1714 for (tmp=all_ips;tmp;tmp=tmp->next) {
1715 if (tmp->pnn == maxnode) {
1716 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
/* Set up the per-node working arrays for the LCP2 algorithm.
 *
 * tmp_ctx is used as the talloc parent of both output arrays and is
 * also what CTDB_NO_MEMORY_FATAL is given on allocation failure.  The
 * array size is the talloc array length of ipflags.
 *
 * *lcp2_imbalances receives lcp2_imbalance(all_ips, i) for each node.
 * *rebalance_candidates starts out true for every node, is cleared for
 * every node that currently has an address assigned, and is set again
 * for each node listed in force_rebalance_nodes (when that is not
 * NULL).  Forced node numbers that are >= the node count are reported
 * instead of being used as an index.
 */
1725 static void lcp2_init(struct ctdb_context *tmp_ctx,
1726 struct ctdb_ipflags *ipflags,
1727 struct public_ip_list *all_ips,
1728 uint32_t *force_rebalance_nodes,
1729 uint32_t **lcp2_imbalances,
1730 bool **rebalance_candidates)
1733 struct public_ip_list *tmp_ip;
1735 numnodes = talloc_array_length(ipflags);
1737 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1738 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1739 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1740 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1742 for (i=0; i<numnodes; i++) {
1743 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1744 /* First step: assume all nodes are candidates */
1745 (*rebalance_candidates)[i] = true;
1748 /* 2nd step: if a node has IPs assigned then it must have been
1749 * healthy before, so we remove it from consideration. This
1750 * is overkill but is all we have because we don't maintain
1751 * state between takeover runs. An alternative would be to
1752 * keep state and invalidate it every time the recovery master
1755 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1756 if (tmp_ip->pnn != -1) {
1757 (*rebalance_candidates)[tmp_ip->pnn] = false;
1761 /* 3rd step: if a node is forced to re-balance then
1762 we allow failback onto the node */
1763 if (force_rebalance_nodes == NULL) {
1766 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1767 uint32_t pnn = force_rebalance_nodes[i];
1768 if (pnn >= numnodes) {
1770 (__location__ "unknown node %u\n", pnn));
1775 ("Forcing rebalancing of IPs to node %u\n", pnn));
1776 (*rebalance_candidates)[pnn] = true;
1780 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1781 * the IP/node combination that will cost the least.
/* Assign unassigned addresses (pnn == -1) using LCP2 costs.
 *
 * Repeats while unassigned addresses remain and should_loop is set.
 * Each pass considers every unassigned address against every node
 * that passes can_node_takeover_ip(), computing
 * dstdsum = ip_distance_2_sum(addr, all_ips, node) and
 * dstimbl = lcp2_imbalances[node] + dstdsum, and remembers the
 * candidate with the smallest dstdsum.  If a candidate was found, the
 * address is given to that node and lcp2_imbalances[] is updated with
 * the new imbalance.  The list is then rescanned to see whether any
 * address is still unassigned.
 *
 * After the loop, every address that is still unassigned is logged
 * as a failure to find a covering node.
 */
1783 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1784 struct ctdb_ipflags *ipflags,
1785 struct public_ip_list *all_ips,
1786 uint32_t *lcp2_imbalances)
1788 struct public_ip_list *tmp_ip;
1789 int dstnode, numnodes;
1792 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1793 struct public_ip_list *minip;
1795 bool should_loop = true;
1796 bool have_unassigned = true;
1798 numnodes = talloc_array_length(ipflags);
1800 while (have_unassigned && should_loop) {
1801 should_loop = false;
1803 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1804 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1810 /* loop over each unassigned ip. */
1811 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1812 if (tmp_ip->pnn != -1) {
1816 for (dstnode=0; dstnode<numnodes; dstnode++) {
1817 /* only check nodes that can actually takeover this ip */
1818 if (!can_node_takeover_ip(ctdb, dstnode,
1821 /* no it couldnt so skip to the next node */
1825 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1826 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1827 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1828 ctdb_addr_to_str(&(tmp_ip->addr)),
1830 dstimbl - lcp2_imbalances[dstnode]));
1833 if ((minnode == -1) || (dstdsum < mindsum)) {
1843 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1845 /* If we found one then assign it to the given node. */
1846 if (minnode != -1) {
1847 minip->pnn = minnode;
1848 lcp2_imbalances[minnode] = minimbl;
1849 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1850 ctdb_addr_to_str(&(minip->addr)),
1855 /* There might be a better way but at least this is clear. */
1856 have_unassigned = false;
1857 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1858 if (tmp_ip->pnn == -1) {
1859 have_unassigned = true;
1864 /* We know if we have an unassigned addresses so we might as
1867 if (have_unassigned) {
1868 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1869 if (tmp_ip->pnn == -1) {
1870 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1871 ctdb_addr_to_str(&tmp_ip->addr)));
1877 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1878 * to move IPs from, determines the best IP/destination node
1879 * combination to move from the source node.
/* Try to move one address away from srcnode to reduce imbalance.
 *
 * For every address on srcnode:
 *   srcdsum = ip_distance_2_sum(addr, all_ips, srcnode)
 *   srcimbl = lcp2_imbalances[srcnode] - srcdsum
 * and for every destination that is flagged in rebalance_candidates
 * and passes can_node_takeover_ip():
 *   dstdsum = ip_distance_2_sum(addr, all_ips, dstnode)
 *   dstimbl = lcp2_imbalances[dstnode] + dstdsum
 *
 * A move is acceptable when dstimbl < lcp2_imbalances[srcnode] and
 * dstdsum < srcdsum; among acceptable moves the one with the smallest
 * srcimbl + dstimbl is kept.  If one was found, both entries of
 * lcp2_imbalances[] are updated and the address's pnn is set to the
 * destination node.
 *
 * Returns bool - presumably true when a move was made (the return
 * statements are not visible here; confirm).
 */
1881 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1882 struct ctdb_ipflags *ipflags,
1883 struct public_ip_list *all_ips,
1885 uint32_t *lcp2_imbalances,
1886 bool *rebalance_candidates)
1888 int dstnode, mindstnode, numnodes;
1889 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1890 uint32_t minsrcimbl, mindstimbl;
1891 struct public_ip_list *minip;
1892 struct public_ip_list *tmp_ip;
1894 /* Find an IP and destination node that best reduces imbalance. */
1901 numnodes = talloc_array_length(ipflags);
1903 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1904 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1905 srcnode, lcp2_imbalances[srcnode]));
1907 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1908 /* Only consider addresses on srcnode. */
1909 if (tmp_ip->pnn != srcnode) {
1913 /* What is this IP address costing the source node? */
1914 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1915 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1917 /* Consider this IP address would cost each potential
1918 * destination node. Destination nodes are limited to
1919 * those that are newly healthy, since we don't want
1920 * to do gratuitous failover of IPs just to make minor
1921 * balance improvements.
1923 for (dstnode=0; dstnode<numnodes; dstnode++) {
1924 if (!rebalance_candidates[dstnode]) {
1928 /* only check nodes that can actually takeover this ip */
1929 if (!can_node_takeover_ip(ctdb, dstnode,
1930 ipflags[dstnode], tmp_ip)) {
1931 /* no it couldnt so skip to the next node */
1935 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1936 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1937 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1939 ctdb_addr_to_str(&(tmp_ip->addr)),
1942 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1943 (dstdsum < srcdsum) && \
1944 ((mindstnode == -1) || \
1945 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1948 minsrcimbl = srcimbl;
1949 mindstnode = dstnode;
1950 mindstimbl = dstimbl;
1954 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1956 if (mindstnode != -1) {
1957 /* We found a move that makes things better... */
1958 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1959 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1960 ctdb_addr_to_str(&(minip->addr)),
1961 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1964 lcp2_imbalances[srcnode] = minsrcimbl;
1965 lcp2_imbalances[mindstnode] = mindstimbl;
1966 minip->pnn = mindstnode;
/* Element type of the array that lcp2_failback() sorts with qsort();
 * it carries an imbalance value for a node (the member list is not
 * visible here - presumably it also holds the node number; confirm).
 */
1975 struct lcp2_imbalance_pnn {
/* qsort() comparison function for struct lcp2_imbalance_pnn.
 *
 * Compares the imbalance members of the two elements, distinguishing
 * the greater-than, equal and (implicitly) less-than cases.
 * NOTE(review): the returned values are not visible here; presumably
 * larger imbalances sort first, since lcp2_failback() wants the most
 * imbalanced node at the front - confirm.
 */
1980 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1982 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1983 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1985 if (lipa->imbalance > lipb->imbalance) {
1987 } else if (lipa->imbalance == lipb->imbalance) {
1994 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1995 * node with the highest LCP2 imbalance, and then determines the best
1996 * IP/destination node combination to move from the source node.
1998 static void lcp2_failback(struct ctdb_context *ctdb,
1999 struct ctdb_ipflags *ipflags,
2000 struct public_ip_list *all_ips,
2001 uint32_t *lcp2_imbalances,
2002 bool *rebalance_candidates)
2005 struct lcp2_imbalance_pnn * lips;
2008 numnodes = talloc_array_length(ipflags);
2011 /* Put the imbalances and nodes into an array, sort them and
2012 * iterate through candidates. Usually the 1st one will be
2013 * used, so this doesn't cost much...
2015 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2016 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2017 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2018 for (i=0; i<numnodes; i++) {
2019 lips[i].imbalance = lcp2_imbalances[i];
2021 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2023 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2024 lcp2_cmp_imbalance_pnn);
2027 for (i=0; i<numnodes; i++) {
2028 /* This means that all nodes had 0 or 1 addresses, so
2029 * can't be imbalanced.
2031 if (lips[i].imbalance == 0) {
2035 if (lcp2_failback_candidate(ctdb,
2040 rebalance_candidates)) {
2052 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2053 struct ctdb_ipflags *ipflags,
2054 struct public_ip_list *all_ips)
2056 struct public_ip_list *tmp_ip;
2058 /* verify that the assigned nodes can serve that public ip
2059 and set it to -1 if not
2061 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2062 if (tmp_ip->pnn == -1) {
2065 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2066 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2067 /* this node can not serve this ip. */
2068 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2069 ctdb_addr_to_str(&(tmp_ip->addr)),
/* "Deterministic IPs" allocation.
 *
 * Every address is first assigned to node (index in list) modulo the
 * node count, where the node count is the talloc array length of
 * ipflags.  A warning is logged when the no_ip_failback tunable is 1,
 * because it has no effect in this mode.  Assignments the node can
 * not honour are then removed with unassign_unsuitable_ips() and the
 * resulting unassigned addresses are placed with
 * basic_allocate_unassigned().  No failback step is run.
 */
2076 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2077 struct ctdb_ipflags *ipflags,
2078 struct public_ip_list *all_ips)
2080 struct public_ip_list *tmp_ip;
2083 numnodes = talloc_array_length(ipflags);
2085 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2086 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2087 * always be allocated the same way for a specific set of
2088 * available/unavailable nodes.
2091 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2092 tmp_ip->pnn = i % numnodes;
2095 /* IP failback doesn't make sense with deterministic
2096 * IPs, since the modulo step above implicitly fails
2097 * back IPs to their "home" node.
2099 if (1 == ctdb->tunable.no_ip_failback) {
2100 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2103 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2105 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2107 /* No failback here! */
/* Default (non-deterministic, non-LCP2) allocation.
 *
 * Walks all_ips once up front (the loop body is not visible here;
 * presumably it counts the addresses into num_ips - confirm), removes
 * unsuitable assignments with unassign_unsuitable_ips(), places
 * unassigned addresses with basic_allocate_unassigned() and finally
 * calls basic_failback() with num_ips.  The no_ip_failback tunable is
 * tested (== 1) before the failback step.
 */
2110 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2111 struct ctdb_ipflags *ipflags,
2112 struct public_ip_list *all_ips)
2114 /* This should be pushed down into basic_failback. */
2115 struct public_ip_list *tmp_ip;
2117 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2121 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2123 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2125 /* If we don't want IPs to fail back then don't rebalance IPs. */
2126 if (1 == ctdb->tunable.no_ip_failback) {
2130 /* Now, try to make sure the ip adresses are evenly distributed
2133 basic_failback(ctdb, ipflags, all_ips, num_ips);
/* LCP2 allocation.
 *
 * All scratch data lives on a temporary talloc context that is freed
 * at the end of the function.  Steps, in order:
 *   - unassign_unsuitable_ips()
 *   - lcp2_init() to build lcp2_imbalances and rebalance_candidates
 *   - lcp2_allocate_unassigned()
 *   - a test of the no_ip_failback tunable (== 1)
 *   - a count of the nodes flagged in rebalance_candidates, with a
 *     test for the count being 0
 *   - lcp2_failback()
 */
2136 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2137 struct ctdb_ipflags *ipflags,
2138 struct public_ip_list *all_ips,
2139 uint32_t *force_rebalance_nodes)
2141 uint32_t *lcp2_imbalances;
2142 bool *rebalance_candidates;
2143 int numnodes, num_rebalance_candidates, i;
2145 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2147 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2149 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2150 &lcp2_imbalances, &rebalance_candidates);
2152 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2154 /* If we don't want IPs to fail back then don't rebalance IPs. */
2155 if (1 == ctdb->tunable.no_ip_failback) {
2159 /* It is only worth continuing if we have suitable target
2160 * nodes to transfer IPs to. This check is much cheaper than
2163 numnodes = talloc_array_length(ipflags);
2164 num_rebalance_candidates = 0;
2165 for (i=0; i<numnodes; i++) {
2166 if (rebalance_candidates[i]) {
2167 num_rebalance_candidates++;
2170 if (num_rebalance_candidates == 0) {
2174 /* Now, try to make sure the ip adresses are evenly distributed
2177 lcp2_failback(ctdb, ipflags, all_ips,
2178 lcp2_imbalances, rebalance_candidates);
2181 talloc_free(tmp_ctx);
/* Scan the node map for a node that has neither NODE_FLAGS_INACTIVE
 * nor NODE_FLAGS_DISABLED set.  The return statements are not visible
 * here; going by the function name it presumably reports true only
 * when no such node exists - confirm.
 */
2184 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2188 for (i=0;i<nodemap->num;i++) {
2189 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2190 /* Found one completely healthy node */
2198 /* The calculation part of the IP allocation algorithm. */
/* Compute the new address-to-node assignment.
 *
 * *all_ips_p is set to the list produced by create_merged_ip_list().
 * The allocation algorithm is chosen from the tunables:
 * lcp2_public_ip_assignment == 1 selects ip_alloc_lcp2(), otherwise
 * deterministic_public_ips == 1 selects ip_alloc_deterministic_ips(),
 * otherwise ip_alloc_nondeterministic_ips() is used.
 * force_rebalance_nodes is only passed on to the LCP2 variant.
 */
2199 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2200 struct ctdb_ipflags *ipflags,
2201 struct public_ip_list **all_ips_p,
2202 uint32_t *force_rebalance_nodes)
2204 /* since nodes only know about those public addresses that
2205 can be served by that particular node, no single node has
2206 a full list of all public addresses that exist in the cluster.
2207 Walk over all node structures and create a merged list of
2208 all public addresses that exist in the cluster.
2210 keep the tree of ips around as ctdb->ip_tree
2212 *all_ips_p = create_merged_ip_list(ctdb);
2214 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2215 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2216 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2217 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2219 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2222 /* at this point ->pnn is the node which will own each IP
2223 or -1 if there is no node that can cover this ip
/* State shared between get_tunable_from_nodes() and its callbacks.
 * Besides the tunable name shown here, the callbacks also use an
 * "out" array of per-node values and a "fatal" flag (see
 * get_tunable_from_nodes()).
 */
2229 struct get_tunable_callback_data {
2230 const char *tunable;
/* Per-node success callback for the GET_TUNABLE control.
 *
 * The reply payload must be exactly sizeof(uint32_t) bytes, otherwise
 * an error is logged.  The size of cd->out (its talloc array length)
 * is used to report replies from nodes beyond the node map.  A valid
 * reply is stored in cd->out[pnn].
 */
2235 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2236 int32_t res, TDB_DATA outdata,
2239 struct get_tunable_callback_data *cd =
2240 (struct get_tunable_callback_data *)callback;
2244 /* Already handled in fail callback */
2248 if (outdata.dsize != sizeof(uint32_t)) {
2249 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2250 cd->tunable, pnn, (int)sizeof(uint32_t),
2251 (int)outdata.dsize));
2256 size = talloc_array_length(cd->out);
2258 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2259 cd->tunable, pnn, size));
2264 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/* Per-node failure callback for the GET_TUNABLE control.
 *
 * Logs one of three messages for the failing node: the request timed
 * out, the tunable is not implemented on that node, or an unexpected
 * error occurred.  The conditions selecting each message are not
 * visible here; presumably they switch on res and some cases set
 * cd->fatal - confirm.
 */
2267 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2268 int32_t res, TDB_DATA outdata,
2271 struct get_tunable_callback_data *cd =
2272 (struct get_tunable_callback_data *)callback;
2277 ("Timed out getting tunable \"%s\" from node %d\n",
2283 DEBUG(DEBUG_WARNING,
2284 ("Tunable \"%s\" not implemented on node %d\n",
2289 ("Unexpected error getting tunable \"%s\" from node %d\n",
/* Fetch the value of one tunable from the connected nodes.
 *
 * Allocates an array of nodemap->num values on tmp_ctx, pre-filled
 * with default_value, builds a struct ctdb_control_get_tunable
 * request carrying the NUL-terminated tunable name, and sends
 * CTDB_CONTROL_GET_TUNABLE to the nodes returned by
 * list_of_connected_nodes() with TAKEOVER_TIMEOUT().  Replies are
 * stored by get_tunable_callback(); failures are reported through
 * get_tunable_fail_callback().  When the control fails,
 * callback_data.fatal is consulted.  The request buffer is freed
 * before returning.
 *
 * Allocation failure of the value array is handled by
 * CTDB_NO_MEMORY_NULL.
 *
 * NOTE(review): the talloc_size() result (data.dptr) is written
 * through without a visible NULL check - confirm.
 */
2295 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2296 TALLOC_CTX *tmp_ctx,
2297 struct ctdb_node_map_old *nodemap,
2298 const char *tunable,
2299 uint32_t default_value)
2302 struct ctdb_control_get_tunable *t;
2305 struct get_tunable_callback_data callback_data;
2308 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2309 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2310 for (i=0; i<nodemap->num; i++) {
2311 tvals[i] = default_value;
2314 callback_data.out = tvals;
2315 callback_data.tunable = tunable;
2316 callback_data.fatal = false;
2318 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2319 data.dptr = talloc_size(tmp_ctx, data.dsize);
2320 t = (struct ctdb_control_get_tunable *)data.dptr;
2321 t->length = strlen(tunable)+1;
2322 memcpy(t->name, tunable, t->length);
2323 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2324 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2325 nodes, 0, TAKEOVER_TIMEOUT(),
2327 get_tunable_callback,
2328 get_tunable_fail_callback,
2329 &callback_data) != 0) {
2330 if (callback_data.fatal) {
2336 talloc_free(data.dptr);
2341 /* Set internal flags for IP allocation:
2343 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2344 * Set NOIPHOST ip flag for each INACTIVE node
2345 * if all nodes are disabled:
2346 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2348 * Set NOIPHOST ip flags for disabled nodes
/* Derive the per-node IP allocation flags.
 *
 * Returns a zero-initialised array of nodemap->num entries allocated
 * on tmp_ctx (CTDB_NO_MEMORY_NULL handles allocation failure).
 *
 *   noiptakeover - set when tval_noiptakeover[i] is non-zero
 *   noiphost     - set when the node is INACTIVE; additionally, if
 *                  all_nodes_are_disabled(), set when
 *                  tval_noiphostonalldisabled[i] is non-zero,
 *                  otherwise set when the node is DISABLED
 *
 * Both tunable arrays are indexed by position in nodemap.
 */
2350 static struct ctdb_ipflags *
2351 set_ipflags_internal(struct ctdb_context *ctdb,
2352 TALLOC_CTX *tmp_ctx,
2353 struct ctdb_node_map_old *nodemap,
2354 uint32_t *tval_noiptakeover,
2355 uint32_t *tval_noiphostonalldisabled)
2358 struct ctdb_ipflags *ipflags;
2360 /* Clear IP flags - implicit due to talloc_zero */
2361 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2362 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2364 for (i=0;i<nodemap->num;i++) {
2365 /* Can not take IPs on node with NoIPTakeover set */
2366 if (tval_noiptakeover[i] != 0) {
2367 ipflags[i].noiptakeover = true;
2370 /* Can not host IPs on INACTIVE node */
2371 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2372 ipflags[i].noiphost = true;
2376 if (all_nodes_are_disabled(nodemap)) {
2377 /* If all nodes are disabled, can not host IPs on node
2378 * with NoIPHostOnAllDisabled set
2380 for (i=0;i<nodemap->num;i++) {
2381 if (tval_noiphostonalldisabled[i] != 0) {
2382 ipflags[i].noiphost = true;
2386 /* If some nodes are not disabled, then can not host
2387 * IPs on DISABLED node
2389 for (i=0;i<nodemap->num;i++) {
2390 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2391 ipflags[i].noiphost = true;
/* Collect the per-node tunables needed for IP allocation and turn
 * them into the ipflags array.
 *
 * Two tunables are fetched with get_tunable_from_nodes(); the second
 * is "NoIPHostOnAllDisabled" with default 0.  Each result is tested
 * for NULL.  Both arrays are passed to set_ipflags_internal() and
 * freed afterwards.  All allocations hang off tmp_ctx, which the
 * caller frees.
 */
2399 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2400 TALLOC_CTX *tmp_ctx,
2401 struct ctdb_node_map_old *nodemap)
2403 uint32_t *tval_noiptakeover;
2404 uint32_t *tval_noiphostonalldisabled;
2405 struct ctdb_ipflags *ipflags;
2408 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2410 if (tval_noiptakeover == NULL) {
2414 tval_noiphostonalldisabled =
2415 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2416 "NoIPHostOnAllDisabled", 0);
2417 if (tval_noiphostonalldisabled == NULL) {
2418 /* Caller frees tmp_ctx */
2422 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2424 tval_noiphostonalldisabled);
2426 talloc_free(tval_noiptakeover);
2427 talloc_free(tval_noiphostonalldisabled);
/* State handed to iprealloc_fail_callback(): the caller's own failure
 * callback and its data, plus the node map used to look up node
 * flags.  ctdb_takeover_run() also fills in retry_nodes and
 * retry_count members (their declarations are not visible here).
 */
2432 struct iprealloc_callback_data {
2435 client_async_callback fail_callback;
2436 void *fail_callback_data;
2437 struct ctdb_node_map_old *nodemap;
2440 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2441 int32_t res, TDB_DATA outdata,
2445 struct iprealloc_callback_data *cd =
2446 (struct iprealloc_callback_data *)callback;
2448 numnodes = talloc_array_length(cd->retry_nodes);
2449 if (pnn > numnodes) {
2451 ("ipreallocated failure from node %d, "
2452 "but only %d nodes in nodemap\n",
2457 /* Can't run the "ipreallocated" event on a INACTIVE node */
2458 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2459 DEBUG(DEBUG_WARNING,
2460 ("ipreallocated failed on inactive node %d, ignoring\n",
2467 /* If the control timed out then that's a real error,
2468 * so call the real fail callback
2470 if (cd->fail_callback) {
2471 cd->fail_callback(ctdb, pnn, res, outdata,
2472 cd->fail_callback_data);
2474 DEBUG(DEBUG_WARNING,
2475 ("iprealloc timed out but no callback registered\n"));
2479 /* If not a timeout then either the ipreallocated
2480 * eventscript (or some setup) failed. This might
2481 * have failed because the IPREALLOCATED control isn't
2482 * implemented - right now there is no way of knowing
2483 * because the error codes are all folded down to -1.
2484 * Consider retrying using EVENTSCRIPT control...
2486 DEBUG(DEBUG_WARNING,
2487 ("ipreallocated failure from node %d, flagging retry\n",
2489 cd->retry_nodes[pnn] = true;
/* State handed to takeover_run_fail_callback(): the caller's failure
 * callback and its data, plus the node map used to translate a PNN
 * into a node index.  The callback also uses a node_failed array
 * (its declaration is not visible here).
 */
2494 struct takeover_callback_data {
2496 client_async_callback fail_callback;
2497 void *fail_callback_data;
2498 struct ctdb_node_map_old *nodemap;
2501 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2502 uint32_t node_pnn, int32_t res,
2503 TDB_DATA outdata, void *callback_data)
2505 struct takeover_callback_data *cd =
2506 talloc_get_type_abort(callback_data,
2507 struct takeover_callback_data);
2510 for (i = 0; i < cd->nodemap->num; i++) {
2511 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2516 if (i == cd->nodemap->num) {
2517 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2521 if (!cd->node_failed[i]) {
2522 cd->node_failed[i] = true;
2523 cd->fail_callback(ctdb, node_pnn, res, outdata,
2524 cd->fail_callback_data);
2529 make any IP alias changes for public addresses that are necessary
/* Recalculate the public IP layout and push it out to the cluster.
 *
 * All scratch data is allocated on a temporary talloc context that is
 * freed on the error paths shown and at the end.  Phases, in order:
 *
 *   1. The disable_ip_failover tunable is tested; per the comment
 *      below, only the ipreallocated event is sent in that case.
 *   2. set_ipflags() builds the per-node flags; failure aborts.
 *   3. ctdb_reload_remote_public_ips() refreshes each node's lists.
 *   4. If no node has available_public_ips, a warning is logged.
 *   5. ctdb_takeover_run_core() computes the new assignment.
 *   6. CTDB_CONTROL_RELEASE_IP is sent for each address to nodes
 *      that are not DISCONNECTED and are not the assigned owner;
 *      failures go through takeover_run_fail_callback().
 *   7. CTDB_CONTROL_TAKEOVER_IP is sent for every assigned address
 *      to its owner; failures go straight to fail_callback.
 *   8. CTDB_CONTROL_IPREALLOCATED is sent to the connected nodes;
 *      nodes flagged by iprealloc_fail_callback() are retried with
 *      CTDB_CONTROL_RUN_EVENTSCRIPTS and the "ipreallocated" event.
 *
 * nodemap:               node map used for flags and PNNs
 * force_rebalance_nodes: passed to ctdb_takeover_run_core()
 * fail_callback,
 * callback_data:         caller's per-node failure notification
 *
 * Returns int; the return statements are not visible here
 * (presumably 0 on success and -1 on failure - confirm).
 */
2531 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2532 uint32_t *force_rebalance_nodes,
2533 client_async_callback fail_callback, void *callback_data)
2536 struct ctdb_public_ip ip;
2538 struct public_ip_list *all_ips, *tmp_ip;
2540 struct timeval timeout;
2541 struct client_async_data *async_data;
2542 struct ctdb_client_control_state *state;
2543 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2544 struct ctdb_ipflags *ipflags;
2545 struct takeover_callback_data *takeover_data;
2546 struct iprealloc_callback_data iprealloc_data;
2551 * ip failover is completely disabled, just send out the
2552 * ipreallocated event.
2554 if (ctdb->tunable.disable_ip_failover != 0) {
2558 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2559 if (ipflags == NULL) {
2560 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2561 talloc_free(tmp_ctx);
2565 /* Fetch known/available public IPs from each active node */
2566 ret = ctdb_reload_remote_public_ips(ctdb, nodemap);
2568 talloc_free(tmp_ctx);
2572 /* Short-circuit IP allocation if no node has available IPs */
2573 can_host_ips = false;
2574 for (i=0; i < ctdb->num_nodes; i++) {
2575 if (ctdb->nodes[i]->available_public_ips != NULL) {
2576 can_host_ips = true;
2579 if (!can_host_ips) {
2580 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2584 /* Do the IP reassignment calculations */
2585 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2587 /* Now tell all nodes to release any public IPs should not
2588 * host. This will be a NOOP on nodes that don't currently
2589 * hold the given IP.
2591 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2592 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2594 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2595 bool, nodemap->num);
2596 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2597 takeover_data->fail_callback = fail_callback;
2598 takeover_data->fail_callback_data = callback_data;
2599 takeover_data->nodemap = nodemap;
2601 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2602 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2604 async_data->fail_callback = takeover_run_fail_callback;
2605 async_data->callback_data = takeover_data;
2607 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2609 /* Send a RELEASE_IP to all nodes that should not be hosting
2610 * each IP. For each IP, all but one of these will be
2611 * redundant. However, the redundant ones are used to tell
2612 * nodes which node should be hosting the IP so that commands
2613 * like "ctdb ip" can display a particular nodes idea of who
2614 * is hosting what. */
2615 for (i=0;i<nodemap->num;i++) {
2616 /* don't talk to unconnected nodes, but do talk to banned nodes */
2617 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2621 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2622 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2623 /* This node should be serving this
2624 vnn so don't tell it to release the ip
2628 ip.pnn = tmp_ip->pnn;
2629 ip.addr = tmp_ip->addr;
2631 timeout = TAKEOVER_TIMEOUT();
2632 data.dsize = sizeof(ip);
2633 data.dptr = (uint8_t *)&ip;
2634 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2635 0, CTDB_CONTROL_RELEASE_IP, 0,
2638 if (state == NULL) {
2639 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2640 talloc_free(tmp_ctx);
2644 ctdb_client_async_add(async_data, state);
2647 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2648 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2649 talloc_free(tmp_ctx);
2652 talloc_free(async_data);
2655 /* For each IP, send a TAKEOVER_IP to the node that should be
2656 * hosting it. Many of these will often be redundant (since
2657 * the allocation won't have changed) but they can be useful
2658 * to recover from inconsistencies. */
2659 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2660 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2662 async_data->fail_callback = fail_callback;
2663 async_data->callback_data = callback_data;
2665 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2666 if (tmp_ip->pnn == -1) {
2667 /* this IP won't be taken over */
2671 ip.pnn = tmp_ip->pnn;
2672 ip.addr = tmp_ip->addr;
2674 timeout = TAKEOVER_TIMEOUT();
2675 data.dsize = sizeof(ip);
2676 data.dptr = (uint8_t *)&ip;
2677 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2678 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2679 data, async_data, &timeout, NULL);
2680 if (state == NULL) {
2681 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2682 talloc_free(tmp_ctx);
2686 ctdb_client_async_add(async_data, state);
2688 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2689 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2690 talloc_free(tmp_ctx);
2696 * Tell all nodes to run eventscripts to process the
2697 * "ipreallocated" event. This can do a lot of things,
2698 * including restarting services to reconfigure them if public
2699 * IPs have moved. Once upon a time this event only used to
2702 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2703 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2704 iprealloc_data.retry_nodes = retry_data;
2705 iprealloc_data.retry_count = 0;
2706 iprealloc_data.fail_callback = fail_callback;
2707 iprealloc_data.fail_callback_data = callback_data;
2708 iprealloc_data.nodemap = nodemap;
2710 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2711 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2712 nodes, 0, TAKEOVER_TIMEOUT(),
2714 NULL, iprealloc_fail_callback,
2717 /* If the control failed then we should retry to any
2718 * nodes flagged by iprealloc_fail_callback using the
2719 * EVENTSCRIPT control. This is a best-effort at
2720 * backward compatiblity when running a mixed cluster
2721 * where some nodes have not yet been upgraded to
2722 * support the IPREALLOCATED control.
2724 DEBUG(DEBUG_WARNING,
2725 ("Retry ipreallocated to some nodes using eventscript control\n"));
2727 nodes = talloc_array(tmp_ctx, uint32_t,
2728 iprealloc_data.retry_count);
2729 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2732 for (i=0; i<nodemap->num; i++) {
2733 if (iprealloc_data.retry_nodes[i]) {
2739 data.dptr = discard_const("ipreallocated");
2740 data.dsize = strlen((char *)data.dptr) + 1;
2741 ret = ctdb_client_async_control(ctdb,
2742 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2743 nodes, 0, TAKEOVER_TIMEOUT(),
2745 NULL, fail_callback,
2748 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2752 talloc_free(tmp_ctx);
2758 destroy a ctdb_client_ip structure
/* talloc destructor for struct ctdb_client_ip (installed with
 * talloc_set_destructor() in ctdb_control_tcp_client()).
 *
 * Logs the address and port being dropped and unlinks the entry from
 * ip->ctdb->client_ip_list.
 */
2760 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2762 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2763 ctdb_addr_to_str(&ip->addr),
2764 ntohs(ip->addr.ip.sin_port),
2767 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2772 called by a client to inform us of a TCP connection that it is managing
2773 that should be tickled with an ACK when IP takeover is done
2775 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2778 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2779 struct ctdb_connection *tcp_sock = NULL;
2780 struct ctdb_tcp_list *tcp;
2781 struct ctdb_connection t;
2784 struct ctdb_client_ip *ip;
2785 struct ctdb_vnn *vnn;
2786 ctdb_sock_addr addr;
2788 /* If we don't have public IPs, tickles are useless */
2789 if (ctdb->vnn == NULL) {
2793 tcp_sock = (struct ctdb_connection *)indata.dptr;
2795 addr = tcp_sock->src;
2796 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2797 addr = tcp_sock->dst;
2798 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2801 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2802 vnn = find_public_ip_vnn(ctdb, &addr);
2804 switch (addr.sa.sa_family) {
2806 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2807 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2808 ctdb_addr_to_str(&addr)));
2812 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2813 ctdb_addr_to_str(&addr)));
2816 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2822 if (vnn->pnn != ctdb->pnn) {
2823 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2824 ctdb_addr_to_str(&addr),
2825 client_id, client->pid));
2826 /* failing this call will tell smbd to die */
2830 ip = talloc(client, struct ctdb_client_ip);
2831 CTDB_NO_MEMORY(ctdb, ip);
2835 ip->client_id = client_id;
2836 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2837 DLIST_ADD(ctdb->client_ip_list, ip);
2839 tcp = talloc(client, struct ctdb_tcp_list);
2840 CTDB_NO_MEMORY(ctdb, tcp);
2842 tcp->connection.src = tcp_sock->src;
2843 tcp->connection.dst = tcp_sock->dst;
2845 DLIST_ADD(client->tcp_list, tcp);
2847 t.src = tcp_sock->src;
2848 t.dst = tcp_sock->dst;
2850 data.dptr = (uint8_t *)&t;
2851 data.dsize = sizeof(t);
2853 switch (addr.sa.sa_family) {
2855 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2856 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2857 ctdb_addr_to_str(&tcp_sock->src),
2858 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2861 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2862 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2863 ctdb_addr_to_str(&tcp_sock->src),
2864 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2867 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2871 /* tell all nodes about this tcp connection */
2872 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2873 CTDB_CONTROL_TCP_ADD,
2874 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2876 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2884 find a tcp address on a list
2886 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2887 struct ctdb_connection *tcp)
2891 if (array == NULL) {
2895 for (i=0;i<array->num;i++) {
2896 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2897 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2898 return &array->connections[i];
2907 called by a daemon to inform us of a TCP connection that one of its
2908 clients managing that should tickled with an ACK when IP takeover is
2911 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2913 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2914 struct ctdb_tcp_array *tcparray;
2915 struct ctdb_connection tcp;
2916 struct ctdb_vnn *vnn;
2918 /* If we don't have public IPs, tickles are useless */
2919 if (ctdb->vnn == NULL) {
2923 vnn = find_public_ip_vnn(ctdb, &p->dst);
2925 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2926 ctdb_addr_to_str(&p->dst)));
2932 tcparray = vnn->tcp_array;
2934 /* If this is the first tickle */
2935 if (tcparray == NULL) {
2936 tcparray = talloc(vnn, struct ctdb_tcp_array);
2937 CTDB_NO_MEMORY(ctdb, tcparray);
2938 vnn->tcp_array = tcparray;
2941 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2942 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2944 tcparray->connections[tcparray->num].src = p->src;
2945 tcparray->connections[tcparray->num].dst = p->dst;
2948 if (tcp_update_needed) {
2949 vnn->tcp_update_needed = true;
2955 /* Do we already have this tickle ?*/
2958 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2959 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2960 ctdb_addr_to_str(&tcp.dst),
2961 ntohs(tcp.dst.ip.sin_port),
2966 /* A new tickle, we must add it to the array */
2967 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2968 struct ctdb_connection,
2970 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2972 tcparray->connections[tcparray->num].src = p->src;
2973 tcparray->connections[tcparray->num].dst = p->dst;
2976 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2977 ctdb_addr_to_str(&tcp.dst),
2978 ntohs(tcp.dst.ip.sin_port),
2981 if (tcp_update_needed) {
2982 vnn->tcp_update_needed = true;
2990 called by a daemon to inform us of a TCP connection that one of its
2991 clients managing that should tickled with an ACK when IP takeover is
2994 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
2996 struct ctdb_connection *tcpp;
2997 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3000 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3001 ctdb_addr_to_str(&conn->dst)));
3005 /* if the array is empty we cant remove it
3006 and we don't need to do anything
3008 if (vnn->tcp_array == NULL) {
3009 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3010 ctdb_addr_to_str(&conn->dst),
3011 ntohs(conn->dst.ip.sin_port)));
3016 /* See if we know this connection
3017 if we don't know this connection then we dont need to do anything
3019 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3021 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3022 ctdb_addr_to_str(&conn->dst),
3023 ntohs(conn->dst.ip.sin_port)));
3028 /* We need to remove this entry from the array.
3029 Instead of allocating a new array and copying data to it
3030 we cheat and just copy the last entry in the existing array
3031 to the entry that is to be removed and just shring the
3034 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3035 vnn->tcp_array->num--;
3037 /* If we deleted the last entry we also need to remove the entire array
3039 if (vnn->tcp_array->num == 0) {
3040 talloc_free(vnn->tcp_array);
3041 vnn->tcp_array = NULL;
3044 vnn->tcp_update_needed = true;
3046 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3047 ctdb_addr_to_str(&conn->src),
3048 ntohs(conn->src.ip.sin_port)));
3053 called by a daemon to inform us of a TCP connection that one of its
3054 clients used are no longer needed in the tickle database
3056 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3058 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3060 /* If we don't have public IPs, tickles are useless */
3061 if (ctdb->vnn == NULL) {
3065 ctdb_remove_connection(ctdb, conn);
3072 Called when another daemon starts - causes all tickles for all
3073 public addresses we are serving to be sent to the new node on the
3074 next check. This actually causes the next scheduled call to
3075 tdb_update_tcp_tickles() to update all nodes. This is simple and
3076 doesn't require careful error handling.
3078 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3080 struct ctdb_vnn *vnn;
3082 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3083 (unsigned long) pnn));
3085 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3086 vnn->tcp_update_needed = true;
3094 called when a client structure goes away - hook to remove
3095 elements from the tcp_list in all daemons
3097 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3099 while (client->tcp_list) {
3100 struct ctdb_tcp_list *tcp = client->tcp_list;
3101 DLIST_REMOVE(client->tcp_list, tcp);
3102 ctdb_remove_connection(client->ctdb, &tcp->connection);
3107 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3109 struct ctdb_vnn *vnn;
3112 if (ctdb->tunable.disable_ip_failover == 1) {
3116 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3117 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3118 ctdb_vnn_unassign_iface(ctdb, vnn);
3125 /* Don't allow multiple releases at once. Some code,
3126 * particularly ctdb_tickle_sentenced_connections() is
3128 if (vnn->update_in_flight) {
3129 DEBUG(DEBUG_WARNING,
3131 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3132 ctdb_addr_to_str(&vnn->public_address),
3133 vnn->public_netmask_bits,
3134 ctdb_vnn_iface_string(vnn)));
3137 vnn->update_in_flight = true;
3139 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3140 ctdb_addr_to_str(&vnn->public_address),
3141 vnn->public_netmask_bits,
3142 ctdb_vnn_iface_string(vnn)));
3144 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3145 ctdb_vnn_iface_string(vnn),
3146 ctdb_addr_to_str(&vnn->public_address),
3147 vnn->public_netmask_bits);
3148 release_kill_clients(ctdb, &vnn->public_address);
3149 ctdb_vnn_unassign_iface(ctdb, vnn);
3150 vnn->update_in_flight = false;
3154 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3159 get list of public IPs
3161 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3162 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3165 struct ctdb_public_ip_list_old *ips;
3166 struct ctdb_vnn *vnn;
3167 bool only_available = false;
3169 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3170 only_available = true;
3173 /* count how many public ip structures we have */
3175 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3179 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3180 num*sizeof(struct ctdb_public_ip);
3181 ips = talloc_zero_size(outdata, len);
3182 CTDB_NO_MEMORY(ctdb, ips);
3185 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3186 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3189 ips->ips[i].pnn = vnn->pnn;
3190 ips->ips[i].addr = vnn->public_address;
3194 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3195 i*sizeof(struct ctdb_public_ip);
3197 outdata->dsize = len;
3198 outdata->dptr = (uint8_t *)ips;
3204 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3205 struct ctdb_req_control_old *c,
3210 ctdb_sock_addr *addr;
3211 struct ctdb_public_ip_info_old *info;
3212 struct ctdb_vnn *vnn;
3214 addr = (ctdb_sock_addr *)indata.dptr;
3216 vnn = find_public_ip_vnn(ctdb, addr);
3218 /* if it is not a public ip it could be our 'single ip' */
3219 if (ctdb->single_ip_vnn) {
3220 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3221 vnn = ctdb->single_ip_vnn;
3226 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3227 "'%s'not a public address\n",
3228 ctdb_addr_to_str(addr)));
3232 /* count how many public ip structures we have */
3234 for (;vnn->ifaces[num];) {
3238 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3239 num*sizeof(struct ctdb_iface);
3240 info = talloc_zero_size(outdata, len);
3241 CTDB_NO_MEMORY(ctdb, info);
3243 info->ip.addr = vnn->public_address;
3244 info->ip.pnn = vnn->pnn;
3245 info->active_idx = 0xFFFFFFFF;
3247 for (i=0; vnn->ifaces[i]; i++) {
3248 struct ctdb_interface *cur;
3250 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3252 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3256 if (vnn->iface == cur) {
3257 info->active_idx = i;
3259 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3260 info->ifaces[i].link_state = cur->link_up;
3261 info->ifaces[i].references = cur->references;
3264 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3265 i*sizeof(struct ctdb_iface);
3267 outdata->dsize = len;
3268 outdata->dptr = (uint8_t *)info;
3273 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3274 struct ctdb_req_control_old *c,
3278 struct ctdb_iface_list_old *ifaces;
3279 struct ctdb_interface *cur;
3281 /* count how many public ip structures we have */
3283 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3287 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3288 num*sizeof(struct ctdb_iface);
3289 ifaces = talloc_zero_size(outdata, len);
3290 CTDB_NO_MEMORY(ctdb, ifaces);
3293 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3294 strcpy(ifaces->ifaces[i].name, cur->name);
3295 ifaces->ifaces[i].link_state = cur->link_up;
3296 ifaces->ifaces[i].references = cur->references;
3300 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3301 i*sizeof(struct ctdb_iface);
3303 outdata->dsize = len;
3304 outdata->dptr = (uint8_t *)ifaces;
3309 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3310 struct ctdb_req_control_old *c,
3313 struct ctdb_iface *info;
3314 struct ctdb_interface *iface;
3315 bool link_up = false;
3317 info = (struct ctdb_iface *)indata.dptr;
3319 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3320 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3321 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3322 len, len, info->name));
3326 switch (info->link_state) {
3334 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3335 (unsigned int)info->link_state));
3339 if (info->references != 0) {
3340 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3341 (unsigned int)info->references));
3345 iface = ctdb_find_iface(ctdb, info->name);
3346 if (iface == NULL) {
3350 if (link_up == iface->link_up) {
3354 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3355 ("iface[%s] has changed it's link status %s => %s\n",
3357 iface->link_up?"up":"down",
3358 link_up?"up":"down"));
3360 iface->link_up = link_up;
3366 structure containing the listening socket and the list of tcp connections
3367 that the ctdb daemon is to kill
3369 struct ctdb_kill_tcp {
3370 struct ctdb_vnn *vnn;
3371 struct ctdb_context *ctdb;
3373 struct tevent_fd *fde;
3374 trbt_tree_t *connections;
3379 a tcp connection that is to be killed
3381 struct ctdb_killtcp_con {
3382 ctdb_sock_addr src_addr;
3383 ctdb_sock_addr dst_addr;
3385 struct ctdb_kill_tcp *killtcp;
3388 /* this function is used to create a key to represent this socketpair
3389 in the killtcp tree.
3390 this key is used to insert and lookup matching socketpairs that are
3391 to be tickled and RST
3393 #define KILLTCP_KEYLEN 10
3394 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3396 static uint32_t key[KILLTCP_KEYLEN];
3398 bzero(key, sizeof(key));
3400 if (src->sa.sa_family != dst->sa.sa_family) {
3401 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3405 switch (src->sa.sa_family) {
3407 key[0] = dst->ip.sin_addr.s_addr;
3408 key[1] = src->ip.sin_addr.s_addr;
3409 key[2] = dst->ip.sin_port;
3410 key[3] = src->ip.sin_port;
3413 uint32_t *dst6_addr32 =
3414 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3415 uint32_t *src6_addr32 =
3416 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3417 key[0] = dst6_addr32[3];
3418 key[1] = src6_addr32[3];
3419 key[2] = dst6_addr32[2];
3420 key[3] = src6_addr32[2];
3421 key[4] = dst6_addr32[1];
3422 key[5] = src6_addr32[1];
3423 key[6] = dst6_addr32[0];
3424 key[7] = src6_addr32[0];
3425 key[8] = dst->ip6.sin6_port;
3426 key[9] = src->ip6.sin6_port;
3430 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3438 called when we get a read event on the raw socket
3440 static void capture_tcp_handler(struct tevent_context *ev,
3441 struct tevent_fd *fde,
3442 uint16_t flags, void *private_data)
3444 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3445 struct ctdb_killtcp_con *con;
3446 ctdb_sock_addr src, dst;
3447 uint32_t ack_seq, seq;
3449 if (!(flags & TEVENT_FD_READ)) {
3453 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3454 killtcp->private_data,
3456 &ack_seq, &seq) != 0) {
3457 /* probably a non-tcp ACK packet */
3461 /* check if we have this guy in our list of connections
3464 con = trbt_lookuparray32(killtcp->connections,
3465 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3467 /* no this was some other packet we can just ignore */
3471 /* This one has been tickled !
3472 now reset him and remove him from the list.
3474 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3475 ntohs(con->dst_addr.ip.sin_port),
3476 ctdb_addr_to_str(&con->src_addr),
3477 ntohs(con->src_addr.ip.sin_port)));
3479 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3484 /* when traversing the list of all tcp connections to send tickle acks to
3485 (so that we can capture the ack coming back and kill the connection
3487 this callback is called for each connection we are currently trying to kill
3489 static int tickle_connection_traverse(void *param, void *data)
3491 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3493 /* have tried too many times, just give up */
3494 if (con->count >= 5) {
3495 /* can't delete in traverse: reparent to delete_cons */
3496 talloc_steal(param, con);
3500 /* otherwise, try tickling it again */
3503 (ctdb_sock_addr *)&con->dst_addr,
3504 (ctdb_sock_addr *)&con->src_addr,
3511 called every second until all sentenced connections have been reset
3513 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3514 struct tevent_timer *te,
3515 struct timeval t, void *private_data)
3517 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3518 void *delete_cons = talloc_new(NULL);
3520 /* loop over all connections sending tickle ACKs */
3521 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3523 /* now we've finished traverse, it's safe to do deletion. */
3524 talloc_free(delete_cons);
3526 /* If there are no more connections to kill we can remove the
3527 entire killtcp structure
3529 if ( (killtcp->connections == NULL) ||
3530 (killtcp->connections->root == NULL) ) {
3531 talloc_free(killtcp);
3535 /* try tickling them again in a second's time
3537 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3538 timeval_current_ofs(1, 0),
3539 ctdb_tickle_sentenced_connections, killtcp);
3543 destroy the killtcp structure
3545 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3547 struct ctdb_vnn *tmpvnn;
3549 /* verify that this vnn is still active */
3550 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3551 if (tmpvnn == killtcp->vnn) {
3556 if (tmpvnn == NULL) {
3560 if (killtcp->vnn->killtcp != killtcp) {
3564 killtcp->vnn->killtcp = NULL;
3570 /* nothing fancy here, just unconditionally replace any existing
3571 connection structure with the new one.
3573 don't even free the old one if it did exist, that one is talloc_stolen
3574 by the same node in the tree anyway and will be deleted when the new data
3577 static void *add_killtcp_callback(void *parm, void *data)
3583 add a tcp socket to the list of connections we want to RST
3585 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3589 ctdb_sock_addr src, dst;
3590 struct ctdb_kill_tcp *killtcp;
3591 struct ctdb_killtcp_con *con;
3592 struct ctdb_vnn *vnn;
3594 ctdb_canonicalize_ip(s, &src);
3595 ctdb_canonicalize_ip(d, &dst);
3597 vnn = find_public_ip_vnn(ctdb, &dst);
3599 vnn = find_public_ip_vnn(ctdb, &src);
3602 /* if it is not a public ip it could be our 'single ip' */
3603 if (ctdb->single_ip_vnn) {
3604 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3605 vnn = ctdb->single_ip_vnn;
3610 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3614 killtcp = vnn->killtcp;
3616 /* If this is the first connection to kill we must allocate
3619 if (killtcp == NULL) {
3620 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3621 CTDB_NO_MEMORY(ctdb, killtcp);
3624 killtcp->ctdb = ctdb;
3625 killtcp->capture_fd = -1;
3626 killtcp->connections = trbt_create(killtcp, 0);
3628 vnn->killtcp = killtcp;
3629 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3634 /* create a structure that describes this connection we want to
3635 RST and store it in killtcp->connections
3637 con = talloc(killtcp, struct ctdb_killtcp_con);
3638 CTDB_NO_MEMORY(ctdb, con);
3639 con->src_addr = src;
3640 con->dst_addr = dst;
3642 con->killtcp = killtcp;
3645 trbt_insertarray32_callback(killtcp->connections,
3646 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3647 add_killtcp_callback, con);
3650 If we don't have a socket to listen on yet we must create it
3652 if (killtcp->capture_fd == -1) {
3653 const char *iface = ctdb_vnn_iface_string(vnn);
3654 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3655 if (killtcp->capture_fd == -1) {
3656 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3657 "socket on iface '%s' for killtcp (%s)\n",
3658 iface, strerror(errno)));
3664 if (killtcp->fde == NULL) {
3665 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3666 killtcp->capture_fd,
3668 capture_tcp_handler, killtcp);
3669 tevent_fd_set_auto_close(killtcp->fde);
3671 /* We also need to set up some events to tickle all these connections
3672 until they are all reset
3674 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3675 ctdb_tickle_sentenced_connections, killtcp);
3678 /* tickle him once now */
3687 talloc_free(vnn->killtcp);
3688 vnn->killtcp = NULL;
3693 kill a TCP connection.
3695 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3697 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3699 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3703 called by a daemon to inform us of the entire list of TCP tickles for
3704 a particular public address.
3705 this control should only be sent by the node that is currently serving
3706 that public address.
3708 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3710 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3711 struct ctdb_tcp_array *tcparray;
3712 struct ctdb_vnn *vnn;
3714 /* We must at least have tickles.num or else we cant verify the size
3715 of the received data blob
3717 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3718 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3722 /* verify that the size of data matches what we expect */
3723 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3724 + sizeof(struct ctdb_connection) * list->num) {
3725 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3729 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3730 ctdb_addr_to_str(&list->addr)));
3732 vnn = find_public_ip_vnn(ctdb, &list->addr);
3734 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3735 ctdb_addr_to_str(&list->addr)));
3740 /* remove any old ticklelist we might have */
3741 talloc_free(vnn->tcp_array);
3742 vnn->tcp_array = NULL;
3744 tcparray = talloc(vnn, struct ctdb_tcp_array);
3745 CTDB_NO_MEMORY(ctdb, tcparray);
3747 tcparray->num = list->num;
3749 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3750 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3752 memcpy(tcparray->connections, &list->connections[0],
3753 sizeof(struct ctdb_connection)*tcparray->num);
3755 /* We now have a new fresh tickle list array for this vnn */
3756 vnn->tcp_array = tcparray;
3762 called to return the full list of tickles for the puclic address associated
3763 with the provided vnn
3765 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3767 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3768 struct ctdb_tickle_list_old *list;
3769 struct ctdb_tcp_array *tcparray;
3771 struct ctdb_vnn *vnn;
3773 vnn = find_public_ip_vnn(ctdb, addr);
3775 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3776 ctdb_addr_to_str(addr)));
3781 tcparray = vnn->tcp_array;
3783 num = tcparray->num;
3788 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3789 + sizeof(struct ctdb_connection) * num;
3791 outdata->dptr = talloc_size(outdata, outdata->dsize);
3792 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3793 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3798 memcpy(&list->connections[0], tcparray->connections,
3799 sizeof(struct ctdb_connection) * num);
3807 set the list of all tcp tickles for a public address
3809 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3810 ctdb_sock_addr *addr,
3811 struct ctdb_tcp_array *tcparray)
3815 struct ctdb_tickle_list_old *list;
3818 num = tcparray->num;
3823 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3824 sizeof(struct ctdb_connection) * num;
3825 data.dptr = talloc_size(ctdb, data.dsize);
3826 CTDB_NO_MEMORY(ctdb, data.dptr);
3828 list = (struct ctdb_tickle_list_old *)data.dptr;
3832 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3835 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3836 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3837 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3839 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3843 talloc_free(data.dptr);
3850 perform tickle updates if required
3852 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3853 struct tevent_timer *te,
3854 struct timeval t, void *private_data)
3856 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3858 struct ctdb_vnn *vnn;
3860 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3861 /* we only send out updates for public addresses that
3864 if (ctdb->pnn != vnn->pnn) {
3867 /* We only send out the updates if we need to */
3868 if (!vnn->tcp_update_needed) {
3871 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3872 &vnn->public_address,
3875 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3876 ctdb_addr_to_str(&vnn->public_address)));
3879 ("Sent tickle update for public address %s\n",
3880 ctdb_addr_to_str(&vnn->public_address)));
3881 vnn->tcp_update_needed = false;
3885 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3886 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3887 ctdb_update_tcp_tickles, ctdb);
3891 start periodic update of tcp tickles
3893 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3895 ctdb->tickle_update_context = talloc_new(ctdb);
3897 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3898 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3899 ctdb_update_tcp_tickles, ctdb);
3905 struct control_gratious_arp {
3906 struct ctdb_context *ctdb;
3907 ctdb_sock_addr addr;
3913 send a control_gratuitous arp
3915 static void send_gratious_arp(struct tevent_context *ev,
3916 struct tevent_timer *te,
3917 struct timeval t, void *private_data)
3920 struct control_gratious_arp *arp = talloc_get_type(private_data,
3921 struct control_gratious_arp);
3923 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3925 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3926 arp->iface, strerror(errno)));
3931 if (arp->count == CTDB_ARP_REPEAT) {
3936 tevent_add_timer(arp->ctdb->ev, arp,
3937 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3938 send_gratious_arp, arp);
3945 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3947 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3948 struct control_gratious_arp *arp;
3950 /* verify the size of indata */
3951 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3952 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3953 (unsigned)indata.dsize,
3954 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3958 ( offsetof(struct ctdb_addr_info_old, iface)
3959 + gratious_arp->len ) ){
3961 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3962 "but should be %u bytes\n",
3963 (unsigned)indata.dsize,
3964 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
3969 arp = talloc(ctdb, struct control_gratious_arp);
3970 CTDB_NO_MEMORY(ctdb, arp);
3973 arp->addr = gratious_arp->addr;
3974 arp->iface = talloc_strdup(arp, gratious_arp->iface);
3975 CTDB_NO_MEMORY(ctdb, arp->iface);
3978 tevent_add_timer(arp->ctdb->ev, arp,
3979 timeval_zero(), send_gratious_arp, arp);
3984 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3986 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3989 /* verify the size of indata */
3990 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3991 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3995 ( offsetof(struct ctdb_addr_info_old, iface)
3998 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3999 "but should be %u bytes\n",
4000 (unsigned)indata.dsize,
4001 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4005 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4007 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4010 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4017 struct delete_ip_callback_state {
4018 struct ctdb_req_control_old *c;
4022 called when releaseip event finishes for del_public_address
4024 static void delete_ip_callback(struct ctdb_context *ctdb,
4025 int32_t status, TDB_DATA data,
4026 const char *errormsg,
4029 struct delete_ip_callback_state *state =
4030 talloc_get_type(private_data, struct delete_ip_callback_state);
4032 /* If release failed then fail. */
4033 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4034 talloc_free(private_data);
4037 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4038 struct ctdb_req_control_old *c,
4039 TDB_DATA indata, bool *async_reply)
4041 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4042 struct ctdb_vnn *vnn;
4044 /* verify the size of indata */
4045 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4046 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4050 ( offsetof(struct ctdb_addr_info_old, iface)
4053 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4054 "but should be %u bytes\n",
4055 (unsigned)indata.dsize,
4056 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4060 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4062 /* walk over all public addresses until we find a match */
4063 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4064 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4065 if (vnn->pnn == ctdb->pnn) {
4066 struct delete_ip_callback_state *state;
4067 struct ctdb_public_ip *ip;
4071 vnn->delete_pending = true;
4073 state = talloc(ctdb,
4074 struct delete_ip_callback_state);
4075 CTDB_NO_MEMORY(ctdb, state);
4078 ip = talloc(state, struct ctdb_public_ip);
4081 (__location__ " Out of memory\n"));
4086 ip->addr = pub->addr;
4088 data.dsize = sizeof(struct ctdb_public_ip);
4089 data.dptr = (unsigned char *)ip;
4091 ret = ctdb_daemon_send_control(ctdb,
4094 CTDB_CONTROL_RELEASE_IP,
4101 (__location__ "Unable to send "
4102 "CTDB_CONTROL_RELEASE_IP\n"));
4107 state->c = talloc_steal(state, c);
4108 *async_reply = true;
4110 /* This IP is not hosted on the
4111 * current node so just delete it
4113 do_delete_ip(ctdb, vnn);
4120 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4121 ctdb_addr_to_str(&pub->addr)));
4126 struct ipreallocated_callback_state {
4127 struct ctdb_req_control_old *c;
/*
 * Completion callback for the "ipreallocated" event script.
 *
 * p is the struct ipreallocated_callback_state that was registered
 * together with the callback.  A failure message carrying the status
 * is logged, ctdb_ban_self() is called when the status is -ETIME, and
 * the control request stored in state->c is answered with the event
 * status.
 * NOTE(review): the test guarding the failure log lies outside the
 * visible lines - confirm against the full function.
 */
4130 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4131 int status, void *p)
4133 struct ipreallocated_callback_state *state =
4134 talloc_get_type(p, struct ipreallocated_callback_state);
4138 (" \"ipreallocated\" event script failed (status %d)\n",
/* -ETIME is singled out: in that case this node bans itself */
4140 if (status == -ETIME) {
4141 ctdb_ban_self(ctdb);
/* answer the control that requested the event, passing the status on */
4145 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4149 /* A control to run the ipreallocated event.
 *
 * A struct ipreallocated_callback_state is allocated on ctdb and the
 * CTDB_EVENT_IPREALLOCATED event is started through
 * ctdb_event_script_callback() with ctdb_ipreallocated_callback() as
 * the completion callback.  A failure to start the event is logged.
 * Otherwise the request c is reparented onto the state and
 * *async_reply is set to true.
 */
4150 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4151 struct ctdb_req_control_old *c,
4155 struct ipreallocated_callback_state *state;
4157 state = talloc(ctdb, struct ipreallocated_callback_state);
4158 CTDB_NO_MEMORY(ctdb, state);
4160 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
/* state is passed both as the talloc context and as the callback argument */
4162 ret = ctdb_event_script_callback(ctdb, state,
4163 ctdb_ipreallocated_callback, state,
4164 CTDB_EVENT_IPREALLOCATED,
4168 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4173 /* tell the control that we will reply asynchronously */
4174 state->c = talloc_steal(state, c);
4175 *async_reply = true;
/*
 * Summary of the visible logic of verify_remote_ip_allocation():
 *  - when ctdb->ip_tree is NULL the expected allocation is not known
 *    yet and the remote node is assumed to be correct;
 *  - every entry of ips is looked up in ctdb->ip_tree by ip_key();
 *    an address with no entry is logged as new or unknown;
 *  - entries where either side has pnn -1 get a separate branch;
 *  - a pnn that differs from the one stored in the tree is logged as
 *    an inconsistent allocation.
 * NOTE(review): the third parameter (used as pnn in the log messages)
 * and all return statements lie outside the visible lines.
 */
4181 /* This function is called from the recovery daemon to verify that a remote
4182 node has the expected ip allocation.
4183 This is verified against ctdb->ip_tree
4185 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4186 struct ctdb_public_ip_list_old *ips,
4189 struct public_ip_list *tmp_ip;
4192 if (ctdb->ip_tree == NULL) {
4193 /* don't know the expected allocation yet, assume remote node
4202 for (i=0; i<ips->num; i++) {
4203 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4204 if (tmp_ip == NULL) {
4205 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4209 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4213 if (tmp_ip->pnn != ips->ips[i].pnn) {
4215 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4217 ctdb_addr_to_str(&ips->ips[i].addr),
4218 ips->ips[i].pnn, tmp_ip->pnn));
/*
 * Record in ctdb->ip_tree that the address ip->addr is now assigned to
 * node ip->pnn.
 *
 * The disable_ip_failover tunable is checked first because the tree is
 * never built in that mode.  A missing tree, or a missing entry for
 * the address, is logged as an error.
 * NOTE(review): the return values lie outside the visible lines.
 */
4226 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4228 struct public_ip_list *tmp_ip;
4230 /* IP tree is never built if DisableIPFailover is set */
4231 if (ctdb->tunable.disable_ip_failover != 0) {
4235 if (ctdb->ip_tree == NULL) {
4236 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
/* the tree is keyed by ip_key() of the address */
4240 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4241 if (tmp_ip == NULL) {
4242 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
/* log the old and new owner before overwriting the stored pnn */
4246 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4247 tmp_ip->pnn = ip->pnn;
/*
 * Discard the IP assignment tree by calling TALLOC_FREE() on
 * ctdb->ip_tree (the talloc macro frees the object and resets the
 * pointer to NULL).
 */
4252 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4254 TALLOC_FREE(ctdb->ip_tree);
/*
 * Handle describing one reload of the public addresses that is carried
 * out by a child process; created in ctdb_control_reload_public_ips().
 * NOTE(review): only some of the members are visible in this excerpt.
 */
4257 struct ctdb_reloadips_handle {
/* daemon context; its reload_ips member is compared with this handle in the destructor */
4258 struct ctdb_context *ctdb;
/* control request that is answered from the destructor */
4259 struct ctdb_req_control_old *c;
/* fd event watching the read end of the pipe to the child */
4263 struct tevent_fd *fde;
/*
 * talloc destructor for a struct ctdb_reloadips_handle.
 *
 * Clears ctdb->reload_ips when it still points at this handle, replies
 * to the pending control request with h->status and sends SIGKILL to
 * the child process.
 */
4266 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
/* only reset the daemon's pointer if it refers to this very handle */
4268 if (h == h->ctdb->reload_ips) {
4269 h->ctdb->reload_ips = NULL;
/* answer the control that started the reload with the recorded status */
4272 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4275 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
 * Timer handler installed by ctdb_control_reload_public_ips() to fire
 * 120 seconds after a reload was started.
 * private_data is the struct ctdb_reloadips_handle of that reload.
 * NOTE(review): what is done with the handle lies outside the visible
 * lines - confirm against the full function.
 */
4279 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4280 struct tevent_timer *te,
4281 struct timeval t, void *private_data)
4283 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
 * fd event handler for the read end of the pipe to the reloadips
 * child; registered in ctdb_control_reload_public_ips().
 *
 * Reads the one-byte result written by the child.  A short read or a
 * non-zero byte is logged as a child error.
 * NOTE(review): the handling after the read lies partly outside the
 * visible lines.
 */
4288 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4289 struct tevent_fd *fde,
4290 uint16_t flags, void *private_data)
4292 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/* the child reports its result as a single byte; 0 means success */
4297 ret = sys_read(h->fd[0], &res, 1);
4298 if (ret < 1 || res != 0) {
4299 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
 * Work done inside the reloadips child process: bring the public
 * addresses known to the local daemon in line with the public
 * addresses file.
 *
 * Steps visible here:
 *  1. fetch the current public IPs from the local node;
 *  2. re-read the public addresses file with
 *     ctdb_set_public_addresses();
 *  3. for every IP reported by the node that is not found in
 *     ctdb->vnn, queue a CTDB_CONTROL_DEL_PUBLIC_IP control;
 *  4. for every vnn whose address the node did not report, queue a
 *     CTDB_CONTROL_ADD_PUBLIC_IP control carrying a comma separated
 *     list of the vnn's interfaces;
 *  5. wait for all queued controls with ctdb_client_async_wait().
 *
 * async_data and the pub buffers are allocated on mem_ctx, which is
 * freed on the exits visible here.
 * NOTE(review): the return values lie outside the visible lines.
 */
4307 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4309 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4310 struct ctdb_public_ip_list_old *ips;
4311 struct ctdb_vnn *vnn;
4312 struct client_async_data *async_data;
4313 struct timeval timeout;
4315 struct ctdb_client_control_state *state;
4319 CTDB_NO_MEMORY(ctdb, mem_ctx);
4321 /* Read IPs from local node */
4322 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4323 CTDB_CURRENT_NODE, mem_ctx, &ips);
4326 ("Unable to fetch public IPs from local node\n"));
4327 talloc_free(mem_ctx);
4331 /* Read IPs file - this is safe since this is a child process */
4333 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4334 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4335 talloc_free(mem_ctx);
/* collects the outstanding add/delete controls so they can be awaited together */
4339 async_data = talloc_zero(mem_ctx, struct client_async_data);
4340 CTDB_NO_MEMORY(ctdb, async_data);
4342 /* Compare IPs between node and file for IPs to be deleted */
4343 for (i = 0; i < ips->num; i++) {
4345 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4346 if (ctdb_same_ip(&vnn->public_address,
4347 &ips->ips[i].addr)) {
4348 /* IP is still in file */
4354 /* Delete IP ips->ips[i] */
4355 struct ctdb_addr_info_old *pub;
4358 ("IP %s no longer configured, deleting it\n",
4359 ctdb_addr_to_str(&ips->ips[i].addr)));
/* a delete request only needs the address, so the fixed part of the structure is used */
4361 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4362 CTDB_NO_MEMORY(ctdb, pub);
4364 pub->addr = ips->ips[i].addr;
4368 timeout = TAKEOVER_TIMEOUT();
4370 data.dsize = offsetof(struct ctdb_addr_info_old,
4372 data.dptr = (uint8_t *)pub;
4374 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4375 CTDB_CONTROL_DEL_PUBLIC_IP,
4376 0, data, async_data,
4378 if (state == NULL) {
4381 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4385 ctdb_client_async_add(async_data, state);
4389 /* Compare IPs between node and file for IPs to be added */
4391 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4392 for (i = 0; i < ips->num; i++) {
4393 if (ctdb_same_ip(&vnn->public_address,
4394 &ips->ips[i].addr)) {
4395 /* IP already on node */
/* the inner loop ran to completion, so no reported IP matched this vnn */
4399 if (i == ips->num) {
4400 /* Add IP vnn->public_address */
4401 struct ctdb_addr_info_old *pub;
4402 const char *ifaces = NULL;
4407 ("New IP %s configured, adding it\n",
4408 ctdb_addr_to_str(&vnn->public_address)));
/* broadcast this node's pnn with CTDB_SRVID_REBALANCE_NODE to request a rebalance */
4410 uint32_t pnn = ctdb_get_pnn(ctdb);
4412 data.dsize = sizeof(pnn);
4413 data.dptr = (uint8_t *)&pnn;
4415 ret = ctdb_client_send_message(
4417 CTDB_BROADCAST_CONNECTED,
4418 CTDB_SRVID_REBALANCE_NODE,
4421 DEBUG(DEBUG_WARNING,
4422 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/* build "iface0,iface1,..." from the vnn's NULL-terminated interface list */
4428 ifaces = vnn->ifaces[0];
4430 while (vnn->ifaces[iface] != NULL) {
/* NOTE(review): the joined string is allocated on vnn, not on mem_ctx */
4431 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4432 vnn->ifaces[iface]);
/* len includes the terminating NUL of the interface string */
4436 len = strlen(ifaces) + 1;
4437 pub = talloc_zero_size(mem_ctx,
4438 offsetof(struct ctdb_addr_info_old, iface) + len);
4439 CTDB_NO_MEMORY(ctdb, pub);
4441 pub->addr = vnn->public_address;
4442 pub->mask = vnn->public_netmask_bits;
4444 memcpy(&pub->iface[0], ifaces, pub->len);
4446 timeout = TAKEOVER_TIMEOUT();
4448 data.dsize = offsetof(struct ctdb_addr_info_old,
4450 data.dptr = (uint8_t *)pub;
4452 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4453 CTDB_CONTROL_ADD_PUBLIC_IP,
4454 0, data, async_data,
4456 if (state == NULL) {
4459 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4463 ctdb_client_async_add(async_data, state);
/* block until every queued add/delete control has completed */
4467 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4468 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4472 talloc_free(mem_ctx);
4476 talloc_free(mem_ctx);
4480 /* This control is sent to force the node to re-read the public addresses file
4481 and drop any addresses we should no longer host, and add new addresses
4482 that we are now able to host
/*
 * Control handler: reload the public addresses file in a child process.
 *
 * Any reload already recorded in ctdb->reload_ips is freed first.  A
 * pipe is created and a child is forked.  The child switches to client
 * mode, runs ctdb_reloadips_child() and writes a one-byte result to
 * the pipe.  The parent reparents the request c onto the handle,
 * installs ctdb_reloadips_destructor(), watches the read end of the
 * pipe with ctdb_reloadips_child_handler(), arms a 120 second timer
 * and sets *async_reply to true.
 */
4484 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4486 struct ctdb_reloadips_handle *h;
/* remembered so the child can poll for the parent after the fork */
4487 pid_t parent = getpid();
/* a reload that is already recorded is freed before a new one starts */
4489 if (ctdb->reload_ips != NULL) {
4490 talloc_free(ctdb->reload_ips);
4491 ctdb->reload_ips = NULL;
4494 h = talloc(ctdb, struct ctdb_reloadips_handle);
4495 CTDB_NO_MEMORY(ctdb, h);
/* the pipe carries the child's one-byte result back to the parent */
4500 if (pipe(h->fd) == -1) {
/* NOTE(review): the message names ctdb_freeze_lock although this is the reloadips path - looks copy-pasted, confirm */
4501 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4506 h->child = ctdb_fork(ctdb);
4507 if (h->child == (pid_t)-1) {
4508 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
/* child side */
4516 if (h->child == 0) {
4517 signed char res = 0;
4520 debug_extra = talloc_asprintf(NULL, "reloadips:");
4522 prctl_set_comment("ctdb_reloadips");
4523 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4524 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4527 res = ctdb_reloadips_child(ctdb);
4529 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
/* report the result to the parent through the write end of the pipe */
4533 sys_write(h->fd[1], &res, 1);
4534 /* make sure we die when our parent dies */
4535 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* parent side: reparent the request onto the handle */
4541 h->c = talloc_steal(h, c);
4544 set_close_on_exec(h->fd[0]);
4546 talloc_set_destructor(h, ctdb_reloadips_destructor);
/* watch the read end; auto close hands closing of the fd to the fd event */
4549 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4550 ctdb_reloadips_child_handler, (void *)h);
4551 tevent_fd_set_auto_close(h->fde);
/* the timer is a talloc child of h and fires after 120 seconds */
4553 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4554 ctdb_reloadips_timeout_event, h);
4556 /* we reply later */
4557 *async_reply = true;