4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
47 #define CTDB_ARP_INTERVAL 1
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
56 enum ipalloc_algorithm {
57 IPALLOC_DETERMINISTIC,
58 IPALLOC_NONDETERMINISTIC,
62 struct ipalloc_state {
65 /* Arrays with data for each node */
66 struct ctdb_public_ip_list_old **known_public_ips;
67 struct ctdb_public_ip_list_old **available_public_ips;
69 enum ipalloc_algorithm algorithm;
70 uint32_t no_ip_failback;
73 struct ctdb_interface {
74 struct ctdb_interface *prev, *next;
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
83 return vnn->iface->name;
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
91 struct ctdb_interface *i;
93 /* Verify that we don't have an entry for this interface yet */
94 for (i=ctdb->ifaces;i;i=i->next) {
95 if (strcmp(i->name, iface) == 0) {
100 /* create a new structure for this interface */
101 i = talloc_zero(ctdb, struct ctdb_interface);
102 CTDB_NO_MEMORY_FATAL(ctdb, i);
103 i->name = talloc_strdup(i, iface);
104 CTDB_NO_MEMORY(ctdb, i->name);
108 DLIST_ADD(ctdb->ifaces, i);
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
118 for (n = 0; vnn->ifaces[n] != NULL; n++) {
119 if (strcmp(name, vnn->ifaces[n]) == 0) {
127 /* If any interfaces now have no possible IPs then delete them. This
128 * implementation is naive (i.e. simple) rather than clever
129 * (i.e. complex). Given that this is run on delip and that operation
130 * is rare, this doesn't need to be efficient - it needs to be
131 * foolproof. One alternative is reference counting, where the logic
132 * is distributed and can, therefore, be broken in multiple places.
133 * Another alternative is to build a red-black tree of interfaces that
134 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135 * once) and then walking ctdb->ifaces once and deleting those not in
136 * the tree. Let's go to one of those if the naive implementation
137 * causes problems... :-)
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140 struct ctdb_vnn *vnn)
142 struct ctdb_interface *i, *next;
144 /* For each interface, check if there's an IP using it. */
145 for (i = ctdb->ifaces; i != NULL; i = next) {
150 /* Only consider interfaces named in the given VNN. */
151 if (!vnn_has_interface_with_name(vnn, i->name)) {
155 /* Is the "single IP" on this interface? */
156 if ((ctdb->single_ip_vnn != NULL) &&
157 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159 /* Found, next interface please... */
162 /* Search for a vnn with this interface. */
164 for (tv=ctdb->vnn; tv; tv=tv->next) {
165 if (vnn_has_interface_with_name(tv, i->name)) {
172 /* None of the VNNs are using this interface. */
173 DLIST_REMOVE(ctdb->ifaces, i);
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
183 struct ctdb_interface *i;
185 for (i=ctdb->ifaces;i;i=i->next) {
186 if (strcmp(i->name, iface) == 0) {
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195 struct ctdb_vnn *vnn)
198 struct ctdb_interface *cur = NULL;
199 struct ctdb_interface *best = NULL;
201 for (i=0; vnn->ifaces[i]; i++) {
203 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
217 if (cur->references < best->references) {
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227 struct ctdb_vnn *vnn)
229 struct ctdb_interface *best = NULL;
232 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233 "still assigned to iface '%s'\n",
234 ctdb_addr_to_str(&vnn->public_address),
235 ctdb_vnn_iface_string(vnn)));
239 best = ctdb_vnn_best_iface(ctdb, vnn);
241 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242 "cannot assign to iface any iface\n",
243 ctdb_addr_to_str(&vnn->public_address)));
249 vnn->pnn = ctdb->pnn;
251 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252 "now assigned to iface '%s' refs[%d]\n",
253 ctdb_addr_to_str(&vnn->public_address),
254 ctdb_vnn_iface_string(vnn),
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260 struct ctdb_vnn *vnn)
262 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263 "now unassigned (old iface '%s' refs[%d])\n",
264 ctdb_addr_to_str(&vnn->public_address),
265 ctdb_vnn_iface_string(vnn),
266 vnn->iface?vnn->iface->references:0));
268 vnn->iface->references--;
271 if (vnn->pnn == ctdb->pnn) {
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277 struct ctdb_vnn *vnn)
281 /* Nodes that are not RUNNING can not host IPs */
282 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
286 if (vnn->delete_pending) {
290 if (vnn->iface && vnn->iface->link_up) {
294 for (i=0; vnn->ifaces[i]; i++) {
295 struct ctdb_interface *cur;
297 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
310 struct ctdb_takeover_arp {
311 struct ctdb_context *ctdb;
314 struct ctdb_tcp_array *tcparray;
315 struct ctdb_vnn *vnn;
320 lists of tcp endpoints
322 struct ctdb_tcp_list {
323 struct ctdb_tcp_list *prev, *next;
324 struct ctdb_connection connection;
328 list of clients to kill on IP release
330 struct ctdb_client_ip {
331 struct ctdb_client_ip *prev, *next;
332 struct ctdb_context *ctdb;
339 send a gratuitous arp
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342 struct tevent_timer *te,
343 struct timeval t, void *private_data)
345 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
346 struct ctdb_takeover_arp);
348 struct ctdb_tcp_array *tcparray;
349 const char *iface = ctdb_vnn_iface_string(arp->vnn);
351 ret = ctdb_sys_send_arp(&arp->addr, iface);
353 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354 iface, strerror(errno)));
357 tcparray = arp->tcparray;
359 for (i=0;i<tcparray->num;i++) {
360 struct ctdb_connection *tcon;
362 tcon = &tcparray->connections[i];
363 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364 (unsigned)ntohs(tcon->dst.ip.sin_port),
365 ctdb_addr_to_str(&tcon->src),
366 (unsigned)ntohs(tcon->src.ip.sin_port)));
367 ret = ctdb_sys_send_tcp(
372 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373 ctdb_addr_to_str(&tcon->src)));
380 if (arp->count == CTDB_ARP_REPEAT) {
385 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387 ctdb_control_send_arp, arp);
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391 struct ctdb_vnn *vnn)
393 struct ctdb_takeover_arp *arp;
394 struct ctdb_tcp_array *tcparray;
396 if (!vnn->takeover_ctx) {
397 vnn->takeover_ctx = talloc_new(vnn);
398 if (!vnn->takeover_ctx) {
403 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
409 arp->addr = vnn->public_address;
412 tcparray = vnn->tcp_array;
414 /* add all of the known tcp connections for this IP to the
415 list of tcp connections to send tickle acks for */
416 arp->tcparray = talloc_steal(arp, tcparray);
418 vnn->tcp_array = NULL;
419 vnn->tcp_update_needed = true;
422 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423 timeval_zero(), ctdb_control_send_arp, arp);
428 struct takeover_callback_state {
429 struct ctdb_req_control_old *c;
430 ctdb_sock_addr *addr;
431 struct ctdb_vnn *vnn;
434 struct ctdb_do_takeip_state {
435 struct ctdb_req_control_old *c;
436 struct ctdb_vnn *vnn;
440 called when takeip event finishes
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
445 struct ctdb_do_takeip_state *state =
446 talloc_get_type(private_data, struct ctdb_do_takeip_state);
451 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
453 if (status == -ETIME) {
456 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457 ctdb_addr_to_str(&state->vnn->public_address),
458 ctdb_vnn_iface_string(state->vnn)));
459 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
461 node->flags |= NODE_FLAGS_UNHEALTHY;
466 if (ctdb->do_checkpublicip) {
468 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
470 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
477 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478 data.dsize = strlen((char *)data.dptr) + 1;
479 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
481 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
484 /* the control succeeded */
485 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
492 state->vnn->update_in_flight = false;
497 take over an ip address
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500 struct ctdb_req_control_old *c,
501 struct ctdb_vnn *vnn)
504 struct ctdb_do_takeip_state *state;
506 if (vnn->update_in_flight) {
507 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508 "update for this IP already in flight\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits));
514 ret = ctdb_vnn_assign_iface(ctdb, vnn);
516 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517 "assign a usable interface\n",
518 ctdb_addr_to_str(&vnn->public_address),
519 vnn->public_netmask_bits));
523 state = talloc(vnn, struct ctdb_do_takeip_state);
524 CTDB_NO_MEMORY(ctdb, state);
526 state->c = talloc_steal(ctdb, c);
529 vnn->update_in_flight = true;
530 talloc_set_destructor(state, ctdb_takeip_destructor);
532 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533 ctdb_addr_to_str(&vnn->public_address),
534 vnn->public_netmask_bits,
535 ctdb_vnn_iface_string(vnn)));
537 ret = ctdb_event_script_callback(ctdb,
539 ctdb_do_takeip_callback,
543 ctdb_vnn_iface_string(vnn),
544 ctdb_addr_to_str(&vnn->public_address),
545 vnn->public_netmask_bits);
548 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549 ctdb_addr_to_str(&vnn->public_address),
550 ctdb_vnn_iface_string(vnn)));
558 struct ctdb_do_updateip_state {
559 struct ctdb_req_control_old *c;
560 struct ctdb_interface *old;
561 struct ctdb_vnn *vnn;
565 called when updateip event finishes
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
570 struct ctdb_do_updateip_state *state =
571 talloc_get_type(private_data, struct ctdb_do_updateip_state);
575 if (status == -ETIME) {
578 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579 ctdb_addr_to_str(&state->vnn->public_address),
581 ctdb_vnn_iface_string(state->vnn)));
584 * All we can do is reset the old interface
585 * and let the next run fix it
587 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588 state->vnn->iface = state->old;
589 state->vnn->iface->references++;
591 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
596 if (ctdb->do_checkpublicip) {
598 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
600 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
607 /* the control succeeded */
608 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
615 state->vnn->update_in_flight = false;
620 update (move) an ip address
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623 struct ctdb_req_control_old *c,
624 struct ctdb_vnn *vnn)
627 struct ctdb_do_updateip_state *state;
628 struct ctdb_interface *old = vnn->iface;
629 const char *new_name;
631 if (vnn->update_in_flight) {
632 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633 "update for this IP already in flight\n",
634 ctdb_addr_to_str(&vnn->public_address),
635 vnn->public_netmask_bits));
639 ctdb_vnn_unassign_iface(ctdb, vnn);
640 ret = ctdb_vnn_assign_iface(ctdb, vnn);
642 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643 "assin a usable interface (old iface '%s')\n",
644 ctdb_addr_to_str(&vnn->public_address),
645 vnn->public_netmask_bits,
650 new_name = ctdb_vnn_iface_string(vnn);
651 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652 /* A benign update from one interface onto itself.
653 * no need to run the eventscripts in this case, just return
656 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
660 state = talloc(vnn, struct ctdb_do_updateip_state);
661 CTDB_NO_MEMORY(ctdb, state);
663 state->c = talloc_steal(ctdb, c);
667 vnn->update_in_flight = true;
668 talloc_set_destructor(state, ctdb_updateip_destructor);
670 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671 "interface %s to %s\n",
672 ctdb_addr_to_str(&vnn->public_address),
673 vnn->public_netmask_bits,
677 ret = ctdb_event_script_callback(ctdb,
679 ctdb_do_updateip_callback,
681 CTDB_EVENT_UPDATE_IP,
685 ctdb_addr_to_str(&vnn->public_address),
686 vnn->public_netmask_bits);
688 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689 ctdb_addr_to_str(&vnn->public_address),
690 old->name, new_name));
699 Find the vnn of the node that has a public ip address
700 returns NULL if the address is not known as a public address
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
704 struct ctdb_vnn *vnn;
706 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707 if (ctdb_same_ip(&vnn->public_address, addr)) {
716 take over an ip address
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719 struct ctdb_req_control_old *c,
724 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725 struct ctdb_vnn *vnn;
726 bool have_ip = false;
727 bool do_updateip = false;
728 bool do_takeip = false;
729 struct ctdb_interface *best_iface = NULL;
731 if (pip->pnn != ctdb->pnn) {
732 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733 "with pnn %d, but we're node %d\n",
734 ctdb_addr_to_str(&pip->addr),
735 pip->pnn, ctdb->pnn));
739 /* update our vnn list */
740 vnn = find_public_ip_vnn(ctdb, &pip->addr);
742 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743 ctdb_addr_to_str(&pip->addr)));
747 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748 have_ip = ctdb_sys_have_ip(&pip->addr);
750 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751 if (best_iface == NULL) {
752 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753 "a usable interface (old %s, have_ip %d)\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 vnn->public_netmask_bits,
756 ctdb_vnn_iface_string(vnn),
761 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
767 if (vnn->iface == NULL && have_ip) {
768 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770 ctdb_addr_to_str(&vnn->public_address)));
774 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776 "and we have it on iface[%s], but it was assigned to node %d"
777 "and we are node %d, banning ourself\n",
778 ctdb_addr_to_str(&vnn->public_address),
779 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
784 if (vnn->pnn == -1 && have_ip) {
785 vnn->pnn = ctdb->pnn;
786 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787 "and we already have it on iface[%s], update local daemon\n",
788 ctdb_addr_to_str(&vnn->public_address),
789 ctdb_vnn_iface_string(vnn)));
794 if (vnn->iface != best_iface) {
795 if (!vnn->iface->link_up) {
797 } else if (vnn->iface->references > (best_iface->references + 1)) {
798 /* only move when the rebalance gains something */
806 ctdb_vnn_unassign_iface(ctdb, vnn);
813 ret = ctdb_do_takeip(ctdb, c, vnn);
817 } else if (do_updateip) {
818 ret = ctdb_do_updateip(ctdb, c, vnn);
824 * The interface is up and the kernel knows the ip
827 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828 ctdb_addr_to_str(&pip->addr),
829 vnn->public_netmask_bits,
830 ctdb_vnn_iface_string(vnn)));
834 /* tell ctdb_control.c that we will be replying asynchronously */
841 kill any clients that are registered with an IP that is being released
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
845 struct ctdb_client_ip *ip;
847 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848 ctdb_addr_to_str(addr)));
850 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851 ctdb_sock_addr tmp_addr;
854 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
856 ctdb_addr_to_str(&ip->addr)));
858 if (ctdb_same_ip(&tmp_addr, addr)) {
859 struct ctdb_client *client = reqid_find(ctdb->idr,
862 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
864 ctdb_addr_to_str(&ip->addr),
867 if (client->pid != 0) {
868 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869 (unsigned)client->pid,
870 ctdb_addr_to_str(addr),
872 kill(client->pid, SIGKILL);
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
880 DLIST_REMOVE(ctdb->vnn, vnn);
881 ctdb_vnn_unassign_iface(ctdb, vnn);
882 ctdb_remove_orphaned_ifaces(ctdb, vnn);
887 called when releaseip event finishes
889 static void release_ip_callback(struct ctdb_context *ctdb, int status,
892 struct takeover_callback_state *state =
893 talloc_get_type(private_data, struct takeover_callback_state);
896 if (status == -ETIME) {
900 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901 if (ctdb_sys_have_ip(state->addr)) {
903 ("IP %s still hosted during release IP callback, failing\n",
904 ctdb_addr_to_str(state->addr)));
905 ctdb_request_control_reply(ctdb, state->c,
912 /* send a message to all clients of this node telling them
913 that the cluster has been reconfigured and they should
914 release any sockets on this IP */
915 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917 data.dsize = strlen((char *)data.dptr)+1;
919 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
921 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
923 /* kill clients that have registered with this IP */
924 release_kill_clients(ctdb, state->addr);
926 ctdb_vnn_unassign_iface(ctdb, state->vnn);
928 /* Process the IP if it has been marked for deletion */
929 if (state->vnn->delete_pending) {
930 do_delete_ip(ctdb, state->vnn);
934 /* the control succeeded */
935 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
941 if (state->vnn != NULL) {
942 state->vnn->update_in_flight = false;
948 release an ip address
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
951 struct ctdb_req_control_old *c,
956 struct takeover_callback_state *state;
957 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958 struct ctdb_vnn *vnn;
961 /* update our vnn list */
962 vnn = find_public_ip_vnn(ctdb, &pip->addr);
964 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965 ctdb_addr_to_str(&pip->addr)));
970 /* stop any previous arps */
971 talloc_free(vnn->takeover_ctx);
972 vnn->takeover_ctx = NULL;
974 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975 * lazy multicast to drop an IP from any node that isn't the
976 * intended new node. The following makes ctdbd ignore
977 * a release for any address it doesn't host.
979 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980 if (!ctdb_sys_have_ip(&pip->addr)) {
981 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982 ctdb_addr_to_str(&pip->addr),
983 vnn->public_netmask_bits,
984 ctdb_vnn_iface_string(vnn)));
985 ctdb_vnn_unassign_iface(ctdb, vnn);
989 if (vnn->iface == NULL) {
990 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991 ctdb_addr_to_str(&pip->addr),
992 vnn->public_netmask_bits));
997 /* There is a potential race between take_ip and us because we
998 * update the VNN via a callback that runs when the
999 * eventscripts have been run. Avoid the race by allowing one
1000 * update to be in flight at a time.
1002 if (vnn->update_in_flight) {
1003 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004 "update for this IP already in flight\n",
1005 ctdb_addr_to_str(&vnn->public_address),
1006 vnn->public_netmask_bits));
1010 iface = strdup(ctdb_vnn_iface_string(vnn));
1012 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1013 ctdb_addr_to_str(&pip->addr),
1014 vnn->public_netmask_bits,
1018 state = talloc(ctdb, struct takeover_callback_state);
1019 if (state == NULL) {
1020 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021 __FILE__, __LINE__);
1026 state->c = talloc_steal(state, c);
1027 state->addr = talloc(state, ctdb_sock_addr);
1028 if (state->addr == NULL) {
1029 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030 __FILE__, __LINE__);
1035 *state->addr = pip->addr;
1038 vnn->update_in_flight = true;
1039 talloc_set_destructor(state, ctdb_releaseip_destructor);
1041 ret = ctdb_event_script_callback(ctdb,
1042 state, release_ip_callback, state,
1043 CTDB_EVENT_RELEASE_IP,
1046 ctdb_addr_to_str(&pip->addr),
1047 vnn->public_netmask_bits);
1050 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051 ctdb_addr_to_str(&pip->addr),
1052 ctdb_vnn_iface_string(vnn)));
1057 /* tell the control that we will be replying asynchronously */
1058 *async_reply = true;
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063 ctdb_sock_addr *addr,
1064 unsigned mask, const char *ifaces,
1067 struct ctdb_vnn *vnn;
1074 tmp = strdup(ifaces);
1075 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076 if (!ctdb_sys_check_iface_exists(iface)) {
1077 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1084 /* Verify that we don't have an entry for this ip yet */
1085 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1088 ctdb_addr_to_str(addr)));
1093 /* create a new vnn structure for this ip address */
1094 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097 tmp = talloc_strdup(vnn, ifaces);
1098 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1107 vnn->ifaces[num] = NULL;
1108 vnn->public_address = *addr;
1109 vnn->public_netmask_bits = mask;
1111 if (check_address) {
1112 if (ctdb_sys_have_ip(addr)) {
1113 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114 vnn->pnn = ctdb->pnn;
1118 for (i=0; vnn->ifaces[i]; i++) {
1119 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122 "for public_address[%s]\n",
1123 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1129 DLIST_ADD(ctdb->vnn, vnn);
1135 setup the public address lists from a file
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1143 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144 if (lines == NULL) {
1145 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1148 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1152 for (i=0;i<nlines;i++) {
1154 ctdb_sock_addr addr;
1155 const char *addrstr;
1160 while ((*line == ' ') || (*line == '\t')) {
1166 if (strcmp(line, "") == 0) {
1169 tok = strtok(line, " \t");
1171 tok = strtok(NULL, " \t");
1173 if (NULL == ctdb->default_public_interface) {
1174 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1179 ifaces = ctdb->default_public_interface;
1184 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1189 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1205 struct ctdb_vnn *svnn;
1206 struct ctdb_interface *cur = NULL;
1210 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211 CTDB_NO_MEMORY(ctdb, svnn);
1213 svnn->ifaces = talloc_array(svnn, const char *, 2);
1214 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217 svnn->ifaces[1] = NULL;
1219 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1225 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1227 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228 "for single_ip[%s]\n",
1230 ctdb_addr_to_str(&svnn->public_address)));
1235 /* assume the single public ip interface is initially "good" */
1236 cur = ctdb_find_iface(ctdb, iface);
1238 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1241 cur->link_up = true;
1243 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1249 ctdb->single_ip_vnn = svnn;
1253 struct public_ip_list {
1254 struct public_ip_list *next;
1256 ctdb_sock_addr addr;
1259 /* Given a physical node, return the number of
1260 public addresses that are currently assigned to this node.
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1266 for (;ips;ips=ips->next) {
1267 if (ips->pnn == pnn) {
1275 /* Can the given node host the given IP: is the public IP known to the
1276 * node and is NOIPHOST unset?
1278 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1279 struct ctdb_ipflags ipflags,
1280 struct public_ip_list *ip)
1282 struct ctdb_public_ip_list_old *public_ips;
1285 if (ipflags.noiphost) {
1289 public_ips = ctdb->ipalloc_state->available_public_ips[pnn];
1291 if (public_ips == NULL) {
1295 for (i=0; i<public_ips->num; i++) {
1296 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1297 /* yes, this node can serve this public ip */
1305 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1306 struct ctdb_ipflags ipflags,
1307 struct public_ip_list *ip)
1309 if (ipflags.noiptakeover) {
1313 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1316 /* search the node list for a node to take over this ip.
1317 pick the node that is currently serving the least number of ips
1318 so that the ips get spread out evenly.
1320 static int find_takeover_node(struct ctdb_context *ctdb,
1321 struct ctdb_ipflags *ipflags,
1322 struct public_ip_list *ip,
1323 struct public_ip_list *all_ips)
1325 int pnn, min=0, num;
1328 numnodes = talloc_array_length(ipflags);
1330 for (i=0; i<numnodes; i++) {
1331 /* verify that this node can serve this ip */
1332 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1333 /* no it couldn't so skip to the next node */
1337 num = node_ip_coverage(i, all_ips);
1338 /* was this the first node we checked ? */
1350 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1351 ctdb_addr_to_str(&ip->addr)));
1361 static uint32_t *ip_key(ctdb_sock_addr *ip)
1363 static uint32_t key[IP_KEYLEN];
1365 bzero(key, sizeof(key));
1367 switch (ip->sa.sa_family) {
1369 key[3] = htonl(ip->ip.sin_addr.s_addr);
1372 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1373 key[0] = htonl(s6_a32[0]);
1374 key[1] = htonl(s6_a32[1]);
1375 key[2] = htonl(s6_a32[2]);
1376 key[3] = htonl(s6_a32[3]);
1380 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1387 static void *add_ip_callback(void *parm, void *data)
1389 struct public_ip_list *this_ip = parm;
1390 struct public_ip_list *prev_ip = data;
1392 if (prev_ip == NULL) {
1395 if (this_ip->pnn == -1) {
1396 this_ip->pnn = prev_ip->pnn;
1402 static int getips_count_callback(void *param, void *data)
1404 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1405 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1407 new_ip->next = *ip_list;
1412 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1413 struct ctdb_public_ip_list_old *ips,
1416 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1417 struct ipalloc_state *ipalloc_state,
1418 struct ctdb_node_map_old *nodemap)
1423 if (ipalloc_state->num != nodemap->num) {
1426 " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1427 ipalloc_state->num, nodemap->num));
1431 for (j=0; j<nodemap->num; j++) {
1432 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1436 /* Retrieve the list of known public IPs from the node */
1437 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1442 &ipalloc_state->known_public_ips[j]);
1445 ("Failed to read known public IPs from node: %u\n",
1450 if (ctdb->do_checkpublicip) {
1451 verify_remote_ip_allocation(ctdb,
1452 ipalloc_state->known_public_ips[j],
1456 /* Retrieve the list of available public IPs from the node */
1457 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1461 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1462 &ipalloc_state->available_public_ips[j]);
1465 ("Failed to read available public IPs from node: %u\n",
1474 static struct public_ip_list *
1475 create_merged_ip_list(struct ctdb_context *ctdb)
1478 struct public_ip_list *ip_list;
1479 struct ctdb_public_ip_list_old *public_ips;
1481 TALLOC_FREE(ctdb->ip_tree);
1482 ctdb->ip_tree = trbt_create(ctdb, 0);
1484 for (i=0; i < ctdb->num_nodes; i++) {
1485 public_ips = ctdb->ipalloc_state->known_public_ips[i];
1487 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1491 /* there were no public ips for this node */
1492 if (public_ips == NULL) {
1496 for (j=0; j < public_ips->num; j++) {
1497 struct public_ip_list *tmp_ip;
1499 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1500 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1501 /* Do not use information about IP addresses hosted
1502 * on other nodes, it may not be accurate */
1503 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1504 tmp_ip->pnn = public_ips->ips[j].pnn;
1508 tmp_ip->addr = public_ips->ips[j].addr;
1509 tmp_ip->next = NULL;
1511 trbt_insertarray32_callback(ctdb->ip_tree,
1512 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1519 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1525 * This is the length of the longest common prefix between the IPs.
1526 * It is calculated by XOR-ing the 2 IPs together and counting the
1527 * number of leading zeroes. The implementation means that all
1528 * addresses end up being 128 bits long.
1530 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1531 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1532 * lots of nodes and IP addresses?
/*
 * Return the length, in bits, of the common prefix of ip1 and ip2.
 *
 * Both addresses are turned into IP_KEYLEN 32-bit words via ip_key();
 * the words are XOR-ed pairwise and the leading zero bits of the result
 * are counted.  The key of ip1 is copied into a local array before it
 * is compared.
 * NOTE(review): the copy suggests ip_key() reuses its return buffer
 * between calls - confirm.
 *
 * The bit mask used to test the top bit is built from an unsigned
 * constant: shifting a signed 1 left by 31 moves a bit into the sign
 * position of int, which is undefined behaviour.
 */
1534 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1536 uint32_t ip1_k[IP_KEYLEN];
1541 uint32_t distance = 0;
1543 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1545 for (i=0; i<IP_KEYLEN; i++) {
1546 x = ip1_k[i] ^ t[i];
1550 /* Count number of leading zeroes.
1551 * FIXME? This could be optimised...
1553 while ((x & (1U << 31)) == 0) {
1563 /* Calculate the IP distance for the given IP relative to IPs on the
1564 given node. The ips argument is generally the all_ips variable
1565 used in the main part of the algorithm.
/*
 * Accumulate the squared ip_distance() between ip and the addresses in
 * the list ips, with each entry's pnn tested against the pnn argument.
 *
 * An entry whose address is the very object ip points at is recognised
 * by pointer comparison, so callers are expected to pass pointers into
 * the same list rather than copies of an address.
 * NOTE(review): the statements that follow the two "if" tests are not
 * visible here; presumably they skip the entry - confirm.
 */
1567 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1568 struct public_ip_list *ips,
1571 struct public_ip_list *t;
1576 for (t=ips; t != NULL; t=t->next) {
1577 if (t->pnn != pnn) {
1581 /* Optimisation: We never calculate the distance
1582 * between an address and itself. This allows us to
1583 * calculate the effect of removing an address from a
1584 * node by simply calculating the distance between
1585 * that address and all of the existing addresses.
1586 * Moreover, we assume that we're only ever dealing
1587 * with addresses from all_ips so we can identify an
1588 * address via a pointer rather than doing a more
1589 * expensive address comparison. */
1590 if (&(t->addr) == ip) {
1594 d = ip_distance(ip, &(t->addr));
1595 sum += d * d; /* Cheaper than pulling in math.h :-) */
1601 /* Return the LCP2 imbalance metric for addresses currently assigned
/*
 * Compute the LCP2 imbalance of node pnn: walk all_ips and, for an
 * address t, add ip_distance_2_sum() of that address against the
 * remainder of the list (t->next onwards) for the same pnn.  Passing
 * only the tail means each pair of addresses is visited once.
 */
1604 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1606 struct public_ip_list *t;
1608 uint32_t imbalance = 0;
1610 for (t=all_ips; t!=NULL; t=t->next) {
1611 if (t->pnn != pnn) {
1614 /* Pass the rest of the IPs rather than the whole
1617 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1623 /* Allocate any unassigned IPs just by looping through the IPs and
1624 * finding the best node for each.
/*
 * Give every unassigned address (pnn == -1) in all_ips to a node by
 * calling find_takeover_node() on it.  A non-zero result from
 * find_takeover_node() is logged as a warning naming the address.
 */
1626 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1627 struct ctdb_ipflags *ipflags,
1628 struct public_ip_list *all_ips)
1630 struct public_ip_list *tmp_ip;
1632 /* loop over all ip's and find a physical node to cover for
1635 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1636 if (tmp_ip->pnn == -1) {
1637 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1638 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1639 ctdb_addr_to_str(&tmp_ip->addr)));
1645 /* Basic non-deterministic rebalancing algorithm.
/*
 * Even out the number of addresses served per node.
 *
 * For an address in all_ips, the nodes for which can_node_takeover_ip()
 * succeeds are examined and node_ip_coverage() is used to find the one
 * serving the most (maxnode/maxnum) and the one serving the fewest
 * (minnode/minnum) addresses.  While maxnum exceeds minnum + 1 and the
 * retry counter is below num_ips + 5, an address currently on maxnode
 * is handed to find_takeover_node() for reassignment.
 *
 * The number of nodes is the talloc array length of ipflags.
 */
1647 static void basic_failback(struct ctdb_context *ctdb,
1648 struct ctdb_ipflags *ipflags,
1649 struct public_ip_list *all_ips,
1653 int maxnode, maxnum, minnode, minnum, num, retries;
1654 struct public_ip_list *tmp_ip;
1656 numnodes = talloc_array_length(ipflags);
1663 /* for each ip address, loop over all nodes that can serve
1664 this ip and make sure that the difference between the node
1665 serving the most and the node serving the least ip's are
1668 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1669 if (tmp_ip->pnn == -1) {
1673 /* Get the highest and lowest number of ips's served by any
1674 valid node which can serve this ip.
1678 for (i=0; i<numnodes; i++) {
1679 /* only check nodes that can actually serve this ip */
1680 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1681 /* no it couldnt so skip to the next node */
1685 num = node_ip_coverage(i, all_ips);
1686 if (maxnode == -1) {
1695 if (minnode == -1) {
1705 if (maxnode == -1) {
1706 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1707 ctdb_addr_to_str(&tmp_ip->addr)));
1712 /* if the spread between the smallest and largest coverage by
1713 a node is >=2 we steal one of the ips from the node with
1714 most coverage to even things out a bit.
1715 try to do this a limited number of times since we dont
1716 want to spend too much time balancing the ip coverage.
1718 if ( (maxnum > minnum+1)
1719 && (retries < (num_ips + 5)) ){
1720 struct public_ip_list *tmp;
1722 /* Reassign one of maxnode's VNNs */
1723 for (tmp=all_ips;tmp;tmp=tmp->next) {
1724 if (tmp->pnn == maxnode) {
1725 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
/*
 * Allocate and fill the per-node working arrays of the LCP2 algorithm.
 *
 * Both output arrays have one element per entry of ipflags and are
 * allocated on tmp_ctx, which here is a struct ctdb_context and is also
 * what CTDB_NO_MEMORY_FATAL is given on allocation failure.
 *
 *   *lcp2_imbalances      - lcp2_imbalance() of each node
 *   *rebalance_candidates - true for nodes that may receive addresses
 *                           during failback: initially every node, then
 *                           cleared for any node that already holds an
 *                           address, then set again for each valid pnn
 *                           listed in force_rebalance_nodes (may be NULL)
 *
 * A pnn in force_rebalance_nodes that is >= the number of nodes is
 * reported as unknown.
 */
1734 static void lcp2_init(struct ctdb_context *tmp_ctx,
1735 struct ctdb_ipflags *ipflags,
1736 struct public_ip_list *all_ips,
1737 uint32_t *force_rebalance_nodes,
1738 uint32_t **lcp2_imbalances,
1739 bool **rebalance_candidates)
1742 struct public_ip_list *tmp_ip;
1744 numnodes = talloc_array_length(ipflags);
1746 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1747 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1748 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1749 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1751 for (i=0; i<numnodes; i++) {
1752 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1753 /* First step: assume all nodes are candidates */
1754 (*rebalance_candidates)[i] = true;
1757 /* 2nd step: if a node has IPs assigned then it must have been
1758 * healthy before, so we remove it from consideration. This
1759 * is overkill but is all we have because we don't maintain
1760 * state between takeover runs. An alternative would be to
1761 * keep state and invalidate it every time the recovery master
1764 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1765 if (tmp_ip->pnn != -1) {
1766 (*rebalance_candidates)[tmp_ip->pnn] = false;
1770 /* 3rd step: if a node is forced to re-balance then
1771 we allow failback onto the node */
1772 if (force_rebalance_nodes == NULL) {
1775 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1776 uint32_t pnn = force_rebalance_nodes[i];
1777 if (pnn >= numnodes) {
1779 (__location__ "unknown node %u\n", pnn));
1784 ("Forcing rebalancing of IPs to node %u\n", pnn));
1785 (*rebalance_candidates)[pnn] = true;
1789 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1790 * the IP/node combination that will cost the least.
/*
 * Assign unassigned addresses (pnn == -1) one at a time using LCP2.
 *
 * Each pass of the outer loop considers every unassigned address
 * against every node for which can_node_takeover_ip() succeeds.  For a
 * pair, dstdsum is the ip_distance_2_sum() of the address against the
 * node's current addresses and dstimbl the node's resulting imbalance.
 * The pair with the smallest dstdsum is remembered; if one was found
 * the address is given to that node and lcp2_imbalances[] is updated
 * in place.  The list is then rescanned to see whether any address is
 * still unassigned.
 *
 * Addresses still unassigned when the loop ends are logged with a
 * warning.
 */
1792 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1793 struct ctdb_ipflags *ipflags,
1794 struct public_ip_list *all_ips,
1795 uint32_t *lcp2_imbalances)
1797 struct public_ip_list *tmp_ip;
1798 int dstnode, numnodes;
1801 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1802 struct public_ip_list *minip;
1804 bool should_loop = true;
1805 bool have_unassigned = true;
1807 numnodes = talloc_array_length(ipflags);
1809 while (have_unassigned && should_loop) {
1810 should_loop = false;
1812 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1813 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1819 /* loop over each unassigned ip. */
1820 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1821 if (tmp_ip->pnn != -1) {
1825 for (dstnode=0; dstnode<numnodes; dstnode++) {
1826 /* only check nodes that can actually takeover this ip */
1827 if (!can_node_takeover_ip(ctdb, dstnode,
1830 /* no it couldnt so skip to the next node */
1834 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1835 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1836 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1837 ctdb_addr_to_str(&(tmp_ip->addr)),
1839 dstimbl - lcp2_imbalances[dstnode]));
1842 if ((minnode == -1) || (dstdsum < mindsum)) {
1852 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1854 /* If we found one then assign it to the given node. */
1855 if (minnode != -1) {
1856 minip->pnn = minnode;
1857 lcp2_imbalances[minnode] = minimbl;
1858 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1859 ctdb_addr_to_str(&(minip->addr)),
1864 /* There might be a better way but at least this is clear. */
1865 have_unassigned = false;
1866 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1867 if (tmp_ip->pnn == -1) {
1868 have_unassigned = true;
1873 /* We know if we have any unassigned addresses so we might as
1876 if (have_unassigned) {
1877 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1878 if (tmp_ip->pnn == -1) {
1879 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1880 ctdb_addr_to_str(&tmp_ip->addr)));
1886 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1887 * to move IPs from, determines the best IP/destination node
1888 * combination to move from the source node.
/*
 * Try to move one address away from srcnode to improve balance.
 *
 * For each address on srcnode, srcdsum is what the address contributes
 * to the source node's imbalance and srcimbl the imbalance the source
 * would have without it.  Each node flagged in rebalance_candidates[]
 * that passes can_node_takeover_ip() is evaluated as a destination:
 * dstdsum is what the address would add there and dstimbl the
 * destination's resulting imbalance.
 *
 * A move qualifies only if dstimbl is below the source's current
 * imbalance and dstdsum is below srcdsum; among qualifying moves the
 * one with the smallest srcimbl + dstimbl is kept.  If a move was
 * found it is applied: both entries of lcp2_imbalances[] are updated
 * and the address's pnn is set to the destination.
 *
 * NOTE(review): the return statements are not visible here; presumably
 * true is returned when a move was made - confirm.
 */
1890 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1891 struct ctdb_ipflags *ipflags,
1892 struct public_ip_list *all_ips,
1894 uint32_t *lcp2_imbalances,
1895 bool *rebalance_candidates)
1897 int dstnode, mindstnode, numnodes;
1898 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1899 uint32_t minsrcimbl, mindstimbl;
1900 struct public_ip_list *minip;
1901 struct public_ip_list *tmp_ip;
1903 /* Find an IP and destination node that best reduces imbalance. */
1910 numnodes = talloc_array_length(ipflags);
1912 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1913 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1914 srcnode, lcp2_imbalances[srcnode]));
1916 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1917 /* Only consider addresses on srcnode. */
1918 if (tmp_ip->pnn != srcnode) {
1922 /* What is this IP address costing the source node? */
1923 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1924 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1926 /* Consider what this IP address would cost each potential
1927 * destination node. Destination nodes are limited to
1928 * those that are newly healthy, since we don't want
1929 * to do gratuitous failover of IPs just to make minor
1930 * balance improvements.
1932 for (dstnode=0; dstnode<numnodes; dstnode++) {
1933 if (!rebalance_candidates[dstnode]) {
1937 /* only check nodes that can actually takeover this ip */
1938 if (!can_node_takeover_ip(ctdb, dstnode,
1939 ipflags[dstnode], tmp_ip)) {
1940 /* no it couldnt so skip to the next node */
1944 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1945 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1946 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1948 ctdb_addr_to_str(&(tmp_ip->addr)),
1951 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1952 (dstdsum < srcdsum) && \
1953 ((mindstnode == -1) || \
1954 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1957 minsrcimbl = srcimbl;
1958 mindstnode = dstnode;
1959 mindstimbl = dstimbl;
1963 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1965 if (mindstnode != -1) {
1966 /* We found a move that makes things better... */
1967 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1968 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1969 ctdb_addr_to_str(&(minip->addr)),
1970 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1973 lcp2_imbalances[srcnode] = minsrcimbl;
1974 lcp2_imbalances[mindstnode] = mindstimbl;
1975 minip->pnn = mindstnode;
/* Element of the array that lcp2_failback() sorts with qsort(); it
 * carries a node's LCP2 imbalance in its "imbalance" member. */
1984 struct lcp2_imbalance_pnn {
/*
 * qsort() comparison callback for arrays of struct lcp2_imbalance_pnn:
 * orders two elements by their imbalance member, treating the "greater
 * than" and "equal" cases separately.
 * NOTE(review): the returned values are not visible here; lcp2_failback
 * walks the sorted array from index 0 expecting the most imbalanced
 * node first, so presumably the order is descending - confirm.
 */
1989 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1991 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1992 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1994 if (lipa->imbalance > lipb->imbalance) {
1996 } else if (lipa->imbalance == lipb->imbalance) {
2003 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2004 * node with the highest LCP2 imbalance, and then determines the best
2005 * IP/destination node combination to move from the source node.
/*
 * LCP2 failback: repeatedly pick a source node and let
 * lcp2_failback_candidate() try to move one address away from it.
 *
 * The per-node imbalances are copied into a temporary array of
 * struct lcp2_imbalance_pnn, sorted with lcp2_cmp_imbalance_pnn, and
 * the nodes are then offered to lcp2_failback_candidate() in sorted
 * order.  An entry with imbalance 0 means the remaining nodes cannot
 * be imbalanced.
 *
 * The temporary array is checked after allocation (fatal on failure,
 * as lcp2_init does for its arrays) so that an out-of-memory condition
 * is reported instead of being dereferenced in the fill loop.
 */
2007 static void lcp2_failback(struct ctdb_context *ctdb,
2008 struct ctdb_ipflags *ipflags,
2009 struct public_ip_list *all_ips,
2010 uint32_t *lcp2_imbalances,
2011 bool *rebalance_candidates)
2014 struct lcp2_imbalance_pnn * lips;
2017 numnodes = talloc_array_length(ipflags);
2020 /* Put the imbalances and nodes into an array, sort them and
2021 * iterate through candidates. Usually the 1st one will be
2022 * used, so this doesn't cost much...
2024 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2025 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2026 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
CTDB_NO_MEMORY_FATAL(ctdb, lips);
2027 for (i=0; i<numnodes; i++) {
2028 lips[i].imbalance = lcp2_imbalances[i];
2030 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2032 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2033 lcp2_cmp_imbalance_pnn);
2036 for (i=0; i<numnodes; i++) {
2037 /* This means that all nodes had 0 or 1 addresses, so
2038 * can't be imbalanced.
2040 if (lips[i].imbalance == 0) {
2044 if (lcp2_failback_candidate(ctdb,
2049 rebalance_candidates)) {
/*
 * Check every assigned address in all_ips against its current node
 * using can_node_host_ip(); an address whose node can no longer host it
 * is logged ("Unassign IP") so that it can be allocated again later.
 *
 * The test "!can_node_host_ip(...) != 0" is simply "!can_node_host_ip(...)":
 * the negation already yields 0 or 1.
 */
2061 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2062 struct ctdb_ipflags *ipflags,
2063 struct public_ip_list *all_ips)
2065 struct public_ip_list *tmp_ip;
2067 /* verify that the assigned nodes can serve that public ip
2068 and set it to -1 if not
2070 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2071 if (tmp_ip->pnn == -1) {
2074 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2075 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2076 /* this node can not serve this ip. */
2077 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2078 ctdb_addr_to_str(&(tmp_ip->addr)),
/*
 * Deterministic allocation: every address is first assigned to node
 * (position in all_ips) modulo (number of nodes), discarding whatever
 * allocation existed before.  Addresses that landed on a node unable
 * to host them are then unassigned and redistributed with
 * basic_allocate_unassigned().  No failback step is run; a warning is
 * logged if the no_ip_failback setting is 1, because it has no effect
 * with this algorithm.
 */
2085 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2086 struct ctdb_ipflags *ipflags,
2087 struct public_ip_list *all_ips)
2089 struct public_ip_list *tmp_ip;
2092 numnodes = talloc_array_length(ipflags);
2094 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2095 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2096 * always be allocated the same way for a specific set of
2097 * available/unavailable nodes.
2100 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2101 tmp_ip->pnn = i % numnodes;
2104 /* IP failback doesn't make sense with deterministic
2105 * IPs, since the modulo step above implicitly fails
2106 * back IPs to their "home" node.
2108 if (1 == ctdb->ipalloc_state->no_ip_failback) {
2109 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2112 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2114 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2116 /* No failback here! */
/*
 * Non-deterministic allocation: keep the existing assignments where
 * possible.  Unsuitable assignments are dropped, unassigned addresses
 * are allocated with basic_allocate_unassigned(), and - unless the
 * no_ip_failback setting is 1 - basic_failback() is run to even out
 * the distribution.
 * NOTE(review): the first loop walks all_ips before anything else;
 * presumably it counts the entries into num_ips, which is later passed
 * to basic_failback() - confirm.
 */
2119 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2120 struct ctdb_ipflags *ipflags,
2121 struct public_ip_list *all_ips)
2123 /* This should be pushed down into basic_failback. */
2124 struct public_ip_list *tmp_ip;
2126 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2130 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2132 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2134 /* If we don't want IPs to fail back then don't rebalance IPs. */
2135 if (1 == ctdb->ipalloc_state->no_ip_failback) {
2139 /* Now, try to make sure the ip addresses are evenly distributed
2142 basic_failback(ctdb, ipflags, all_ips, num_ips);
/*
 * LCP2 allocation driver.
 *
 * Working arrays are allocated by lcp2_init() under a temporary talloc
 * context that is freed before returning.  The steps are: drop
 * assignments to nodes that can no longer host them, compute the
 * initial imbalances and rebalance candidates, allocate unassigned
 * addresses, and finally run lcp2_failback().  Failback is not
 * attempted when the no_ip_failback setting is 1 or when no node is
 * flagged as a rebalance candidate.
 *
 * force_rebalance_nodes is passed through to lcp2_init().
 */
2145 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2146 struct ctdb_ipflags *ipflags,
2147 struct public_ip_list *all_ips,
2148 uint32_t *force_rebalance_nodes)
2150 uint32_t *lcp2_imbalances;
2151 bool *rebalance_candidates;
2152 int numnodes, num_rebalance_candidates, i;
2154 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2156 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2158 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2159 &lcp2_imbalances, &rebalance_candidates);
2161 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2163 /* If we don't want IPs to fail back then don't rebalance IPs. */
2164 if (1 == ctdb->ipalloc_state->no_ip_failback) {
2168 /* It is only worth continuing if we have suitable target
2169 * nodes to transfer IPs to. This check is much cheaper than
2172 numnodes = talloc_array_length(ipflags);
2173 num_rebalance_candidates = 0;
2174 for (i=0; i<numnodes; i++) {
2175 if (rebalance_candidates[i]) {
2176 num_rebalance_candidates++;
2179 if (num_rebalance_candidates == 0) {
2183 /* Now, try to make sure the ip addresses are evenly distributed
2186 lcp2_failback(ctdb, ipflags, all_ips,
2187 lcp2_imbalances, rebalance_candidates);
2190 talloc_free(tmp_ctx);
/*
 * Report whether every node in nodemap is unhealthy.  The loop looks
 * for a node that has neither NODE_FLAGS_INACTIVE nor
 * NODE_FLAGS_DISABLED set; finding one means not all nodes are
 * disabled.
 */
2193 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2197 for (i=0;i<nodemap->num;i++) {
2198 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2199 /* Found one completely healthy node */
2207 /* The calculation part of the IP allocation algorithm. */
/*
 * Run the allocation algorithm selected in
 * ctdb->ipalloc_state->algorithm over all_ips.  The result is left in
 * the pnn member of each list entry.  force_rebalance_nodes is only
 * handed to the LCP2 implementation.
 */
2208 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2209 struct ctdb_ipflags *ipflags,
2210 struct public_ip_list *all_ips,
2211 uint32_t *force_rebalance_nodes)
2213 switch (ctdb->ipalloc_state->algorithm) {
2215 ip_alloc_lcp2(ctdb, ipflags, all_ips, force_rebalance_nodes);
2217 case IPALLOC_DETERMINISTIC:
2218 ip_alloc_deterministic_ips(ctdb, ipflags, all_ips);
2220 case IPALLOC_NONDETERMINISTIC:
2221 ip_alloc_nondeterministic_ips(ctdb, ipflags, all_ips);
2225 /* at this point ->pnn is the node which will own each IP
2226 or -1 if there is no node that can cover this ip
/* State shared between get_tunable_from_nodes() and its callbacks:
 * the name of the tunable being fetched (used in log messages) plus
 * the "out" and "fatal" members that get_tunable_from_nodes() sets. */
2232 struct get_tunable_callback_data {
2233 const char *tunable;
/*
 * Success callback for the GET_TUNABLE control sent by
 * get_tunable_from_nodes().
 *
 * The reply must be exactly sizeof(uint32_t) bytes; anything else is
 * logged as an error.  pnn is also checked against the length of the
 * cd->out array before the value is stored in cd->out[pnn].
 */
2238 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2239 int32_t res, TDB_DATA outdata,
2242 struct get_tunable_callback_data *cd =
2243 (struct get_tunable_callback_data *)callback;
2247 /* Already handled in fail callback */
2251 if (outdata.dsize != sizeof(uint32_t)) {
2252 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2253 cd->tunable, pnn, (int)sizeof(uint32_t),
2254 (int)outdata.dsize));
2259 size = talloc_array_length(cd->out);
2261 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2262 cd->tunable, pnn, size));
/* Record this node's value; other slots keep their default */
2267 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/*
 * Failure callback for the GET_TUNABLE control sent by
 * get_tunable_from_nodes().  Logs one of three messages naming the
 * tunable and the node: the request timed out, the tunable is not
 * implemented on that node, or an unexpected error occurred.
 * NOTE(review): the conditions selecting each message, and whether
 * cd->fatal is set for some of them, are not visible here - confirm.
 */
2270 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2271 int32_t res, TDB_DATA outdata,
2274 struct get_tunable_callback_data *cd =
2275 (struct get_tunable_callback_data *)callback;
2280 ("Timed out getting tunable \"%s\" from node %d\n",
2286 DEBUG(DEBUG_WARNING,
2287 ("Tunable \"%s\" not implemented on node %d\n",
2292 ("Unexpected error getting tunable \"%s\" from node %d\n",
/*
 * Fetch the value of one tunable from every connected node.
 *
 * Returns an array of nodemap->num values allocated on tmp_ctx.  Every
 * slot starts out as default_value and is overwritten by
 * get_tunable_callback() for nodes that answer.  NULL is returned when
 * memory cannot be allocated.
 *
 * The request is a struct ctdb_control_get_tunable with the
 * NUL-terminated tunable name appended; it is sent with
 * ctdb_client_async_control() using TAKEOVER_TIMEOUT().  The request
 * buffer is checked after allocation so that an out-of-memory
 * condition returns NULL instead of writing through a NULL pointer.
 */
2298 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2299 TALLOC_CTX *tmp_ctx,
2300 struct ctdb_node_map_old *nodemap,
2301 const char *tunable,
2302 uint32_t default_value)
2305 struct ctdb_control_get_tunable *t;
2308 struct get_tunable_callback_data callback_data;
2311 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2312 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2313 for (i=0; i<nodemap->num; i++) {
2314 tvals[i] = default_value;
2317 callback_data.out = tvals;
2318 callback_data.tunable = tunable;
2319 callback_data.fatal = false;
2321 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2322 data.dptr = talloc_size(tmp_ctx, data.dsize);
CTDB_NO_MEMORY_NULL(ctdb, data.dptr);
2323 t = (struct ctdb_control_get_tunable *)data.dptr;
2324 t->length = strlen(tunable)+1;
2325 memcpy(t->name, tunable, t->length);
2326 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2327 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2328 nodes, 0, TAKEOVER_TIMEOUT(),
2330 get_tunable_callback,
2331 get_tunable_fail_callback,
2332 &callback_data) != 0) {
2333 if (callback_data.fatal) {
2339 talloc_free(data.dptr);
2344 /* Set internal flags for IP allocation:
2346 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2347 * Set NOIPHOST ip flag for each INACTIVE node
2348 * if all nodes are disabled:
2349 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2351 * Set NOIPHOST ip flags for disabled nodes
/*
 * Derive the per-node allocation flags from the node map and the two
 * per-node tunable arrays.
 *
 * Returns a zero-initialised array of nodemap->num entries allocated
 * on tmp_ctx in which:
 *   noiptakeover - set when tval_noiptakeover[i] is non-zero
 *   noiphost     - set for INACTIVE nodes; additionally, if every node
 *                  is disabled, for nodes whose
 *                  tval_noiphostonalldisabled[i] is non-zero, otherwise
 *                  for nodes flagged DISABLED
 *
 * Both tunable arrays are indexed like nodemap->nodes[].
 */
2353 static struct ctdb_ipflags *
2354 set_ipflags_internal(struct ctdb_context *ctdb,
2355 TALLOC_CTX *tmp_ctx,
2356 struct ctdb_node_map_old *nodemap,
2357 uint32_t *tval_noiptakeover,
2358 uint32_t *tval_noiphostonalldisabled)
2361 struct ctdb_ipflags *ipflags;
2363 /* Clear IP flags - implicit due to talloc_zero */
2364 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2365 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2367 for (i=0;i<nodemap->num;i++) {
2368 /* Can not take IPs on node with NoIPTakeover set */
2369 if (tval_noiptakeover[i] != 0) {
2370 ipflags[i].noiptakeover = true;
2373 /* Can not host IPs on INACTIVE node */
2374 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2375 ipflags[i].noiphost = true;
2379 if (all_nodes_are_disabled(nodemap)) {
2380 /* If all nodes are disabled, can not host IPs on node
2381 * with NoIPHostOnAllDisabled set
2383 for (i=0;i<nodemap->num;i++) {
2384 if (tval_noiphostonalldisabled[i] != 0) {
2385 ipflags[i].noiphost = true;
2389 /* If some nodes are not disabled, then can not host
2390 * IPs on DISABLED node
2392 for (i=0;i<nodemap->num;i++) {
2393 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2394 ipflags[i].noiphost = true;
/*
 * Build the per-node allocation flags for a takeover run.
 *
 * Two tunables are fetched from the nodes with get_tunable_from_nodes()
 * - the first into tval_noiptakeover, the second being
 * "NoIPHostOnAllDisabled" with default 0 - and combined with the node
 * map by set_ipflags_internal().  The intermediate tunable arrays are
 * freed once the flags have been computed.  A NULL result from either
 * fetch is treated as failure; everything is allocated on tmp_ctx,
 * which the caller frees.
 */
2402 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2403 TALLOC_CTX *tmp_ctx,
2404 struct ctdb_node_map_old *nodemap)
2406 uint32_t *tval_noiptakeover;
2407 uint32_t *tval_noiphostonalldisabled;
2408 struct ctdb_ipflags *ipflags;
2411 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2413 if (tval_noiptakeover == NULL) {
2417 tval_noiphostonalldisabled =
2418 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2419 "NoIPHostOnAllDisabled", 0);
2420 if (tval_noiphostonalldisabled == NULL) {
2421 /* Caller frees tmp_ctx */
2425 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2427 tval_noiphostonalldisabled);
/* The raw tunable values are no longer needed */
2429 talloc_free(tval_noiptakeover);
2430 talloc_free(tval_noiphostonalldisabled);
/*
 * Allocate and initialise the state used by one IP allocation run.
 *
 * The state is allocated on mem_ctx.  It records the number of nodes
 * (ctdb->num_nodes) and owns two zero-filled pointer arrays of that
 * length, known_public_ips and available_public_ips.  If either array
 * cannot be allocated the state is freed again and an out-of-memory
 * error is logged.
 *
 * The algorithm is chosen from the tunables: LCP2 if
 * lcp2_public_ip_assignment is 1, otherwise deterministic if
 * deterministic_public_ips is 1, otherwise non-deterministic.  The
 * no_ip_failback tunable is copied into the state.
 */
2435 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2436 TALLOC_CTX *mem_ctx)
2438 struct ipalloc_state *ipalloc_state =
2439 talloc_zero(mem_ctx, struct ipalloc_state);
2440 if (ipalloc_state == NULL) {
2441 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2445 ipalloc_state->num = ctdb->num_nodes;
2446 ipalloc_state->known_public_ips =
2447 talloc_zero_array(ipalloc_state,
2448 struct ctdb_public_ip_list_old *,
2449 ipalloc_state->num);
2450 if (ipalloc_state->known_public_ips == NULL) {
2451 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2452 talloc_free(ipalloc_state);
2455 ipalloc_state->available_public_ips =
2456 talloc_zero_array(ipalloc_state,
2457 struct ctdb_public_ip_list_old *,
2458 ipalloc_state->num);
2459 if (ipalloc_state->available_public_ips == NULL) {
2460 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2461 talloc_free(ipalloc_state);
/* LCP2 takes precedence over the deterministic setting */
2465 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2466 ipalloc_state->algorithm = IPALLOC_LCP2;
2467 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2468 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2470 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2473 ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2475 return ipalloc_state;
/* Context handed to iprealloc_fail_callback(): the caller's own
 * failure callback and its argument, plus the node map used to look up
 * node flags.  ctdb_takeover_run() also fills in retry_nodes and
 * retry_count. */
2478 struct iprealloc_callback_data {
2481 client_async_callback fail_callback;
2482 void *fail_callback_data;
2483 struct ctdb_node_map_old *nodemap;
/*
 * Failure callback for the IPREALLOCATED control.
 *
 * pnn is used as an index into both cd->nodemap->nodes[] and
 * cd->retry_nodes[], so it must be strictly less than the length of
 * retry_nodes; a pnn equal to that length is rejected as well, since
 * it would be one past the end of the array.
 *
 * Failures from INACTIVE nodes are ignored with a warning.  A timeout
 * is passed on to the caller's fail_callback if one is registered;
 * any other failure flags the node in cd->retry_nodes[] so that
 * ctdb_takeover_run() can retry it via the eventscript control.
 */
2486 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2487 int32_t res, TDB_DATA outdata,
2491 struct iprealloc_callback_data *cd =
2492 (struct iprealloc_callback_data *)callback;
2494 numnodes = talloc_array_length(cd->retry_nodes);
2495 if (pnn >= numnodes) {
2497 ("ipreallocated failure from node %d, "
2498 "but only %d nodes in nodemap\n",
2503 /* Can't run the "ipreallocated" event on a INACTIVE node */
2504 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2505 DEBUG(DEBUG_WARNING,
2506 ("ipreallocated failed on inactive node %d, ignoring\n",
2513 /* If the control timed out then that's a real error,
2514 * so call the real fail callback
2516 if (cd->fail_callback) {
2517 cd->fail_callback(ctdb, pnn, res, outdata,
2518 cd->fail_callback_data);
2520 DEBUG(DEBUG_WARNING,
2521 ("iprealloc timed out but no callback registered\n"));
2525 /* If not a timeout then either the ipreallocated
2526 * eventscript (or some setup) failed. This might
2527 * have failed because the IPREALLOCATED control isn't
2528 * implemented - right now there is no way of knowing
2529 * because the error codes are all folded down to -1.
2530 * Consider retrying using EVENTSCRIPT control...
2532 DEBUG(DEBUG_WARNING,
2533 ("ipreallocated failure from node %d, flagging retry\n",
2535 cd->retry_nodes[pnn] = true;
/* Context handed to takeover_run_fail_callback(): the caller's failure
 * callback and its argument, plus the node map used to translate a PNN
 * into an index.  ctdb_takeover_run() also fills in node_failed. */
2540 struct takeover_callback_data {
2542 client_async_callback fail_callback;
2543 void *fail_callback_data;
2544 struct ctdb_node_map_old *nodemap;
/*
 * Failure callback used for the RELEASE_IP controls sent by
 * ctdb_takeover_run().
 *
 * node_pnn is looked up in cd->nodemap to find its index; a PNN that is
 * not in the map is logged as invalid.  The caller's fail_callback is
 * invoked at most once per node: cd->node_failed[] remembers nodes that
 * have already been reported.
 *
 * NOTE(review): cd->fail_callback is called without a NULL check here,
 * whereas iprealloc_fail_callback() tests it first - confirm callers
 * never pass a NULL fail_callback to ctdb_takeover_run().
 */
2547 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2548 uint32_t node_pnn, int32_t res,
2549 TDB_DATA outdata, void *callback_data)
2551 struct takeover_callback_data *cd =
2552 talloc_get_type_abort(callback_data,
2553 struct takeover_callback_data);
2556 for (i = 0; i < cd->nodemap->num; i++) {
2557 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2562 if (i == cd->nodemap->num) {
2563 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2567 if (!cd->node_failed[i]) {
2568 cd->node_failed[i] = true;
2569 cd->fail_callback(ctdb, node_pnn, res, outdata,
2570 cd->fail_callback_data);
2575 make any IP alias changes for public addresses that are necessary
2577 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2578 uint32_t *force_rebalance_nodes,
2579 client_async_callback fail_callback, void *callback_data)
2582 struct ctdb_public_ip ip;
2584 struct public_ip_list *all_ips, *tmp_ip;
2586 struct timeval timeout;
2587 struct client_async_data *async_data;
2588 struct ctdb_client_control_state *state;
2589 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2590 struct ctdb_ipflags *ipflags;
2591 struct ipalloc_state *ipalloc_state;
2592 struct takeover_callback_data *takeover_data;
2593 struct iprealloc_callback_data iprealloc_data;
2598 * ip failover is completely disabled, just send out the
2599 * ipreallocated event.
2601 if (ctdb->tunable.disable_ip_failover != 0) {
2605 ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2606 if (ipalloc_state == NULL) {
2607 talloc_free(tmp_ctx);
2610 ctdb->ipalloc_state = ipalloc_state;
2612 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2613 if (ipflags == NULL) {
2614 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2615 talloc_free(tmp_ctx);
2619 /* Fetch known/available public IPs from each active node */
2620 ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2622 talloc_free(tmp_ctx);
2626 /* Short-circuit IP allocation if no node has available IPs */
2627 can_host_ips = false;
2628 for (i=0; i < ipalloc_state->num; i++) {
2629 if (ipalloc_state->available_public_ips[i] != NULL) {
2630 can_host_ips = true;
2633 if (!can_host_ips) {
2634 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2638 /* since nodes only know about those public addresses that
2639 can be served by that particular node, no single node has
2640 a full list of all public addresses that exist in the cluster.
2641 Walk over all node structures and create a merged list of
2642 all public addresses that exist in the cluster.
2644 keep the tree of ips around as ctdb->ip_tree
2646 all_ips = create_merged_ip_list(ctdb);
2648 /* Do the IP reassignment calculations */
2649 ctdb_takeover_run_core(ctdb, ipflags, all_ips, force_rebalance_nodes);
2651 /* Now tell all nodes to release any public IPs should not
2652 * host. This will be a NOOP on nodes that don't currently
2653 * hold the given IP.
2655 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2656 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2658 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2659 bool, nodemap->num);
2660 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2661 takeover_data->fail_callback = fail_callback;
2662 takeover_data->fail_callback_data = callback_data;
2663 takeover_data->nodemap = nodemap;
2665 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2666 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2668 async_data->fail_callback = takeover_run_fail_callback;
2669 async_data->callback_data = takeover_data;
2671 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2673 /* Send a RELEASE_IP to all nodes that should not be hosting
2674 * each IP. For each IP, all but one of these will be
2675 * redundant. However, the redundant ones are used to tell
2676 * nodes which node should be hosting the IP so that commands
2677 * like "ctdb ip" can display a particular nodes idea of who
2678 * is hosting what. */
2679 for (i=0;i<nodemap->num;i++) {
2680 /* don't talk to unconnected nodes, but do talk to banned nodes */
2681 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2685 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2686 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2687 /* This node should be serving this
2688 vnn so don't tell it to release the ip
2692 ip.pnn = tmp_ip->pnn;
2693 ip.addr = tmp_ip->addr;
2695 timeout = TAKEOVER_TIMEOUT();
2696 data.dsize = sizeof(ip);
2697 data.dptr = (uint8_t *)&ip;
2698 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2699 0, CTDB_CONTROL_RELEASE_IP, 0,
2702 if (state == NULL) {
2703 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2704 talloc_free(tmp_ctx);
2708 ctdb_client_async_add(async_data, state);
2711 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2712 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2713 talloc_free(tmp_ctx);
2716 talloc_free(async_data);
2719 /* For each IP, send a TAKOVER_IP to the node that should be
2720 * hosting it. Many of these will often be redundant (since
2721 * the allocation won't have changed) but they can be useful
2722 * to recover from inconsistencies. */
2723 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2724 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2726 async_data->fail_callback = fail_callback;
2727 async_data->callback_data = callback_data;
2729 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2730 if (tmp_ip->pnn == -1) {
2731 /* this IP won't be taken over */
2735 ip.pnn = tmp_ip->pnn;
2736 ip.addr = tmp_ip->addr;
2738 timeout = TAKEOVER_TIMEOUT();
2739 data.dsize = sizeof(ip);
2740 data.dptr = (uint8_t *)&ip;
2741 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2742 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2743 data, async_data, &timeout, NULL);
2744 if (state == NULL) {
2745 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2746 talloc_free(tmp_ctx);
2750 ctdb_client_async_add(async_data, state);
2752 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2753 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2754 talloc_free(tmp_ctx);
2760 * Tell all nodes to run eventscripts to process the
2761 * "ipreallocated" event. This can do a lot of things,
2762 * including restarting services to reconfigure them if public
2763 * IPs have moved. Once upon a time this event only used to
2766 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2767 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2768 iprealloc_data.retry_nodes = retry_data;
2769 iprealloc_data.retry_count = 0;
2770 iprealloc_data.fail_callback = fail_callback;
2771 iprealloc_data.fail_callback_data = callback_data;
2772 iprealloc_data.nodemap = nodemap;
2774 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2775 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2776 nodes, 0, TAKEOVER_TIMEOUT(),
2778 NULL, iprealloc_fail_callback,
2781 /* If the control failed then we should retry to any
2782 * nodes flagged by iprealloc_fail_callback using the
2783 * EVENTSCRIPT control. This is a best-effort at
2784 * backward compatibility when running a mixed cluster
2785 * where some nodes have not yet been upgraded to
2786 * support the IPREALLOCATED control.
2788 DEBUG(DEBUG_WARNING,
2789 ("Retry ipreallocated to some nodes using eventscript control\n"));
2791 nodes = talloc_array(tmp_ctx, uint32_t,
2792 iprealloc_data.retry_count);
2793 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2796 for (i=0; i<nodemap->num; i++) {
2797 if (iprealloc_data.retry_nodes[i]) {
2803 data.dptr = discard_const("ipreallocated");
2804 data.dsize = strlen((char *)data.dptr) + 1;
2805 ret = ctdb_client_async_control(ctdb,
2806 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2807 nodes, 0, TAKEOVER_TIMEOUT(),
2809 NULL, fail_callback,
2812 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2816 talloc_free(tmp_ctx);
2822 destroy a ctdb_client_ip structure
/*
 * talloc destructor for a struct ctdb_client_ip; it is installed with
 * talloc_set_destructor() in ctdb_control_tcp_client().  Logs the
 * address, port and client id of the entry being destroyed and unlinks
 * it from ctdb->client_ip_list.
 *
 * NOTE(review): the port is always read through the IPv4 member
 * (addr.ip.sin_port), whatever the address family of ip->addr.
 */
2824 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2826 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2827 ctdb_addr_to_str(&ip->addr),
2828 ntohs(ip->addr.ip.sin_port),
/* Unlink from the daemon-wide list so no stale pointer stays behind. */
2831 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2836 called by a client to inform us of a TCP connection that it is managing
2837 and that should be tickled with an ACK when IP takeover is done
/*
 * Control handler: a local client (looked up from client_id through
 * reqid_find) registers a TCP connection, passed as a struct
 * ctdb_connection in indata, that should be tickled after IP takeover.
 *
 * Visible steps: canonicalise both endpoints, look up the public
 * address (vnn) for the destination, refuse addresses this node does
 * not hold, record a ctdb_client_ip and a ctdb_tcp_list entry under the
 * client talloc context, then broadcast CTDB_CONTROL_TCP_ADD (no reply
 * requested) to the connected nodes.
 *
 * NOTE(review): no NULL check of client and no size check of
 * indata.dsize are visible before they are used - confirm against the
 * full function.
 */
2839 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2842 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2843 struct ctdb_connection *tcp_sock = NULL;
2844 struct ctdb_tcp_list *tcp;
2845 struct ctdb_connection t;
2848 struct ctdb_client_ip *ip;
2849 struct ctdb_vnn *vnn;
2850 ctdb_sock_addr addr;
2852 /* If we don't have public IPs, tickles are useless */
2853 if (ctdb->vnn == NULL) {
2857 tcp_sock = (struct ctdb_connection *)indata.dptr;
/*
 * Canonicalise src and dst in place, using addr as a scratch copy of
 * the input.  Presumably ctdb_canonicalize_ip takes (input, output) -
 * the same order is used in ctdb_killtcp_add_connection.
 */
2859 addr = tcp_sock->src;
2860 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2861 addr = tcp_sock->dst;
2862 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
/* The destination is the address that has to be one of our public IPs. */
2865 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2866 vnn = find_public_ip_vnn(ctdb, &addr);
2868 switch (addr.sa.sa_family) {
2870 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2871 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2872 ctdb_addr_to_str(&addr)));
2876 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2877 ctdb_addr_to_str(&addr)));
2880 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
/* Only the node that currently holds the public IP accepts the registration. */
2886 if (vnn->pnn != ctdb->pnn) {
2887 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2888 ctdb_addr_to_str(&addr),
2889 client_id, client->pid));
2890 /* failing this call will tell smbd to die */
/* Both records are talloc children of client, so they go away with it. */
2894 ip = talloc(client, struct ctdb_client_ip);
2895 CTDB_NO_MEMORY(ctdb, ip);
2899 ip->client_id = client_id;
2900 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2901 DLIST_ADD(ctdb->client_ip_list, ip);
2903 tcp = talloc(client, struct ctdb_tcp_list);
2904 CTDB_NO_MEMORY(ctdb, tcp);
2906 tcp->connection.src = tcp_sock->src;
2907 tcp->connection.dst = tcp_sock->dst;
2909 DLIST_ADD(client->tcp_list, tcp);
/* t is the payload of the TCP_ADD broadcast sent below. */
2911 t.src = tcp_sock->src;
2912 t.dst = tcp_sock->dst;
2914 data.dptr = (uint8_t *)&t;
2915 data.dsize = sizeof(t);
/* Log the registration, reading the ports through the member of the matching family. */
2917 switch (addr.sa.sa_family) {
2919 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2920 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2921 ctdb_addr_to_str(&tcp_sock->src),
2922 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2925 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2926 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2927 ctdb_addr_to_str(&tcp_sock->src),
2928 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2931 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2935 /* tell all nodes about this tcp connection */
2936 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2937 CTDB_CONTROL_TCP_ADD,
2938 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2940 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2948 find a tcp address on a list
/*
 * Linear search of array for a connection whose src and dst both match
 * tcp according to ctdb_same_sockaddr().
 *
 * Returns a pointer to the matching element inside array->connections.
 * A NULL array is handled explicitly; the value returned for that case
 * and for "no match" is not visible here - presumably NULL, since the
 * caller in ctdb_control_tcp_add compares the result against NULL.
 */
2950 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2951 struct ctdb_connection *tcp)
2955 if (array == NULL) {
2959 for (i=0;i<array->num;i++) {
2960 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2961 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2962 return &array->connections[i];
2971 called by a daemon to inform us of a TCP connection that one of its
2972 clients is managing and that should be tickled with an ACK when IP takeover is
/*
 * Control handler: add the connection carried in indata (a struct
 * ctdb_connection) to the tickle array of the public address that
 * matches its destination.
 *
 * ctdb              - daemon context
 * indata            - payload, cast to struct ctdb_connection
 * tcp_update_needed - when true, vnn->tcp_update_needed is set after a
 *                     connection has been stored
 *
 * The array is created on first use with room for one entry; later
 * additions first check for a duplicate with ctdb_tcp_find() and
 * otherwise grow the array with talloc_realloc().
 */
2975 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2977 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2978 struct ctdb_tcp_array *tcparray;
2979 struct ctdb_connection tcp;
2980 struct ctdb_vnn *vnn;
2982 /* If we don't have public IPs, tickles are useless */
2983 if (ctdb->vnn == NULL) {
2987 vnn = find_public_ip_vnn(ctdb, &p->dst);
2989 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2990 ctdb_addr_to_str(&p->dst)));
2996 tcparray = vnn->tcp_array;
2998 /* If this is the first tickle */
2999 if (tcparray == NULL) {
/* The array hangs off the vnn; its element storage hangs off the array. */
3000 tcparray = talloc(vnn, struct ctdb_tcp_array);
3001 CTDB_NO_MEMORY(ctdb, tcparray);
3002 vnn->tcp_array = tcparray;
3005 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3006 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3008 tcparray->connections[tcparray->num].src = p->src;
3009 tcparray->connections[tcparray->num].dst = p->dst;
3012 if (tcp_update_needed) {
3013 vnn->tcp_update_needed = true;
3019 /* Do we already have this tickle ?*/
/* tcp is presumably filled from p on lines not visible here - TODO confirm. */
3022 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3023 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3024 ctdb_addr_to_str(&tcp.dst),
3025 ntohs(tcp.dst.ip.sin_port),
3030 /* A new tickle, we must add it to the array */
/*
 * NOTE(review): the talloc_realloc() result is stored straight back into
 * tcparray->connections, so a failed reallocation overwrites the only
 * pointer kept in the array structure while tcparray->num is unchanged.
 */
3031 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3032 struct ctdb_connection,
3034 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3036 tcparray->connections[tcparray->num].src = p->src;
3037 tcparray->connections[tcparray->num].dst = p->dst;
3040 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3041 ctdb_addr_to_str(&tcp.dst),
3042 ntohs(tcp.dst.ip.sin_port),
3045 if (tcp_update_needed) {
3046 vnn->tcp_update_needed = true;
3054 called by a daemon to inform us of a TCP connection that one of its
3055 clients is managing and that should be tickled with an ACK when IP takeover is
/*
 * Remove conn from the tickle array of the public address that matches
 * conn->dst (looked up with find_public_ip_vnn).
 *
 * Logs and, as far as visible, does nothing else when the address is
 * unknown, when the vnn has no tcp_array, or when ctdb_tcp_find() does
 * not find the connection.  Otherwise the found slot is overwritten
 * with the last array element and num is decremented, so element order
 * is not preserved.  When num reaches 0 the whole array is freed and
 * vnn->tcp_array is reset to NULL.  vnn->tcp_update_needed is set after
 * a removal.
 *
 * NOTE(review): the final "Removed tickle info" message prints
 * conn->src while the earlier messages print conn->dst.
 */
3058 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3060 struct ctdb_connection *tcpp;
3061 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3064 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3065 ctdb_addr_to_str(&conn->dst)));
3069 /* if the array is empty we cant remove it
3070 and we don't need to do anything
3072 if (vnn->tcp_array == NULL) {
3073 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3074 ctdb_addr_to_str(&conn->dst),
3075 ntohs(conn->dst.ip.sin_port)));
3080 /* See if we know this connection
3081 if we don't know this connection then we dont need to do anything
3083 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3085 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3086 ctdb_addr_to_str(&conn->dst),
3087 ntohs(conn->dst.ip.sin_port)));
3092 /* We need to remove this entry from the array.
3093 Instead of allocating a new array and copying data to it
3094 we cheat and just copy the last entry in the existing array
3095 to the entry that is to be removed and just shring the
3098 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3099 vnn->tcp_array->num--;
3101 /* If we deleted the last entry we also need to remove the entire array
3103 if (vnn->tcp_array->num == 0) {
3104 talloc_free(vnn->tcp_array);
3105 vnn->tcp_array = NULL;
3108 vnn->tcp_update_needed = true;
3110 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3111 ctdb_addr_to_str(&conn->src),
3112 ntohs(conn->src.ip.sin_port)));
3117 called by a daemon to inform us of a TCP connection that one of its
3118 clients used are no longer needed in the tickle database
/*
 * Control handler: treat indata as a struct ctdb_connection and remove
 * it from the tickle data via ctdb_remove_connection().  A daemon with
 * no public addresses (ctdb->vnn == NULL) takes the early branch and
 * does not attempt the removal.
 *
 * NOTE(review): no check of indata.dsize is visible before the cast.
 */
3120 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3122 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3124 /* If we don't have public IPs, tickles are useless */
3125 if (ctdb->vnn == NULL) {
3129 ctdb_remove_connection(ctdb, conn);
3136 Called when another daemon starts - causes all tickles for all
3137 public addresses we are serving to be sent to the new node on the
3138 next check. This actually causes the next scheduled call to
3139 ctdb_update_tcp_tickles() to update all nodes. This is simple and
3140 doesn't require careful error handling.
/*
 * Control handler for a startup notification from node pnn.  Logs the
 * sender and marks every public address (vnn) as needing a tickle
 * update, so that ctdb_update_tcp_tickles() resends the lists of the
 * addresses this node holds on its next run.
 */
3142 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3144 struct ctdb_vnn *vnn;
3146 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3147 (unsigned long) pnn));
/* Flag all addresses; the periodic updater filters on vnn->pnn itself. */
3149 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3150 vnn->tcp_update_needed = true;
3158 called when a client structure goes away - hook to remove
3159 elements from the tcp_list in all daemons
/*
 * Hook for the teardown of a struct ctdb_client: drains
 * client->tcp_list, unlinking each entry and removing its connection
 * from the tickle data with ctdb_remove_connection().
 */
3161 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
/* Always take the list head; DLIST_REMOVE advances the list. */
3163 while (client->tcp_list) {
3164 struct ctdb_tcp_list *tcp = client->tcp_list;
3165 DLIST_REMOVE(client->tcp_list, tcp);
3166 ctdb_remove_connection(client->ctdb, &tcp->connection);
/*
 * Walk all public addresses and release those present on this host.
 *
 * The tunable disable_ip_failover == 1 is tested first (its branch body
 * is not visible here - presumably an early return).  For each vnn:
 *  - if ctdb_sys_have_ip() says the address is not configured, only the
 *    interface assignment is dropped;
 *  - if vnn->update_in_flight is set a warning is logged instead of
 *    starting a second release;
 *  - otherwise the CTDB_EVENT_RELEASE_IP event is run with the
 *    arguments "<iface> <address> <netmask bits>", clients of the
 *    address are killed with release_kill_clients() and the interface
 *    assignment is dropped, with update_in_flight set for the duration.
 * The number of released addresses is logged at the end.
 */
3171 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3173 struct ctdb_vnn *vnn;
3176 if (ctdb->tunable.disable_ip_failover == 1) {
3180 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3181 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3182 ctdb_vnn_unassign_iface(ctdb, vnn);
3189 /* Don't allow multiple releases at once. Some code,
3190 * particularly ctdb_tickle_sentenced_connections() is
3192 if (vnn->update_in_flight) {
3193 DEBUG(DEBUG_WARNING,
3195 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3196 ctdb_addr_to_str(&vnn->public_address),
3197 vnn->public_netmask_bits,
3198 ctdb_vnn_iface_string(vnn)));
3201 vnn->update_in_flight = true;
3203 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3204 ctdb_addr_to_str(&vnn->public_address),
3205 vnn->public_netmask_bits,
3206 ctdb_vnn_iface_string(vnn)));
3208 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3209 ctdb_vnn_iface_string(vnn),
3210 ctdb_addr_to_str(&vnn->public_address),
3211 vnn->public_netmask_bits);
3212 release_kill_clients(ctdb, &vnn->public_address);
3213 ctdb_vnn_unassign_iface(ctdb, vnn);
3214 vnn->update_in_flight = false;
3218 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3223 get list of public IPs
/*
 * Control handler: return the list of public addresses in outdata as a
 * struct ctdb_public_ip_list_old.
 *
 * When the request carries CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, only
 * addresses for which ctdb_vnn_available() is true are reported.
 *
 * The buffer is allocated (zeroed, as a talloc child of outdata) for
 * one entry per vnn, then filled with pnn and address of each reported
 * vnn.  The final length is recomputed from the number of entries
 * actually written, so it can be smaller than the allocation.
 */
3225 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3226 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3229 struct ctdb_public_ip_list_old *ips;
3230 struct ctdb_vnn *vnn;
3231 bool only_available = false;
3233 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3234 only_available = true;
3237 /* count how many public ip structures we have */
3239 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
/* Size for the worst case: every vnn gets reported. */
3243 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3244 num*sizeof(struct ctdb_public_ip);
3245 ips = talloc_zero_size(outdata, len);
3246 CTDB_NO_MEMORY(ctdb, ips);
3249 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3250 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3253 ips->ips[i].pnn = vnn->pnn;
3254 ips->ips[i].addr = vnn->public_address;
/* Report only the part of the buffer that was filled in. */
3258 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3259 i*sizeof(struct ctdb_public_ip);
3261 outdata->dsize = len;
3262 outdata->dptr = (uint8_t *)ips;
/*
 * Control handler: return details about one public address in outdata
 * as a struct ctdb_public_ip_info_old.
 *
 * indata is cast to a ctdb_sock_addr and looked up with
 * find_public_ip_vnn(); ctdb->single_ip_vnn is considered as well when
 * its address matches.  An address that is neither is logged as an
 * error.
 *
 * The reply holds the address and pnn of the vnn plus one ctdb_iface
 * entry (name, link state, reference count) per interface name in
 * vnn->ifaces.  active_idx is the index of the entry that equals
 * vnn->iface, or 0xFFFFFFFF when none does.
 *
 * NOTE(review): no check of indata.dsize is visible before the cast.
 */
3268 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3269 struct ctdb_req_control_old *c,
3274 ctdb_sock_addr *addr;
3275 struct ctdb_public_ip_info_old *info;
3276 struct ctdb_vnn *vnn;
3278 addr = (ctdb_sock_addr *)indata.dptr;
3280 vnn = find_public_ip_vnn(ctdb, addr);
3282 /* if it is not a public ip it could be our 'single ip' */
3283 if (ctdb->single_ip_vnn) {
3284 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3285 vnn = ctdb->single_ip_vnn;
3290 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3291 "'%s'not a public address\n",
3292 ctdb_addr_to_str(addr)));
3296 /* count how many interfaces this public ip can use */
3298 for (;vnn->ifaces[num];) {
3302 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3303 num*sizeof(struct ctdb_iface);
3304 info = talloc_zero_size(outdata, len);
3305 CTDB_NO_MEMORY(ctdb, info);
3307 info->ip.addr = vnn->public_address;
3308 info->ip.pnn = vnn->pnn;
/* Default: no interface of the list is the active one. */
3309 info->active_idx = 0xFFFFFFFF;
3311 for (i=0; vnn->ifaces[i]; i++) {
3312 struct ctdb_interface *cur;
3314 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3316 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3320 if (vnn->iface == cur) {
3321 info->active_idx = i;
/* Copies at most size-1 bytes into the zeroed buffer, so the name stays terminated. */
3323 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3324 info->ifaces[i].link_state = cur->link_up;
3325 info->ifaces[i].references = cur->references;
3328 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3329 i*sizeof(struct ctdb_iface);
3331 outdata->dsize = len;
3332 outdata->dptr = (uint8_t *)info;
/*
 * Control handler: return all interfaces known to the daemon
 * (ctdb->ifaces) in outdata as a struct ctdb_iface_list_old, one
 * ctdb_iface entry (name, link state, reference count) per interface.
 * The buffer is a zeroed talloc child of outdata.
 *
 * NOTE(review): the name is copied with an unbounded strcpy(), unlike
 * the bounded strncpy() used in ctdb_control_get_public_ip_info().  It
 * is only safe if cur->name is guaranteed to fit in
 * ifaces->ifaces[i].name - confirm where interfaces are created.
 */
3337 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3338 struct ctdb_req_control_old *c,
3342 struct ctdb_iface_list_old *ifaces;
3343 struct ctdb_interface *cur;
3345 /* count how many interfaces we have */
3347 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3351 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3352 num*sizeof(struct ctdb_iface);
3353 ifaces = talloc_zero_size(outdata, len);
3354 CTDB_NO_MEMORY(ctdb, ifaces);
3357 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3358 strcpy(ifaces->ifaces[i].name, cur->name);
3359 ifaces->ifaces[i].link_state = cur->link_up;
3360 ifaces->ifaces[i].references = cur->references;
/* Length is recomputed from the number of entries written. */
3364 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3365 i*sizeof(struct ctdb_iface);
3367 outdata->dsize = len;
3368 outdata->dptr = (uint8_t *)ifaces;
/*
 * Control handler: set the link state of a named interface from a
 * struct ctdb_iface passed in indata.
 *
 * Validation visible here:
 *  - info->name must have a NUL byte at index CTDB_IFACE_SIZE;
 *  - info->link_state is checked by a switch whose accepted values are
 *    not visible in this listing; other values are logged as invalid;
 *  - info->references must be 0.
 * The interface is then looked up with ctdb_find_iface().  Nothing is
 * changed when the requested state equals the current one; otherwise
 * the transition is logged (at DEBUG_ERR when the link was up, i.e. it
 * is going down, else at DEBUG_NOTICE) and iface->link_up is updated.
 */
3373 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3374 struct ctdb_req_control_old *c,
3377 struct ctdb_iface *info;
3378 struct ctdb_interface *iface;
3379 bool link_up = false;
3381 info = (struct ctdb_iface *)indata.dptr;
3383 if (info->name[CTDB_IFACE_SIZE] != '\0') {
/* Bound the printed name because it is known not to be terminated. */
3384 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3385 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3386 len, len, info->name));
3390 switch (info->link_state) {
3398 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3399 (unsigned int)info->link_state));
3403 if (info->references != 0) {
3404 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3405 (unsigned int)info->references));
3409 iface = ctdb_find_iface(ctdb, info->name);
3410 if (iface == NULL) {
3414 if (link_up == iface->link_up) {
3418 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3419 ("iface[%s] has changed it's link status %s => %s\n",
3421 iface->link_up?"up":"down",
3422 link_up?"up":"down"));
3424 iface->link_up = link_up;
3430 structure containing the listening socket and the list of tcp connections
3431 that the ctdb daemon is to kill
/*
 * State for killing TCP connections on one public address; it is
 * allocated as a talloc child of the vnn in
 * ctdb_killtcp_add_connection().
 *
 *  vnn         - the public address this state belongs to
 *  ctdb        - daemon context
 *  fde         - tevent fd handler that runs capture_tcp_handler()
 *  connections - tree of struct ctdb_killtcp_con keyed by killtcp_key()
 * Further members (capture_fd, private_data) are used elsewhere in the
 * file; their declarations are not visible in this listing.
 */
3433 struct ctdb_kill_tcp {
3434 struct ctdb_vnn *vnn;
3435 struct ctdb_context *ctdb;
3437 struct tevent_fd *fde;
3438 trbt_tree_t *connections;
3443 a tcp connection that is to be killed
/*
 * One connection queued for a reset: the two canonicalised endpoints
 * and a back pointer to the owning ctdb_kill_tcp.  A retry counter
 * (count) is used by tickle_connection_traverse(); its declaration is
 * not visible in this listing.
 */
3445 struct ctdb_killtcp_con {
3446 ctdb_sock_addr src_addr;
3447 ctdb_sock_addr dst_addr;
3449 struct ctdb_kill_tcp *killtcp;
3452 /* this function is used to create a key to represent this socketpair
3453 in the killtcp tree.
3454 this key is used to insert and lookup matching socketpairs that are
3455 to be tickled and RST
/*
 * Build the lookup key for a socket pair in the killtcp tree.
 *
 * The key is KILLTCP_KEYLEN (10) 32-bit words held in a function-local
 * static array that is zeroed on every call.  The buffer is therefore
 * shared between calls: presumably a pointer to it is returned (the
 * return statements are not visible here), in which case the result is
 * only valid until the next call.
 *
 * Layout: for IPv4, words 0-3 hold dst address, src address, dst port,
 * src port and the rest stay zero.  For IPv6, words 0-7 interleave the
 * four 32-bit words of the dst and src addresses (highest index first)
 * and words 8-9 hold the dst and src ports.  Values are stored as found
 * in the sockaddr; no byte-order conversion is applied.
 *
 * A family mismatch between src and dst, or an unknown family, is
 * logged as an error.
 */
3457 #define KILLTCP_KEYLEN 10
3458 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3460 static uint32_t key[KILLTCP_KEYLEN];
3462 bzero(key, sizeof(key));
3464 if (src->sa.sa_family != dst->sa.sa_family) {
3465 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3469 switch (src->sa.sa_family) {
3471 key[0] = dst->ip.sin_addr.s_addr;
3472 key[1] = src->ip.sin_addr.s_addr;
3473 key[2] = dst->ip.sin_port;
3474 key[3] = src->ip.sin_port;
/* The 16 address bytes are read as four 32-bit words through a cast. */
3477 uint32_t *dst6_addr32 =
3478 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3479 uint32_t *src6_addr32 =
3480 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3481 key[0] = dst6_addr32[3];
3482 key[1] = src6_addr32[3];
3483 key[2] = dst6_addr32[2];
3484 key[3] = src6_addr32[2];
3485 key[4] = dst6_addr32[1];
3486 key[5] = src6_addr32[1];
3487 key[6] = dst6_addr32[0];
3488 key[7] = src6_addr32[0];
3489 key[8] = dst->ip6.sin6_port;
3490 key[9] = src->ip6.sin6_port;
3494 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3502 called when we get a read event on the raw socket
/*
 * tevent fd handler for the capture socket of a ctdb_kill_tcp
 * (private_data).  Events without TEVENT_FD_READ are filtered first.
 *
 * A packet is read with ctdb_sys_read_tcp_packet(), which yields the
 * endpoints and the ack/seq numbers.  The endpoints are looked up in
 * killtcp->connections with killtcp_key(&src, &dst).  For a known
 * connection a TCP packet is sent with ctdb_sys_send_tcp() from the
 * stored dst to the stored src using the captured ack_seq/seq; the last
 * argument (1) presumably requests RST, as the log message says "tcp
 * reset" - TODO confirm against ctdb_sys_send_tcp().
 *
 * NOTE(review): the log message reads the ports through the IPv4
 * members (ip.sin_port) regardless of the address family.
 */
3504 static void capture_tcp_handler(struct tevent_context *ev,
3505 struct tevent_fd *fde,
3506 uint16_t flags, void *private_data)
3508 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3509 struct ctdb_killtcp_con *con;
3510 ctdb_sock_addr src, dst;
3511 uint32_t ack_seq, seq;
3513 if (!(flags & TEVENT_FD_READ)) {
3517 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3518 killtcp->private_data,
3520 &ack_seq, &seq) != 0) {
3521 /* probably a non-tcp ACK packet */
3525 /* check if we have this guy in our list of connections
3528 con = trbt_lookuparray32(killtcp->connections,
3529 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3531 /* no this was some other packet we can just ignore */
3535 /* This one has been tickled !
3536 now reset him and remove him from the list.
3538 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3539 ntohs(con->dst_addr.ip.sin_port),
3540 ctdb_addr_to_str(&con->src_addr),
3541 ntohs(con->src_addr.ip.sin_port)));
3543 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3548 /* when traversing the list of all tcp connections to send tickle acks to
3549 (so that we can capture the ack coming back and kill the connection
3551 this callback is called for each connection we are currently trying to kill
/*
 * Traverse callback for the killtcp connection tree, used by
 * ctdb_tickle_sentenced_connections().
 *
 *  param - talloc context that collects connections to be deleted
 *  data  - the struct ctdb_killtcp_con being visited
 *
 * A connection whose count has reached 5 is given up on: it is
 * re-parented to param so the caller can free it after the traverse.
 * Any other connection is tickled again.
 */
3553 static int tickle_connection_traverse(void *param, void *data)
3555 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3557 /* have tried too many times, just give up */
3558 if (con->count >= 5) {
3559 /* can't delete in traverse: reparent to delete_cons */
3560 talloc_steal(param, con);
3564 /* otherwise, try tickling it again */
3567 (ctdb_sock_addr *)&con->dst_addr,
3568 (ctdb_sock_addr *)&con->src_addr,
3575 called every second until all sentenced connections have been reset
/*
 * Timer handler for a ctdb_kill_tcp (private_data).
 *
 * Traverses killtcp->connections with tickle_connection_traverse(),
 * which tickles each connection or moves exhausted ones under the
 * temporary context delete_cons; freeing that context afterwards
 * deletes them outside of the traverse.  When the tree is gone or empty
 * the whole killtcp structure is freed; otherwise the handler is
 * scheduled again one second later.
 */
3577 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3578 struct tevent_timer *te,
3579 struct timeval t, void *private_data)
3581 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
/* Parentless scratch context; it is freed right after the traverse. */
3582 void *delete_cons = talloc_new(NULL);
3584 /* loop over all connections sending tickle ACKs */
3585 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3587 /* now we've finished traverse, it's safe to do deletion. */
3588 talloc_free(delete_cons);
3590 /* If there are no more connections to kill we can remove the
3591 entire killtcp structure
3593 if ( (killtcp->connections == NULL) ||
3594 (killtcp->connections->root == NULL) ) {
3595 talloc_free(killtcp);
3599 /* try tickling them again in a seconds time
3601 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3602 timeval_current_ofs(1, 0),
3603 ctdb_tickle_sentenced_connections, killtcp);
3607 destroy the killtcp structure
/*
 * talloc destructor for a struct ctdb_kill_tcp (installed in
 * ctdb_killtcp_add_connection()).
 *
 * Clears the vnn->killtcp back reference, but only after checking that
 * killtcp->vnn is still on the ctdb->vnn list and that the vnn still
 * points at this very structure.  The actions taken when either check
 * fails are not visible in this listing.
 */
3609 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3611 struct ctdb_vnn *tmpvnn;
3613 /* verify that this vnn is still active */
3614 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3615 if (tmpvnn == killtcp->vnn) {
/* Reaching the end of the list means the vnn is gone. */
3620 if (tmpvnn == NULL) {
3624 if (killtcp->vnn->killtcp != killtcp) {
3628 killtcp->vnn->killtcp = NULL;
3634 /* nothing fancy here, just unconditionally replace any existing
3635 connection structure with the new one.
3637 don't even free the old one if it did exist, that one is talloc_stolen
3638 by the same node in the tree anyway and will be deleted when the new data
/*
 * Insert callback passed to trbt_insertarray32_callback() by
 * ctdb_killtcp_add_connection(), with the new struct ctdb_killtcp_con
 * as parm.  The body is not visible in this listing; going by the
 * existing description of this function it makes the new entry replace
 * any existing one - TODO confirm.
 */
3641 static void *add_killtcp_callback(void *parm, void *data)
3647 add a tcp socket to the list of connections we want to RST
/*
 * Queue a TCP connection for a reset.
 *
 * Both endpoints are canonicalised into local copies.  The public
 * address is looked up by destination, then by source, and
 * ctdb->single_ip_vnn is considered when its address equals the
 * destination; failure to find one is logged as an error.
 *
 * On first use for a vnn a ctdb_kill_tcp is allocated as a talloc child
 * of the vnn, with capture_fd == -1, an empty connection tree and
 * ctdb_killtcp_destructor installed.  The connection is stored in the
 * tree under killtcp_key(&con->dst_addr, &con->src_addr).
 * NOTE(review): the endpoints are passed here as (dst, src), while
 * capture_tcp_handler() looks up with killtcp_key(&src, &dst) on the
 * captured packet - presumably because the captured reply travels in
 * the opposite direction; TODO confirm.
 *
 * If no capture socket exists yet it is opened on the interface of the
 * vnn with ctdb_sys_open_capture_socket().  If no fd handler exists
 * yet, one is registered for capture_tcp_handler() with auto-close, and
 * a one second timer for ctdb_tickle_sentenced_connections() is
 * started.
 *
 * The final visible lines free vnn->killtcp and reset the pointer;
 * presumably this is the shared failure path - the label and return
 * statements are not visible in this listing.
 */
3649 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3653 ctdb_sock_addr src, dst;
3654 struct ctdb_kill_tcp *killtcp;
3655 struct ctdb_killtcp_con *con;
3656 struct ctdb_vnn *vnn;
/* Work on canonicalised copies; the caller's addresses are not modified here. */
3658 ctdb_canonicalize_ip(s, &src);
3659 ctdb_canonicalize_ip(d, &dst);
3661 vnn = find_public_ip_vnn(ctdb, &dst);
3663 vnn = find_public_ip_vnn(ctdb, &src);
3666 /* if it is not a public ip it could be our 'single ip' */
3667 if (ctdb->single_ip_vnn) {
3668 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3669 vnn = ctdb->single_ip_vnn;
3674 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3678 killtcp = vnn->killtcp;
3680 /* If this is the first connection to kill we must allocate
3683 if (killtcp == NULL) {
3684 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3685 CTDB_NO_MEMORY(ctdb, killtcp);
3688 killtcp->ctdb = ctdb;
3689 killtcp->capture_fd = -1;
3690 killtcp->connections = trbt_create(killtcp, 0);
3692 vnn->killtcp = killtcp;
3693 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3698 /* create a structure that describes this connection we want to
3699 RST and store it in killtcp->connections
3701 con = talloc(killtcp, struct ctdb_killtcp_con);
3702 CTDB_NO_MEMORY(ctdb, con);
3703 con->src_addr = src;
3704 con->dst_addr = dst;
3706 con->killtcp = killtcp;
3709 trbt_insertarray32_callback(killtcp->connections,
3710 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3711 add_killtcp_callback, con);
3714 If we don't have a socket to listen on yet we must create it
3716 if (killtcp->capture_fd == -1) {
3717 const char *iface = ctdb_vnn_iface_string(vnn);
3718 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3719 if (killtcp->capture_fd == -1) {
3720 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3721 "socket on iface '%s' for killtcp (%s)\n",
3722 iface, strerror(errno)));
3728 if (killtcp->fde == NULL) {
3729 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3730 killtcp->capture_fd,
3732 capture_tcp_handler, killtcp);
3733 tevent_fd_set_auto_close(killtcp->fde);
3735 /* We also need to set up some events to tickle all these connections
3736 until they are all reset
3738 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3739 ctdb_tickle_sentenced_connections, killtcp);
3742 /* tickle him once now */
3751 talloc_free(vnn->killtcp);
3752 vnn->killtcp = NULL;
3757 kill a TCP connection.
/*
 * Control handler: treat indata as a struct ctdb_connection and queue
 * it for a reset through ctdb_killtcp_add_connection(), whose result is
 * returned unchanged.
 *
 * NOTE(review): no check of indata.dsize is visible before the cast.
 */
3759 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3761 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3763 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3767 called by a daemon to inform us of the entire list of TCP tickles for
3768 a particular public address.
3769 this control should only be sent by the node that is currently serving
3770 that public address.
/*
 * Control handler: replace the complete tickle list of one public
 * address with the list received in indata (struct
 * ctdb_tickle_list_old).
 *
 * The blob is size-checked twice: it must hold the fixed header up to
 * the connections member, and then header plus list->num connections.
 * An address that is not a public address is only logged (DEBUG_INFO).
 * Otherwise any old array of the vnn is freed and a new array holding a
 * copy of the received connections is attached to the vnn.
 *
 * NOTE(review): the old array is freed before the new one is allocated.
 * If CTDB_NO_MEMORY leaves the function on allocation failure
 * (presumably it returns), the vnn ends up with no tickle list, and a
 * tcparray whose connections allocation failed stays allocated under
 * the vnn.
 */
3772 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3774 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3775 struct ctdb_tcp_array *tcparray;
3776 struct ctdb_vnn *vnn;
3778 /* We must at least have tickles.num or else we cant verify the size
3779 of the received data blob
3781 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3782 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3786 /* verify that the size of data matches what we expect */
3787 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3788 + sizeof(struct ctdb_connection) * list->num) {
3789 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3793 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3794 ctdb_addr_to_str(&list->addr)));
3796 vnn = find_public_ip_vnn(ctdb, &list->addr);
3798 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3799 ctdb_addr_to_str(&list->addr)));
3804 /* remove any old ticklelist we might have */
3805 talloc_free(vnn->tcp_array);
3806 vnn->tcp_array = NULL;
/* Build the replacement under the vnn; it is attached only once complete. */
3808 tcparray = talloc(vnn, struct ctdb_tcp_array);
3809 CTDB_NO_MEMORY(ctdb, tcparray);
3811 tcparray->num = list->num;
3813 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3814 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3816 memcpy(tcparray->connections, &list->connections[0],
3817 sizeof(struct ctdb_connection)*tcparray->num);
3819 /* We now have a new fresh tickle list array for this vnn */
3820 vnn->tcp_array = tcparray;
3826 called to return the full list of tickles for the public address associated
3827 with the provided vnn
/*
 * Control handler: return the tickle list of the public address given
 * in indata (cast to ctdb_sock_addr) as a struct ctdb_tickle_list_old
 * in outdata.
 *
 * An address that is not a public address is logged as an error.  The
 * reply buffer is a talloc child of outdata sized for the fixed header
 * plus num connections, and the connections of the vnn are copied into
 * it.  How num is chosen when the vnn has no tcp_array, and the filling
 * of the header fields, are not visible in this listing.
 *
 * NOTE(review): no check of indata.dsize is visible before the cast.
 */
3829 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3831 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3832 struct ctdb_tickle_list_old *list;
3833 struct ctdb_tcp_array *tcparray;
3835 struct ctdb_vnn *vnn;
3837 vnn = find_public_ip_vnn(ctdb, addr);
3839 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3840 ctdb_addr_to_str(addr)));
3845 tcparray = vnn->tcp_array;
3847 num = tcparray->num;
/* Header up to the connections member, followed by num connections. */
3852 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3853 + sizeof(struct ctdb_connection) * num;
3855 outdata->dptr = talloc_size(outdata, outdata->dsize);
3856 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3857 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3862 memcpy(&list->connections[0], tcparray->connections,
3863 sizeof(struct ctdb_connection) * num);
3871 set the list of all tcp tickles for a public address
/*
 * Broadcast the complete tickle list of one public address to all
 * nodes.
 *
 *  ctdb     - daemon context
 *  addr     - the public address the list belongs to
 *  tcparray - the connections to send
 *
 * A struct ctdb_tickle_list_old sized for num connections is built in a
 * temporary buffer allocated under ctdb, sent with
 * CTDB_CONTROL_SET_TCP_TICKLE_LIST to CTDB_BROADCAST_ALL without
 * requesting a reply, and the buffer is freed afterwards.  A send
 * failure is logged.  How num is chosen for a NULL tcparray is not
 * visible in this listing.
 */
3873 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3874 ctdb_sock_addr *addr,
3875 struct ctdb_tcp_array *tcparray)
3879 struct ctdb_tickle_list_old *list;
3882 num = tcparray->num;
3887 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3888 sizeof(struct ctdb_connection) * num;
3889 data.dptr = talloc_size(ctdb, data.dsize);
3890 CTDB_NO_MEMORY(ctdb, data.dptr);
3892 list = (struct ctdb_tickle_list_old *)data.dptr;
3896 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3899 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3900 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3901 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3903 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
/* The temporary message buffer is released here. */
3907 talloc_free(data.dptr);
3914 perform tickle updates if required
/*
 * Periodic timer handler (private_data is the ctdb context).
 *
 * For every public address that is held by this node (vnn->pnn equals
 * ctdb->pnn) and is flagged tcp_update_needed, the tickle list is
 * broadcast with ctdb_send_set_tcp_tickles_for_ip().  A failure is
 * logged; the flag is cleared next to the "Sent tickle update" message,
 * i.e. presumably only after a successful send - the branch structure
 * is not visible in this listing.
 *
 * The handler then re-arms itself on ctdb->tickle_update_context after
 * tunable.tickle_update_interval seconds.
 */
3916 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3917 struct tevent_timer *te,
3918 struct timeval t, void *private_data)
3920 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3922 struct ctdb_vnn *vnn;
3924 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3925 /* we only send out updates for public addresses that
3928 if (ctdb->pnn != vnn->pnn) {
3931 /* We only send out the updates if we need to */
3932 if (!vnn->tcp_update_needed) {
3935 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3936 &vnn->public_address,
3939 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3940 ctdb_addr_to_str(&vnn->public_address)));
3943 ("Sent tickle update for public address %s\n",
3944 ctdb_addr_to_str(&vnn->public_address)));
3945 vnn->tcp_update_needed = false;
/* Schedule the next run. */
3949 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3950 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3951 ctdb_update_tcp_tickles, ctdb);
3955 start periodic update of tcp tickles
/*
 * Start the periodic tickle update: create the talloc context
 * ctdb->tickle_update_context that owns the timer and schedule the
 * first run of ctdb_update_tcp_tickles() after
 * tunable.tickle_update_interval seconds.
 *
 * NOTE(review): the result of talloc_new() is not checked in the
 * visible lines.
 */
3957 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3959 ctdb->tickle_update_context = talloc_new(ctdb);
3961 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3962 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3963 ctdb_update_tcp_tickles, ctdb);
/*
 * State of one gratuitous ARP request started by
 * ctdb_control_send_gratious_arp(): the daemon context and the address
 * to announce.  Further members (iface, count) are used by
 * send_gratious_arp(); their declarations are not visible in this
 * listing.
 */
3969 struct control_gratious_arp {
3970 struct ctdb_context *ctdb;
3971 ctdb_sock_addr addr;
3977 send a control_gratuitous arp
/*
 * Timer handler that sends one gratuitous ARP for the request in
 * private_data (struct control_gratious_arp) with ctdb_sys_send_arp().
 * A send failure is logged together with strerror(errno).
 *
 * arp->count is compared with CTDB_ARP_REPEAT (3); what happens when
 * the limit is reached is not visible here (presumably the request is
 * freed and the sequence ends).  Otherwise the handler re-arms itself
 * after CTDB_ARP_INTERVAL (1) seconds, with the timer owned by arp.
 */
3979 static void send_gratious_arp(struct tevent_context *ev,
3980 struct tevent_timer *te,
3981 struct timeval t, void *private_data)
3984 struct control_gratious_arp *arp = talloc_get_type(private_data,
3985 struct control_gratious_arp);
3987 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3989 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3990 arp->iface, strerror(errno)));
3995 if (arp->count == CTDB_ARP_REPEAT) {
4000 tevent_add_timer(arp->ctdb->ev, arp,
4001 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4002 send_gratious_arp, arp);
/*
 * Control handler: start a series of gratuitous ARPs for the address
 * and interface given in indata (struct ctdb_addr_info_old).
 *
 * The blob must hold the fixed header up to the iface member, and its
 * size is then compared with header size plus gratious_arp->len (the
 * comparison line itself is not visible in this listing).  A request
 * structure is allocated under ctdb, the address is copied, the
 * interface name is duplicated, and send_gratious_arp() is scheduled to
 * run immediately (timeval_zero()).
 *
 * NOTE(review): the interface name is duplicated with talloc_strdup(),
 * which relies on a NUL byte being present inside the received iface
 * data; no such check is visible.  Also, if CTDB_NO_MEMORY leaves the
 * function when the strdup fails, arp stays allocated under ctdb.
 */
4009 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4011 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4012 struct control_gratious_arp *arp;
4014 /* verify the size of indata */
4015 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4016 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4017 (unsigned)indata.dsize,
4018 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4022 ( offsetof(struct ctdb_addr_info_old, iface)
4023 + gratious_arp->len ) ){
4025 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4026 "but should be %u bytes\n",
4027 (unsigned)indata.dsize,
4028 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4033 arp = talloc(ctdb, struct control_gratious_arp);
4034 CTDB_NO_MEMORY(ctdb, arp);
4037 arp->addr = gratious_arp->addr;
4038 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4039 CTDB_NO_MEMORY(ctdb, arp->iface);
/* The first ARP goes out as soon as the event loop runs the timer. */
4042 tevent_add_timer(arp->ctdb->ev, arp,
4043 timeval_zero(), send_gratious_arp, arp);
/*
 * Control handler: add a public address described by a struct
 * ctdb_addr_info_old in indata.
 *
 * The blob must hold the fixed header up to the iface member, and its
 * size is then compared with header size plus pub->len (the comparison
 * line itself is not visible in this listing).  The address is logged
 * and handed to ctdb_add_public_address() together with the mask and
 * the interface string; a failure is logged.
 */
4048 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4050 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4053 /* verify the size of indata */
4054 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4055 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4059 ( offsetof(struct ctdb_addr_info_old, iface)
4062 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4063 "but should be %u bytes\n",
4064 (unsigned)indata.dsize,
4065 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4069 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4071 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4074 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
 * State carried from ctdb_control_del_public_address() to
 * delete_ip_callback(): c is the original control request, which is
 * answered from the callback.
 */
4081 struct delete_ip_callback_state {
4082 struct ctdb_req_control_old *c;
4086 called when releaseip event finishes for del_public_address
/*
 * Completion callback of the RELEASE_IP control sent by
 * ctdb_control_del_public_address().  Forwards status and errormsg of
 * the release as the reply to the stored original request, then frees
 * the callback state.
 */
4088 static void delete_ip_callback(struct ctdb_context *ctdb,
4089 int32_t status, TDB_DATA data,
4090 const char *errormsg,
4093 struct delete_ip_callback_state *state =
4094 talloc_get_type(private_data, struct delete_ip_callback_state);
4096 /* If release failed then fail. */
4097 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4098 talloc_free(private_data);
/*
 * Control handler: delete the public address described by a struct
 * ctdb_addr_info_old in indata.
 *
 * After the same two size checks as in
 * ctdb_control_add_public_address(), the vnn list is searched for the
 * address with ctdb_same_ip().
 *  - If this node currently holds the address (vnn->pnn == ctdb->pnn),
 *    the vnn is marked delete_pending and a CTDB_CONTROL_RELEASE_IP is
 *    sent carrying a struct ctdb_public_ip.  The request c is moved
 *    under the callback state and *async_reply is set, so the reply is
 *    sent later - presumably from delete_ip_callback(); the callback
 *    argument is not visible in this listing.
 *  - Otherwise the address is deleted directly with do_delete_ip().
 * An address that is not found is logged as an error.
 */
4101 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4102 struct ctdb_req_control_old *c,
4103 TDB_DATA indata, bool *async_reply)
4105 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4106 struct ctdb_vnn *vnn;
4108 /* verify the size of indata */
4109 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4110 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4114 ( offsetof(struct ctdb_addr_info_old, iface)
4117 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4118 "but should be %u bytes\n",
4119 (unsigned)indata.dsize,
4120 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4124 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4126 /* walk over all public addresses until we find a match */
4127 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4128 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4129 if (vnn->pnn == ctdb->pnn) {
4130 struct delete_ip_callback_state *state;
4131 struct ctdb_public_ip *ip;
/* Set before the release is requested; consumers of the flag are outside this listing. */
4135 vnn->delete_pending = true;
4137 state = talloc(ctdb,
4138 struct delete_ip_callback_state);
4139 CTDB_NO_MEMORY(ctdb, state);
4142 ip = talloc(state, struct ctdb_public_ip);
4145 (__location__ " Out of memory\n"));
4150 ip->addr = pub->addr;
4152 data.dsize = sizeof(struct ctdb_public_ip);
4153 data.dptr = (unsigned char *)ip;
4155 ret = ctdb_daemon_send_control(ctdb,
4158 CTDB_CONTROL_RELEASE_IP,
4165 (__location__ "Unable to send "
4166 "CTDB_CONTROL_RELEASE_IP\n"));
4171 state->c = talloc_steal(state, c);
4172 *async_reply = true;
4174 /* This IP is not hosted on the
4175 * current node so just delete it
4177 do_delete_ip(ctdb, vnn);
4184 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4185 ctdb_addr_to_str(&pub->addr)));
// State allocated by ctdb_control_ipreallocated() and received by
// ctdb_ipreallocated_callback().
4190 struct ipreallocated_callback_state {
// Control request answered by ctdb_ipreallocated_callback().
4191 struct ctdb_req_control_old *c;
// Completion callback for the ipreallocated event started by
// ctdb_control_ipreallocated().
//
// p is a struct ipreallocated_callback_state.  A failed event script is
// logged, a status of -ETIME additionally makes this node ban itself,
// and the status is then sent as the reply to the stored control request.
4194 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4195 int status, void *p)
4197 struct ipreallocated_callback_state *state =
4198 talloc_get_type(p, struct ipreallocated_callback_state);
4202 (" \"ipreallocated\" event script failed (status %d)\n",
// -ETIME is presumably the timeout status of the event script - confirm.
4204 if (status == -ETIME) {
4205 ctdb_ban_self(ctdb);
// The script status becomes the status of the control reply.
4209 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4213 /* A control to run the ipreallocated event */
/*
 * Allocates the callback state on ctdb, starts the
 * CTDB_EVENT_IPREALLOCATED event script with
 * ctdb_ipreallocated_callback() as its completion handler, and defers
 * the reply to c by setting *async_reply to true.
 */
4214 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4215 struct ctdb_req_control_old *c,
4219 struct ipreallocated_callback_state *state;
4221 state = talloc(ctdb, struct ipreallocated_callback_state);
4222 CTDB_NO_MEMORY(ctdb, state);
4224 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
/* state is passed twice: right after ctdb (presumably as the talloc
 * parent of the event run - confirm) and as the callback argument. */
4226 ret = ctdb_event_script_callback(ctdb, state,
4227 ctdb_ipreallocated_callback, state,
4228 CTDB_EVENT_IPREALLOCATED,
4232 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4237 /* tell the control that we will reply asynchronously */
/* Reparent the request onto state; the callback replies through state->c. */
4238 state->c = talloc_steal(state, c);
4239 *async_reply = true;
/*
 * Compare the public IP list reported by node pnn with the expected
 * allocation held in ctdb->ip_tree.
 *
 * A NULL tree is handled up front.  Otherwise every address in ips is
 * looked up in the tree: an address without an entry is logged as new or
 * unknown, entries where either side has pnn equal to -1 are tested
 * separately, and a remaining pnn difference is logged as an
 * inconsistent allocation.
 */
4245 /* This function is called from the recovery daemon to verify that a remote
4246 node has the expected ip allocation.
4247 This is verified against ctdb->ip_tree
4249 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4250 struct ctdb_public_ip_list_old *ips,
4253 struct public_ip_list *tmp_ip;
4256 if (ctdb->ip_tree == NULL) {
4257 /* don't know the expected allocation yet, assume remote node
4266 for (i=0; i<ips->num; i++) {
4267 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4268 if (tmp_ip == NULL) {
4269 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4273 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4277 if (tmp_ip->pnn != ips->ips[i].pnn) {
4279 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4281 ctdb_addr_to_str(&ips->ips[i].addr),
4282 ips->ips[i].pnn, tmp_ip->pnn));
// Record in ctdb->ip_tree that ip->addr is now assigned to node ip->pnn.
//
// The disable_ip_failover tunable is checked first because the tree is
// not built in that mode.  A missing tree or a missing entry for the
// address is logged as an error; otherwise the pnn stored in the entry
// is overwritten with ip->pnn.
4290 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4292 struct public_ip_list *tmp_ip;
4294 /* IP tree is never built if DisableIPFailover is set */
4295 if (ctdb->tunable.disable_ip_failover != 0) {
4299 if (ctdb->ip_tree == NULL) {
4300 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
/* Look up the tree entry keyed by the address. */
4304 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4305 if (tmp_ip == NULL) {
4306 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
/* Log the previous owner before it is overwritten on the next line. */
4310 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4311 tmp_ip->pnn = ip->pnn;
/*
 * Discard the IP assignment tree.  TALLOC_FREE() frees ctdb->ip_tree and
 * resets the pointer to NULL, the state that update_ip_assignment_tree()
 * and verify_remote_ip_allocation() test for.
 */
4316 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4318 TALLOC_FREE(ctdb->ip_tree);
/*
 * Bookkeeping for one reloadips run started by
 * ctdb_control_reload_public_ips(); its talloc destructor is
 * ctdb_reloadips_destructor().
 */
4321 struct ctdb_reloadips_handle {
/* Daemon context the run belongs to. */
4322 struct ctdb_context *ctdb;
/* Control request that is answered when the handle is destroyed. */
4323 struct ctdb_req_control_old *c;
/* Read event on the pipe coming from the child process. */
4327 struct tevent_fd *fde;
/*
 * talloc destructor for a reloadips handle.
 *
 * Clears ctdb->reload_ips if it still refers to this handle, sends
 * h->status as the reply to the stored control request, and kills the
 * child process with SIGKILL.
 */
4330 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
/* Only reset the context pointer when it still refers to this handle. */
4332 if (h == h->ctdb->reload_ips) {
4333 h->ctdb->reload_ips = NULL;
4336 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4339 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
 * Timer callback registered by ctdb_control_reload_public_ips() with a
 * 120 second timeout.  private_data is the struct ctdb_reloadips_handle
 * of the run the timer belongs to.
 */
4343 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4344 struct tevent_timer *te,
4345 struct timeval t, void *private_data)
4347 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
 * fd event callback for the read end of the pipe from the reloadips
 * child.  private_data is the struct ctdb_reloadips_handle.
 *
 * Reads the single result byte that the child writes with sys_write().
 * A short read or a non-zero byte is logged as a child failure.
 */
4352 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4353 struct tevent_fd *fde,
4354 uint16_t flags, void *private_data)
4356 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/* One byte is expected; zero means the child succeeded. */
4361 ret = sys_read(h->fd[0], &res, 1);
4362 if (ret < 1 || res != 0) {
4363 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
 * Body of the reloadips child process, called from
 * ctdb_control_reload_public_ips() after the fork.
 *
 * 1. Fetch the public IPs currently known to the local node.
 * 2. Re-read the public addresses file; the loops below read the result
 *    back through ctdb->vnn.
 * 3. For every node IP that has no matching vnn, send
 *    CTDB_CONTROL_DEL_PUBLIC_IP to the local node.
 * 4. For every vnn whose address the node does not have yet, send
 *    CTDB_CONTROL_ADD_PUBLIC_IP to the local node; a
 *    CTDB_SRVID_REBALANCE_NODE message carrying our pnn is broadcast to
 *    connected nodes in this phase as well.
 * 5. Wait for all controls collected in async_data.
 *
 * Temporary allocations hang off mem_ctx, which is freed on the failure
 * paths visible below and before returning.
 */
4371 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4373 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4374 struct ctdb_public_ip_list_old *ips;
4375 struct ctdb_vnn *vnn;
4376 struct client_async_data *async_data;
4377 struct timeval timeout;
4379 struct ctdb_client_control_state *state;
4383 CTDB_NO_MEMORY(ctdb, mem_ctx);
4385 /* Read IPs from local node */
4386 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4387 CTDB_CURRENT_NODE, mem_ctx, &ips);
4390 ("Unable to fetch public IPs from local node\n"));
4391 talloc_free(mem_ctx);
4395 /* Read IPs file - this is safe since this is a child process */
4397 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4398 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4399 talloc_free(mem_ctx);
/* Collects the controls sent below so they can be waited for together. */
4403 async_data = talloc_zero(mem_ctx, struct client_async_data);
4404 CTDB_NO_MEMORY(ctdb, async_data);
4406 /* Compare IPs between node and file for IPs to be deleted */
4407 for (i = 0; i < ips->num; i++) {
4409 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4410 if (ctdb_same_ip(&vnn->public_address,
4411 &ips->ips[i].addr)) {
4412 /* IP is still in file */
4418 /* Delete IP ips->ips[i] */
4419 struct ctdb_addr_info_old *pub;
4422 ("IP %s no longer configured, deleting it\n",
4423 ctdb_addr_to_str(&ips->ips[i].addr)));
/* Zero-initialised request; only the address is filled in here. */
4425 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4426 CTDB_NO_MEMORY(ctdb, pub);
4428 pub->addr = ips->ips[i].addr;
4432 timeout = TAKEOVER_TIMEOUT();
4434 data.dsize = offsetof(struct ctdb_addr_info_old,
4436 data.dptr = (uint8_t *)pub;
4438 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4439 CTDB_CONTROL_DEL_PUBLIC_IP,
4440 0, data, async_data,
4442 if (state == NULL) {
4445 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4449 ctdb_client_async_add(async_data, state);
4453 /* Compare IPs between node and file for IPs to be added */
4455 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4456 for (i = 0; i < ips->num; i++) {
4457 if (ctdb_same_ip(&vnn->public_address,
4458 &ips->ips[i].addr)) {
4459 /* IP already on node */
/* The inner loop ran to completion, so no node IP matched this vnn. */
4463 if (i == ips->num) {
4464 /* Add IP vnn->public_address, which the node does not have yet */
4465 struct ctdb_addr_info_old *pub;
4466 const char *ifaces = NULL;
4471 ("New IP %s configured, adding it\n",
4472 ctdb_addr_to_str(&vnn->public_address)));
/* Broadcast our pnn with CTDB_SRVID_REBALANCE_NODE; per the warning
 * below this is meant to force a reallocation, and a send failure is
 * not treated as fatal. */
4474 uint32_t pnn = ctdb_get_pnn(ctdb);
4476 data.dsize = sizeof(pnn);
4477 data.dptr = (uint8_t *)&pnn;
4479 ret = ctdb_client_send_message(
4481 CTDB_BROADCAST_CONNECTED,
4482 CTDB_SRVID_REBALANCE_NODE,
4485 DEBUG(DEBUG_WARNING,
4486 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/* Build a comma-separated list of the interface names of this vnn.
 * NOTE(review): the talloc_asprintf() result is not checked for NULL
 * before strlen() below, and the string is parented to vnn rather than
 * mem_ctx - confirm whether that is intended. */
4492 ifaces = vnn->ifaces[0];
4494 while (vnn->ifaces[iface] != NULL) {
4495 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4496 vnn->ifaces[iface]);
/* len includes the terminating NUL of the interface list. */
4500 len = strlen(ifaces) + 1;
4501 pub = talloc_zero_size(mem_ctx,
4502 offsetof(struct ctdb_addr_info_old, iface) + len);
4503 CTDB_NO_MEMORY(ctdb, pub);
4505 pub->addr = vnn->public_address;
4506 pub->mask = vnn->public_netmask_bits;
4508 memcpy(&pub->iface[0], ifaces, pub->len);
4510 timeout = TAKEOVER_TIMEOUT();
4512 data.dsize = offsetof(struct ctdb_addr_info_old,
4514 data.dptr = (uint8_t *)pub;
4516 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4517 CTDB_CONTROL_ADD_PUBLIC_IP,
4518 0, data, async_data,
4520 if (state == NULL) {
4523 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4527 ctdb_client_async_add(async_data, state);
/* Wait for the add and delete controls queued above. */
4531 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4532 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4536 talloc_free(mem_ctx);
4540 talloc_free(mem_ctx);
/*
 * Control handler for reloading the public IP configuration.
 *
 * A reload that is already in progress is cancelled by freeing its
 * handle.  A new handle with a pipe is created and a child is forked.
 * The child switches to client mode, runs ctdb_reloadips_child(), writes
 * the one-byte result to the pipe and then loops for as long as the
 * parent process still exists.  The parent reparents c onto the handle,
 * installs ctdb_reloadips_destructor(), watches the read end of the pipe
 * with ctdb_reloadips_child_handler(), arms a 120 second timer and
 * defers the reply by setting *async_reply to true.
 *
 * NOTE(review): the pipe failure message mentions ctdb_freeze_lock,
 * which looks like a copy/paste leftover.
 * NOTE(review): the tevent_add_fd() result is handed to
 * tevent_fd_set_auto_close() without a NULL check.
 */
4544 /* This control is sent to force the node to re-read the public addresses file
4545 and drop any addresses we should no longer host, and add new addresses
4546 that we are now able to host
4548 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4550 struct ctdb_reloadips_handle *h;
4551 pid_t parent = getpid();
4553 if (ctdb->reload_ips != NULL) {
4554 talloc_free(ctdb->reload_ips);
4555 ctdb->reload_ips = NULL;
4558 h = talloc(ctdb, struct ctdb_reloadips_handle);
4559 CTDB_NO_MEMORY(ctdb, h);
4564 if (pipe(h->fd) == -1) {
4565 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4570 h->child = ctdb_fork(ctdb);
4571 if (h->child == (pid_t)-1) {
4572 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4580 if (h->child == 0) {
4581 signed char res = 0;
4584 debug_extra = talloc_asprintf(NULL, "reloadips:");
4586 prctl_set_comment("ctdb_reloadips");
4587 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4588 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4591 res = ctdb_reloadips_child(ctdb);
4593 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4597 sys_write(h->fd[1], &res, 1);
4598 /* make sure we die when our parent dies */
4599 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* Parent side: keep the request alive as a talloc child of the handle. */
4605 h->c = talloc_steal(h, c);
4608 set_close_on_exec(h->fd[0]);
/* From here on, freeing h replies to c and kills the child. */
4610 talloc_set_destructor(h, ctdb_reloadips_destructor);
4613 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4614 ctdb_reloadips_child_handler, (void *)h);
/* The read end of the pipe is closed together with the fd event. */
4615 tevent_fd_set_auto_close(h->fde);
/* The timer is a talloc child of h, so it goes away with the handle. */
4617 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4618 ctdb_reloadips_timeout_event, h);
4620 /* we reply later */
4621 *async_reply = true;