4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
/* Deadline helper for takeover operations: the current time offset by
 * the takeover_timeout tunable (presumably seconds - confirm against
 * timeval_current_ofs()).  The expansion refers to a variable named
 * "ctdb", which must be in scope wherever the macro is used. */
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
/* First argument given to timeval_current_ofs() when
 * ctdb_control_send_arp() re-arms its timer. */
47 #define CTDB_ARP_INTERVAL 1
/* Value that ctdb_control_send_arp() compares arp->count against. */
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
/* Selects the IP allocation algorithm; stored in the "algorithm" member
 * of struct ipalloc_state.  Only the enumerators visible in this
 * listing are shown. */
56 enum ipalloc_algorithm {
57 IPALLOC_DETERMINISTIC,
58 IPALLOC_NONDETERMINISTIC,
/* Working state shared by the IP allocation helpers in this file.
 * NOTE(review): ctdb_reload_remote_public_ips() also reads a "num"
 * member that is not visible in this listing. */
62 struct ipalloc_state {
65 /* Arrays with data for each node */
/* Per-node list of known public IPs, filled in by
 * ctdb_reload_remote_public_ips(). */
66 struct ctdb_public_ip_list_old **known_public_ips;
/* Per-node list fetched with CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE;
 * consulted by can_node_host_ip(). */
67 struct ctdb_public_ip_list_old **available_public_ips;
/* Which allocation algorithm to run. */
69 enum ipalloc_algorithm algorithm;
70 uint32_t no_ip_failback;
/* One local network interface, kept on the ctdb->ifaces doubly-linked
 * list.  Code in this file also uses the members "name", "references"
 * and "link_up", whose declarations are not visible in this listing. */
73 struct ctdb_interface {
/* List linkage used by DLIST_ADD()/DLIST_REMOVE(). */
74 struct ctdb_interface *prev, *next;
/* Return the name of the interface currently assigned to a VNN; used
 * for log messages and as an event-script argument.
 * NOTE(review): the only visible statement dereferences vnn->iface,
 * yet ctdb_vnn_unassign_iface() calls this helper right next to an
 * explicit "vnn->iface ? ... : 0" test, so a NULL-iface guard
 * presumably exists in lines not shown here - confirm. */
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
83 return vnn->iface->name;
/* Make sure an interface named "iface" has an entry on ctdb->ifaces.
 *
 * The list is first searched for an entry with the same name (what
 * happens on a match is not visible in this listing - presumably an
 * early successful return).  Otherwise a zeroed struct ctdb_interface
 * is allocated as a talloc child of ctdb, the name is duplicated as a
 * child of the new entry, and the entry is linked into the list.
 *
 * Failure to allocate the entry goes through CTDB_NO_MEMORY_FATAL;
 * failure to duplicate the name goes through CTDB_NO_MEMORY. */
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
91 struct ctdb_interface *i;
93 /* Verify that we don't have an entry for this interface yet */
94 for (i=ctdb->ifaces;i;i=i->next) {
95 if (strcmp(i->name, iface) == 0) {
100 /* create a new structure for this interface */
101 i = talloc_zero(ctdb, struct ctdb_interface);
102 CTDB_NO_MEMORY_FATAL(ctdb, i);
103 i->name = talloc_strdup(i, iface);
104 CTDB_NO_MEMORY(ctdb, i->name);
108 DLIST_ADD(ctdb->ifaces, i);
/* Check whether "name" appears in the NULL-terminated vnn->ifaces array
 * of interface names.  The return statements are not visible in this
 * listing; presumably true on the first match and false otherwise. */
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
118 for (n = 0; vnn->ifaces[n] != NULL; n++) {
119 if (strcmp(name, vnn->ifaces[n]) == 0) {
127 /* If any interfaces now have no possible IPs then delete them. This
128 * implementation is naive (i.e. simple) rather than clever
129 * (i.e. complex). Given that this is run on delip and that operation
130 * is rare, this doesn't need to be efficient - it needs to be
131 * foolproof. One alternative is reference counting, where the logic
132 * is distributed and can, therefore, be broken in multiple places.
133 * Another alternative is to build a red-black tree of interfaces that
134 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135 * once) and then walking ctdb->ifaces once and deleting those not in
136 * the tree. Let's go to one of those if the naive implementation
137 * causes problems... :-)
/* Drop from ctdb->ifaces the interfaces named by "vnn" that nothing
 * else uses any more.
 *
 * ctdb: daemon context owning the ifaces and vnn lists.
 * vnn:  VNN whose interface names limit which entries are considered.
 *       do_delete_ip() calls this after unlinking vnn from ctdb->vnn.
 *
 * An interface is kept when it is the first interface of
 * ctdb->single_ip_vnn or when any VNN on ctdb->vnn names it. */
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140 struct ctdb_vnn *vnn)
/* "next" lets the walk continue after the current entry is unlinked;
 * presumably it is saved at the top of the loop body (not shown). */
142 struct ctdb_interface *i, *next;
144 /* For each interface, check if there's an IP using it. */
145 for (i = ctdb->ifaces; i != NULL; i = next) {
150 /* Only consider interfaces named in the given VNN. */
151 if (!vnn_has_interface_with_name(vnn, i->name)) {
155 /* Is the "single IP" on this interface? */
156 if ((ctdb->single_ip_vnn != NULL) &&
157 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159 /* Found, next interface please... */
162 /* Search for a vnn with this interface. */
164 for (tv=ctdb->vnn; tv; tv=tv->next) {
165 if (vnn_has_interface_with_name(tv, i->name)) {
172 /* None of the VNNs are using this interface. */
173 DLIST_REMOVE(ctdb->ifaces, i);
/* Look up an interface by name on the ctdb->ifaces list.
 * The return statements are not visible in this listing; callers use
 * the result as the matching entry and ctdb_set_single_public_ip()
 * logs "Can not find public interface" after calling it, so it
 * presumably returns NULL when nothing matches. */
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
183 struct ctdb_interface *i;
185 for (i=ctdb->ifaces;i;i=i->next) {
186 if (strcmp(i->name, iface) == 0) {
/* Pick the preferred interface for a VNN from its configured
 * vnn->ifaces names.  Each name is resolved with ctdb_find_iface();
 * the visible comparison prefers a candidate with fewer references
 * than the best one found so far.
 * NOTE(review): any filtering between the lookup and the comparison
 * (for example on a missing entry or link state) is not visible in
 * this listing.  Callers treat a NULL result as "no usable
 * interface". */
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195 struct ctdb_vnn *vnn)
198 struct ctdb_interface *cur = NULL;
199 struct ctdb_interface *best = NULL;
201 for (i=0; vnn->ifaces[i]; i++) {
203 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
217 if (cur->references < best->references) {
/* Bind a VNN to one of its interfaces and mark this node as its owner.
 *
 * The visible steps are: log that the address is "still assigned" to
 * an interface (the guarding condition is not shown), ask
 * ctdb_vnn_best_iface() for a candidate, log an error when no
 * interface can be assigned, set vnn->pnn to ctdb->pnn and log the new
 * assignment.
 *
 * Returns an int32_t status; ctdb_do_takeip() and ctdb_do_updateip()
 * log a failure based on it.  The exact values are not visible here. */
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227 struct ctdb_vnn *vnn)
229 struct ctdb_interface *best = NULL;
232 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233 "still assigned to iface '%s'\n",
234 ctdb_addr_to_str(&vnn->public_address),
235 ctdb_vnn_iface_string(vnn)));
239 best = ctdb_vnn_best_iface(ctdb, vnn);
241 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242 "cannot assign to iface any iface\n",
243 ctdb_addr_to_str(&vnn->public_address)));
/* From here on this node is recorded as the holder of the address. */
249 vnn->pnn = ctdb->pnn;
251 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252 "now assigned to iface '%s' refs[%d]\n",
253 ctdb_addr_to_str(&vnn->public_address),
254 ctdb_vnn_iface_string(vnn),
/* Detach a VNN from its current interface: log the change and drop the
 * interface's reference count.
 * NOTE(review): the log call tests vnn->iface for NULL but the visible
 * decrement does not; a guard presumably sits on a line not shown in
 * this listing - confirm.  What happens when vnn->pnn equals ctdb->pnn
 * is also not visible. */
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260 struct ctdb_vnn *vnn)
262 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263 "now unassigned (old iface '%s' refs[%d])\n",
264 ctdb_addr_to_str(&vnn->public_address),
265 ctdb_vnn_iface_string(vnn),
266 vnn->iface?vnn->iface->references:0));
268 vnn->iface->references--;
271 if (vnn->pnn == ctdb->pnn) {
/* Decide whether this node can currently host the given VNN.
 *
 * Visible checks, in order: the daemon must be in
 * CTDB_RUNSTATE_RUNNING, the VNN must not have delete_pending set, the
 * assigned interface (if any) is tested for link_up, and finally each
 * configured interface name is resolved with ctdb_find_iface().
 * The outcome of each branch is not visible in this listing. */
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277 struct ctdb_vnn *vnn)
281 /* Nodes that are not RUNNING can not host IPs */
282 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
286 if (vnn->delete_pending) {
290 if (vnn->iface && vnn->iface->link_up) {
294 for (i=0; vnn->ifaces[i]; i++) {
295 struct ctdb_interface *cur;
297 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
/* Private data for the ctdb_control_send_arp() timer handler.
 * That handler also reads "addr" and "count" members, which are not
 * visible in this listing. */
310 struct ctdb_takeover_arp {
311 struct ctdb_context *ctdb;
/* Connections to send tickle ACKs for; taken over from
 * vnn->tcp_array in ctdb_announce_vnn_iface(). */
314 struct ctdb_tcp_array *tcparray;
/* VNN being announced; supplies the interface name and timer context. */
315 struct ctdb_vnn *vnn;
320 lists of tcp endpoints
/* Doubly-linked list node holding one TCP connection. */
322 struct ctdb_tcp_list {
323 struct ctdb_tcp_list *prev, *next;
324 struct ctdb_connection connection;
328 list of clients to kill on IP release
/* Doubly-linked list node on ctdb->client_ip_list, walked by
 * release_kill_clients().  That function also reads an "addr" member
 * that is not visible in this listing. */
330 struct ctdb_client_ip {
331 struct ctdb_client_ip *prev, *next;
332 struct ctdb_context *ctdb;
339 send a gratuitous arp
/* tevent timer handler that announces a taken-over address.
 *
 * private_data is a struct ctdb_takeover_arp.  Each run sends an ARP
 * for arp->addr on the VNN's interface, then sends a TCP "tickle ack"
 * for every connection in arp->tcparray.  Failures of either send are
 * logged at DEBUG_CRIT.  arp->count is compared with CTDB_ARP_REPEAT
 * (the branch body is not shown) and the handler re-arms itself on
 * arp->vnn->takeover_ctx after CTDB_ARP_INTERVAL seconds plus 100000
 * microseconds.
 *
 * NOTE(review): the loop dereferences tcparray; a NULL check
 * presumably exists on a line not shown in this listing - confirm. */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342 struct tevent_timer *te,
343 struct timeval t, void *private_data)
345 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
346 struct ctdb_takeover_arp);
348 struct ctdb_tcp_array *tcparray;
349 const char *iface = ctdb_vnn_iface_string(arp->vnn);
351 ret = ctdb_sys_send_arp(&arp->addr, iface);
353 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354 iface, strerror(errno)));
357 tcparray = arp->tcparray;
359 for (i=0;i<tcparray->num;i++) {
360 struct ctdb_connection *tcon;
362 tcon = &tcparray->connections[i];
363 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364 (unsigned)ntohs(tcon->dst.ip.sin_port),
365 ctdb_addr_to_str(&tcon->src),
366 (unsigned)ntohs(tcon->src.ip.sin_port)));
367 ret = ctdb_sys_send_tcp(
372 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373 ctdb_addr_to_str(&tcon->src)));
380 if (arp->count == CTDB_ARP_REPEAT) {
/* Schedule the next round; the timer is owned by takeover_ctx, so
 * freeing that context (see ctdb_control_release_ip()) cancels it. */
385 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387 ctdb_control_send_arp, arp);
/* Start announcing that this node now holds vnn's public address.
 *
 * Creates vnn->takeover_ctx on demand, allocates a zeroed
 * struct ctdb_takeover_arp under it, copies the public address into
 * it, moves vnn->tcp_array into it, flags the VNN as needing a tcp
 * update and schedules ctdb_control_send_arp() to run immediately.
 *
 * Returns an int32_t status that callers test for failure; the exact
 * values are not visible in this listing. */
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391 struct ctdb_vnn *vnn)
393 struct ctdb_takeover_arp *arp;
394 struct ctdb_tcp_array *tcparray;
/* takeover_ctx parents the arp state and its timers. */
396 if (!vnn->takeover_ctx) {
397 vnn->takeover_ctx = talloc_new(vnn);
398 if (!vnn->takeover_ctx) {
403 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
409 arp->addr = vnn->public_address;
412 tcparray = vnn->tcp_array;
414 /* add all of the known tcp connections for this IP to the
415 list of tcp connections to send tickle acks for */
416 arp->tcparray = talloc_steal(arp, tcparray);
/* Ownership moved to arp above, so the VNN no longer has an array. */
418 vnn->tcp_array = NULL;
419 vnn->tcp_update_needed = true;
/* A zero timeval makes the first announcement run right away. */
422 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423 timeval_zero(), ctdb_control_send_arp, arp);
/* State carried from ctdb_control_release_ip() to
 * release_ip_callback(). */
428 struct takeover_callback_state {
/* Pending control request; answered via ctdb_request_control_reply(). */
429 struct ctdb_req_control_old *c;
/* Copy of the address being released, allocated under this state. */
430 ctdb_sock_addr *addr;
/* VNN being released; the destructor tolerates NULL here. */
431 struct ctdb_vnn *vnn;
/* State carried from ctdb_do_takeip() to ctdb_do_takeip_callback(). */
434 struct ctdb_do_takeip_state {
/* Pending control request to reply to. */
435 struct ctdb_req_control_old *c;
/* VNN being taken over. */
436 struct ctdb_vnn *vnn;
440 called when takeip event finishes
/* Completion handler for the takeip event started by ctdb_do_takeip().
 *
 * status: event result; -ETIME is tested separately (handling not
 *         shown in this listing).
 * private_data: struct ctdb_do_takeip_state.
 *
 * On the visible failure path the error is logged, the control is
 * answered with status and this node is flagged
 * NODE_FLAGS_UNHEALTHY.  On success, when do_checkpublicip is set the
 * address is announced via ctdb_announce_vnn_iface() (a failure there
 * answers the control with -1); then the address string is sent to
 * this node as a CTDB_SRVID_TAKE_IP message and the control is
 * answered with 0. */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
445 struct ctdb_do_takeip_state *state =
446 talloc_get_type(private_data, struct ctdb_do_takeip_state);
451 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
453 if (status == -ETIME) {
456 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457 ctdb_addr_to_str(&state->vnn->public_address),
458 ctdb_vnn_iface_string(state->vnn)));
459 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
461 node->flags |= NODE_FLAGS_UNHEALTHY;
466 if (ctdb->do_checkpublicip) {
468 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
470 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
/* Message payload is the address string including its NUL. */
477 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478 data.dsize = strlen((char *)data.dptr) + 1;
479 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
481 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
484 /* the control succeeded */
485 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
/* talloc destructor for struct ctdb_do_takeip_state: clears the VNN's
 * update_in_flight flag so another update can be accepted. */
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
492 state->vnn->update_in_flight = false;
497 take over an ip address
/* Begin taking over a public address on this node.
 *
 * ctdb: daemon context.
 * c:    control request to answer once the event completes; it is
 *       re-parented to ctdb with talloc_steal().
 * vnn:  VNN describing the address.
 *
 * The request is rejected (with a log message) while another update
 * for the same VNN is in flight.  Otherwise an interface is assigned,
 * a state object is allocated under vnn with a destructor that clears
 * update_in_flight, and an event script is started with
 * ctdb_do_takeip_callback() as completion handler.  Failures are
 * logged; the return values are not visible in this listing. */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500 struct ctdb_req_control_old *c,
501 struct ctdb_vnn *vnn)
504 struct ctdb_do_takeip_state *state;
506 if (vnn->update_in_flight) {
507 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508 "update for this IP already in flight\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits));
514 ret = ctdb_vnn_assign_iface(ctdb, vnn);
516 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517 "assign a usable interface\n",
518 ctdb_addr_to_str(&vnn->public_address),
519 vnn->public_netmask_bits));
523 state = talloc(vnn, struct ctdb_do_takeip_state);
524 CTDB_NO_MEMORY(ctdb, state);
526 state->c = talloc_steal(ctdb, c);
/* Flag is cleared again by ctdb_takeip_destructor() when state is
 * freed. */
529 vnn->update_in_flight = true;
530 talloc_set_destructor(state, ctdb_takeip_destructor);
532 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533 ctdb_addr_to_str(&vnn->public_address),
534 vnn->public_netmask_bits,
535 ctdb_vnn_iface_string(vnn)));
537 ret = ctdb_event_script_callback(ctdb,
539 ctdb_do_takeip_callback,
543 ctdb_vnn_iface_string(vnn),
544 ctdb_addr_to_str(&vnn->public_address),
545 vnn->public_netmask_bits);
548 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549 ctdb_addr_to_str(&vnn->public_address),
550 ctdb_vnn_iface_string(vnn)));
/* State carried from ctdb_do_updateip() to
 * ctdb_do_updateip_callback(). */
558 struct ctdb_do_updateip_state {
/* Pending control request to reply to. */
559 struct ctdb_req_control_old *c;
/* Interface the address was on before the move; restored on failure. */
560 struct ctdb_interface *old;
/* VNN being moved. */
561 struct ctdb_vnn *vnn;
565 called when updateip event finishes
/* Completion handler for the updateip event started by
 * ctdb_do_updateip().
 *
 * status: event result; -ETIME is tested separately (handling not
 *         shown in this listing).
 * private_data: struct ctdb_do_updateip_state.
 *
 * On the visible failure path the error is logged, the VNN is detached
 * from the new interface and re-attached to state->old (taking a
 * reference on it again), and the control is answered with status.
 * On success, when do_checkpublicip is set the address is announced
 * (a failure there answers the control with -1); then the control is
 * answered with 0. */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
570 struct ctdb_do_updateip_state *state =
571 talloc_get_type(private_data, struct ctdb_do_updateip_state);
575 if (status == -ETIME) {
578 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579 ctdb_addr_to_str(&state->vnn->public_address),
581 ctdb_vnn_iface_string(state->vnn)));
584 * All we can do is reset the old interface
585 * and let the next run fix it
587 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588 state->vnn->iface = state->old;
589 state->vnn->iface->references++;
591 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
596 if (ctdb->do_checkpublicip) {
598 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
600 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
607 /* the control succeeded */
608 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
/* talloc destructor for struct ctdb_do_updateip_state: clears the
 * VNN's update_in_flight flag so another update can be accepted. */
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
615 state->vnn->update_in_flight = false;
620 update (move) an ip address
/* Move a public address that this node already holds from its current
 * interface to a newly chosen one.
 *
 * ctdb: daemon context.
 * c:    control request to answer; re-parented to ctdb once the event
 *       is going to be run.
 * vnn:  VNN describing the address; vnn->iface on entry is remembered
 *       as the old interface.
 *
 * The request is rejected while another update for the VNN is in
 * flight.  The VNN is unassigned and reassigned; if the old and new
 * interface names are equal the control is answered with 0 straight
 * away and no event script is run.  Otherwise a state object with a
 * destructor that clears update_in_flight is allocated under vnn and
 * CTDB_EVENT_UPDATE_IP is started with ctdb_do_updateip_callback() as
 * completion handler.  Return values are not visible in this listing.
 *
 * NOTE(review): "old" is dereferenced without a visible NULL check;
 * presumably the caller guarantees vnn->iface is set - confirm. */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623 struct ctdb_req_control_old *c,
624 struct ctdb_vnn *vnn)
627 struct ctdb_do_updateip_state *state;
628 struct ctdb_interface *old = vnn->iface;
629 const char *new_name;
631 if (vnn->update_in_flight) {
632 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633 "update for this IP already in flight\n",
634 ctdb_addr_to_str(&vnn->public_address),
635 vnn->public_netmask_bits));
/* Drop the current binding first so that the assignment below is free
 * to choose a different interface. */
639 ctdb_vnn_unassign_iface(ctdb, vnn);
640 ret = ctdb_vnn_assign_iface(ctdb, vnn);
642 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643 "assin a usable interface (old iface '%s')\n",
644 ctdb_addr_to_str(&vnn->public_address),
645 vnn->public_netmask_bits,
650 new_name = ctdb_vnn_iface_string(vnn);
651 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652 /* A benign update from one interface onto itself.
653 * no need to run the eventscripts in this case, just return
656 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
660 state = talloc(vnn, struct ctdb_do_updateip_state);
661 CTDB_NO_MEMORY(ctdb, state);
663 state->c = talloc_steal(ctdb, c);
667 vnn->update_in_flight = true;
668 talloc_set_destructor(state, ctdb_updateip_destructor);
670 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671 "interface %s to %s\n",
672 ctdb_addr_to_str(&vnn->public_address),
673 vnn->public_netmask_bits,
677 ret = ctdb_event_script_callback(ctdb,
679 ctdb_do_updateip_callback,
681 CTDB_EVENT_UPDATE_IP,
685 ctdb_addr_to_str(&vnn->public_address),
686 vnn->public_netmask_bits);
688 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689 ctdb_addr_to_str(&vnn->public_address),
690 old->name, new_name));
699 /* Find the vnn that describes the given public ip address;
700 returns NULL if the address is not known as a public address */
/* Search ctdb->vnn for the entry whose public_address has the same IP
 * as "addr" (compared with ctdb_same_ip()).  The return statements are
 * not visible in this listing; the function returns a pointer, and
 * callers report "not a public address" when the lookup fails. */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
704 struct ctdb_vnn *vnn;
706 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707 if (ctdb_same_ip(&vnn->public_address, addr)) {
716 take over an ip address
/* Control handler: take over the public address described by indata.
 *
 * indata.dptr is interpreted as a struct ctdb_public_ip.  The request
 * must be addressed to this node (pip->pnn == ctdb->pnn) and name an
 * address that find_public_ip_vnn() knows.
 *
 * When IP failover is enabled and do_checkpublicip is set, the kernel
 * is asked whether the address is already configured (have_ip).  The
 * combination of have_ip, vnn->iface, vnn->pnn and the best candidate
 * interface then selects one of: a fresh takeover (ctdb_do_takeip()),
 * a move between interfaces (ctdb_do_updateip()), or a logged
 * "redundant takeover" with nothing to do.  Inconsistent states are
 * logged at DEBUG_CRIT.  The branch bodies that set the decision
 * flags and the return values are not visible in this listing.
 *
 * NOTE(review): two log strings are built from adjacent literals
 * without a joining space, so they print "failed to finda usable
 * interface" and "node %dand we are node %d". */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719 struct ctdb_req_control_old *c,
724 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725 struct ctdb_vnn *vnn;
726 bool have_ip = false;
727 bool do_updateip = false;
728 bool do_takeip = false;
729 struct ctdb_interface *best_iface = NULL;
731 if (pip->pnn != ctdb->pnn) {
732 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733 "with pnn %d, but we're node %d\n",
734 ctdb_addr_to_str(&pip->addr),
735 pip->pnn, ctdb->pnn));
739 /* update our vnn list */
740 vnn = find_public_ip_vnn(ctdb, &pip->addr);
742 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743 ctdb_addr_to_str(&pip->addr)));
/* have_ip stays false unless the kernel check is enabled. */
747 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748 have_ip = ctdb_sys_have_ip(&pip->addr);
750 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751 if (best_iface == NULL) {
752 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753 "a usable interface (old %s, have_ip %d)\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 vnn->public_netmask_bits,
756 ctdb_vnn_iface_string(vnn),
761 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
767 if (vnn->iface == NULL && have_ip) {
768 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770 ctdb_addr_to_str(&vnn->public_address)));
774 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776 "and we have it on iface[%s], but it was assigned to node %d"
777 "and we are node %d, banning ourself\n",
778 ctdb_addr_to_str(&vnn->public_address),
779 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
784 if (vnn->pnn == -1 && have_ip) {
785 vnn->pnn = ctdb->pnn;
786 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787 "and we already have it on iface[%s], update local daemon\n",
788 ctdb_addr_to_str(&vnn->public_address),
789 ctdb_vnn_iface_string(vnn)));
794 if (vnn->iface != best_iface) {
795 if (!vnn->iface->link_up) {
797 } else if (vnn->iface->references > (best_iface->references + 1)) {
798 /* only move when the rebalance gains something */
806 ctdb_vnn_unassign_iface(ctdb, vnn);
813 ret = ctdb_do_takeip(ctdb, c, vnn);
817 } else if (do_updateip) {
818 ret = ctdb_do_updateip(ctdb, c, vnn);
824 * The interface is up and the kernel known the ip
827 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828 ctdb_addr_to_str(&pip->addr),
829 vnn->public_netmask_bits,
830 ctdb_vnn_iface_string(vnn)));
834 /* tell ctdb_control.c that we will be replying asynchronously */
841 /* kill any clients that are registered with an IP that is being released */
/* Send SIGKILL to every client process registered against "addr".
 *
 * Walks ctdb->client_ip_list; for each entry whose address matches
 * addr, the client is looked up with reqid_find() and, if it has a
 * non-zero pid, killed.  Each step is logged at DEBUG_INFO.
 * NOTE(review): tmp_addr is compared without a visible assignment; it
 * is presumably copied from ip->addr on a line not shown here.  The
 * result of reqid_find() is dereferenced without a visible NULL
 * check - confirm. */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
845 struct ctdb_client_ip *ip;
847 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848 ctdb_addr_to_str(addr)));
850 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851 ctdb_sock_addr tmp_addr;
854 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
856 ctdb_addr_to_str(&ip->addr)));
858 if (ctdb_same_ip(&tmp_addr, addr)) {
859 struct ctdb_client *client = reqid_find(ctdb->idr,
862 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
864 ctdb_addr_to_str(&ip->addr),
867 if (client->pid != 0) {
868 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869 (unsigned)client->pid,
870 ctdb_addr_to_str(addr),
872 kill(client->pid, SIGKILL);
/* Remove a VNN from this node's configuration: unlink it from
 * ctdb->vnn, release its interface reference, then prune interfaces
 * that no remaining VNN uses.  The VNN must be unlinked first so that
 * ctdb_remove_orphaned_ifaces() does not count it as a user.
 * Whether the vnn memory is freed here is not visible in this
 * listing. */
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
880 DLIST_REMOVE(ctdb->vnn, vnn);
881 ctdb_vnn_unassign_iface(ctdb, vnn);
882 ctdb_remove_orphaned_ifaces(ctdb, vnn);
887 called when releaseip event finishes
/* Completion handler for the releaseip event started by
 * ctdb_control_release_ip().
 *
 * status: event result; -ETIME is tested separately (handling not
 *         shown in this listing).
 * private_data: struct takeover_callback_state.
 *
 * When IP failover is enabled and do_checkpublicip is set, the release
 * is treated as failed if the kernel still has the address.
 * Otherwise local clients are told via CTDB_SRVID_RELEASE_IP, clients
 * registered on the address are killed, the VNN is detached from its
 * interface, a VNN marked delete_pending is deleted, and the control
 * is answered with 0. */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status,
892 struct takeover_callback_state *state =
893 talloc_get_type(private_data, struct takeover_callback_state);
896 if (status == -ETIME) {
900 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901 if (ctdb_sys_have_ip(state->addr)) {
903 ("IP %s still hosted during release IP callback, failing\n",
904 ctdb_addr_to_str(state->addr)));
905 ctdb_request_control_reply(ctdb, state->c,
912 /* send a message to all clients of this node telling them
913 that the cluster has been reconfigured and they should
914 release any sockets on this IP */
915 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917 data.dsize = strlen((char *)data.dptr)+1;
919 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
921 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
923 /* kill clients that have registered with this IP */
924 release_kill_clients(ctdb, state->addr);
926 ctdb_vnn_unassign_iface(ctdb, state->vnn);
928 /* Process the IP if it has been marked for deletion */
929 if (state->vnn->delete_pending) {
930 do_delete_ip(ctdb, state->vnn);
934 /* the control succeeded */
935 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
/* talloc destructor for struct takeover_callback_state: clears the
 * VNN's update_in_flight flag.  state->vnn may be NULL, in which case
 * nothing is touched. */
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
941 if (state->vnn != NULL) {
942 state->vnn->update_in_flight = false;
948 release an ip address
/* Control handler: release the public address described by indata.
 *
 * indata.dptr is interpreted as a struct ctdb_public_ip.  Unknown
 * addresses are logged and ignored.  Pending ARP announcements for
 * the VNN are cancelled by freeing vnn->takeover_ctx.
 *
 * The release is treated as redundant when the kernel does not hold
 * the address (only checked when IP failover is enabled and
 * do_checkpublicip is set) or when the VNN has no interface.  It is
 * rejected while another update for the VNN is in flight.
 *
 * Otherwise a struct takeover_callback_state is allocated under ctdb,
 * takes ownership of the request "c" and a copy of the address, gets
 * a destructor that clears update_in_flight, and
 * CTDB_EVENT_RELEASE_IP is started with release_ip_callback() as
 * completion handler.  *async_reply is set to true because the reply
 * is sent from that callback.  Return values are not visible in this
 * listing.
 *
 * NOTE(review): the strdup() result stored in "iface" is not checked
 * in the visible lines, and where it is freed is not shown. */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
951 struct ctdb_req_control_old *c,
956 struct takeover_callback_state *state;
957 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958 struct ctdb_vnn *vnn;
961 /* update our vnn list */
962 vnn = find_public_ip_vnn(ctdb, &pip->addr);
964 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965 ctdb_addr_to_str(&pip->addr)));
970 /* stop any previous arps */
971 talloc_free(vnn->takeover_ctx);
972 vnn->takeover_ctx = NULL;
974 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975 * lazy multicast to drop an IP from any node that isn't the
976 * intended new node. The following causes makes ctdbd ignore
977 * a release for any address it doesn't host.
979 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980 if (!ctdb_sys_have_ip(&pip->addr)) {
981 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982 ctdb_addr_to_str(&pip->addr),
983 vnn->public_netmask_bits,
984 ctdb_vnn_iface_string(vnn)));
985 ctdb_vnn_unassign_iface(ctdb, vnn);
989 if (vnn->iface == NULL) {
990 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991 ctdb_addr_to_str(&pip->addr),
992 vnn->public_netmask_bits));
997 /* There is a potential race between take_ip and us because we
998 * update the VNN via a callback that run when the
999 * eventscripts have been run. Avoid the race by allowing one
1000 * update to be in flight at a time.
1002 if (vnn->update_in_flight) {
1003 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004 "update for this IP already in flight\n",
1005 ctdb_addr_to_str(&vnn->public_address),
1006 vnn->public_netmask_bits));
1010 iface = strdup(ctdb_vnn_iface_string(vnn));
1012 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1013 ctdb_addr_to_str(&pip->addr),
1014 vnn->public_netmask_bits,
1018 state = talloc(ctdb, struct takeover_callback_state);
1019 if (state == NULL) {
1020 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021 __FILE__, __LINE__);
1026 state->c = talloc_steal(state, c);
1027 state->addr = talloc(state, ctdb_sock_addr);
1028 if (state->addr == NULL) {
1029 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030 __FILE__, __LINE__);
1035 *state->addr = pip->addr;
1038 vnn->update_in_flight = true;
1039 talloc_set_destructor(state, ctdb_releaseip_destructor);
1041 ret = ctdb_event_script_callback(ctdb,
1042 state, release_ip_callback, state,
1043 CTDB_EVENT_RELEASE_IP,
1046 ctdb_addr_to_str(&pip->addr),
1047 vnn->public_netmask_bits);
1050 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051 ctdb_addr_to_str(&pip->addr),
1052 ctdb_vnn_iface_string(vnn)));
1057 /* tell the control that we will be reply asynchronously */
1058 *async_reply = true;
/* Add one public address to this node's configuration.
 *
 * ctdb:   daemon context; the new VNN is linked onto ctdb->vnn.
 * addr:   the public address.
 * mask:   netmask length in bits, stored as public_netmask_bits.
 * ifaces: comma-separated list of candidate interface names.
 * The final parameter (tested as "check_address") is declared on a
 * line not visible in this listing.
 *
 * Every named interface must exist according to
 * ctdb_sys_check_iface_exists(), and the address must not already be
 * on ctdb->vnn.  A zeroed VNN is then built with a NULL-terminated
 * copy of the interface names.  With check_address set, an address
 * the kernel already holds gets vnn->pnn set to this node.  Each
 * interface is registered with ctdb_add_local_iface() before the VNN
 * is linked in.  Return values are not visible in this listing.
 *
 * NOTE(review): in the visible lines the strdup() result and the
 * initial talloc_array() result are used without a check, and
 * strtok() is not reentrant.  How "num" advances is not shown. */
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063 ctdb_sock_addr *addr,
1064 unsigned mask, const char *ifaces,
1067 struct ctdb_vnn *vnn;
/* strtok() modifies its input, so work on a private copy. */
1074 tmp = strdup(ifaces);
1075 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076 if (!ctdb_sys_check_iface_exists(iface)) {
1077 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1084 /* Verify that we don't have an entry for this ip yet */
1085 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1088 ctdb_addr_to_str(addr)));
1093 /* create a new vnn structure for this ip address */
1094 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097 tmp = talloc_strdup(vnn, ifaces);
1098 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
/* The array is resized for each name so that there is always room for
 * the entry being added plus the terminating NULL. */
1099 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1107 vnn->ifaces[num] = NULL;
1108 vnn->public_address = *addr;
1109 vnn->public_netmask_bits = mask;
1111 if (check_address) {
1112 if (ctdb_sys_have_ip(addr)) {
1113 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114 vnn->pnn = ctdb->pnn;
1118 for (i=0; vnn->ifaces[i]; i++) {
1119 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122 "for public_address[%s]\n",
1123 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1129 DLIST_ADD(ctdb->vnn, vnn);
1135 setup the public address lists from a file
/* Load the public address list from ctdb->public_addresses_file.
 *
 * ctdb:            daemon context.
 * check_addresses: passed through to ctdb_add_public_address().
 *
 * Trailing empty lines are dropped from the count.  For each
 * remaining line, leading spaces and tabs are skipped and empty lines
 * are tested for (the action is not shown).  The line is split on
 * spaces and tabs into an address token and an optional interface
 * list; when no interface list is given, ctdb->default_public_interface
 * is used, and a missing default is logged as critical.  The address
 * is parsed with parse_ip_mask() and added with
 * ctdb_add_public_address(); failures of either are logged with the
 * 1-based line number.  Return values are not visible in this
 * listing. */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1143 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144 if (lines == NULL) {
1145 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1148 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1152 for (i=0;i<nlines;i++) {
1154 ctdb_sock_addr addr;
1155 const char *addrstr;
1160 while ((*line == ' ') || (*line == '\t')) {
1166 if (strcmp(line, "") == 0) {
1169 tok = strtok(line, " \t");
1171 tok = strtok(NULL, " \t");
1173 if (NULL == ctdb->default_public_interface) {
1174 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1179 ifaces = ctdb->default_public_interface;
1184 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1189 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
/* Configure the daemon's "single public IP" VNN.
 *
 * The remaining parameters (used below as "iface" and "ip") are
 * declared on lines not visible in this listing.
 *
 * A zeroed VNN is allocated under ctdb with a two-slot,
 * NULL-terminated interface array holding a copy of iface.  The
 * address is parsed with parse_ip(), the interface is registered with
 * ctdb_add_local_iface(), looked up again and marked link_up, and the
 * VNN is bound to it with ctdb_vnn_assign_iface().  On success the VNN
 * is stored in ctdb->single_ip_vnn.  Error handling after each step
 * and the return values are only partly visible here. */
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1205 struct ctdb_vnn *svnn;
1206 struct ctdb_interface *cur = NULL;
1210 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211 CTDB_NO_MEMORY(ctdb, svnn);
1213 svnn->ifaces = talloc_array(svnn, const char *, 2);
1214 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217 svnn->ifaces[1] = NULL;
1219 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1225 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1227 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228 "for single_ip[%s]\n",
1230 ctdb_addr_to_str(&svnn->public_address)));
1235 /* assume the single public ip interface is initially "good" */
1236 cur = ctdb_find_iface(ctdb, iface);
1238 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1241 cur->link_up = true;
1243 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1249 ctdb->single_ip_vnn = svnn;
/* Singly-linked list of public addresses used by the allocation code.
 * Code in this file also reads and writes a "pnn" member (the node the
 * address is assigned to, -1 when unassigned) whose declaration is not
 * visible in this listing. */
1253 struct public_ip_list {
1254 struct public_ip_list *next;
1256 ctdb_sock_addr addr;
1259 /* Given a physical node, return the number of
1260 public addresses that are currently assigned to this node. */
/* Walk the "ips" list looking at entries whose pnn equals the given
 * node.  The counter and the return statement are not visible in this
 * listing; find_takeover_node() uses the result as the number of
 * addresses the node currently serves. */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1266 for (;ips;ips=ips->next) {
1267 if (ips->pnn == pnn) {
1275 /* Can the given node host the given IP: is the public IP known to the
1276 * node and is NOIPHOST unset?
/* Decide whether a node may host the given address.
 *
 * ipalloc_state: supplies the per-node available_public_ips lists.
 * pnn (declared on a line not shown): index of the node to test.
 * ipflags: the node's flags; noiphost is tested first.
 * ip: the address in question.
 *
 * After the noiphost and NULL-list tests, the node's available list is
 * searched for an entry with the same IP.  The return statements are
 * not visible in this listing. */
1278 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1280 struct ctdb_ipflags ipflags,
1281 struct public_ip_list *ip)
1283 struct ctdb_public_ip_list_old *public_ips;
1286 if (ipflags.noiphost) {
1290 public_ips = ipalloc_state->available_public_ips[pnn];
1292 if (public_ips == NULL) {
1296 for (i=0; i<public_ips->num; i++) {
1297 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1298 /* yes, this node can serve this public ip */
/* Decide whether a node may newly take over the given address: tests
 * the node's noiptakeover flag (branch body not shown) and otherwise
 * defers to can_node_host_ip() with the same arguments. */
1306 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1308 struct ctdb_ipflags ipflags,
1309 struct public_ip_list *ip)
1311 if (ipflags.noiptakeover) {
1315 return can_node_host_ip(ipalloc_state, pnn, ipflags, ip);
1318 /* Search the node list for a node to take over this ip.
1319 Pick the node that is currently serving the fewest ips
1320 so that the ips get spread out evenly. */
/* Choose a node to take over "ip".
 *
 * ipalloc_state: allocation state passed to can_node_takeover_ip().
 * ipflags: talloc array with one entry per node; its length gives the
 *          number of nodes to consider.
 * ip:      the address needing a host.
 * all_ips: full address list, used to count each candidate's load via
 *          node_ip_coverage().
 *
 * Nodes that cannot take the address are skipped.  A warning is
 * logged when no node qualifies.  The selection of the minimum, the
 * assignment to ip and the return values are not visible in this
 * listing. */
1322 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1323 struct ctdb_ipflags *ipflags,
1324 struct public_ip_list *ip,
1325 struct public_ip_list *all_ips)
1327 int pnn, min=0, num;
1330 numnodes = talloc_array_length(ipflags);
1332 for (i=0; i<numnodes; i++) {
1333 /* verify that this node can serve this ip */
1334 if (!can_node_takeover_ip(ipalloc_state, i, ipflags[i], ip)) {
1335 /* no it couldnt so skip to the next node */
1339 num = node_ip_coverage(i, all_ips);
1340 /* was this the first node we checked ? */
1352 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1353 ctdb_addr_to_str(&ip->addr)));
/* Build an IP_KEYLEN-word tree key for an address.
 *
 * The key is zeroed first.  In the branch that reads ip->ip.sin_addr
 * only key[3] is set; in the branch that reads ip->ip6.sin6_addr all
 * four words are set.  Each word is passed through htonl().  Any
 * other address family is logged as an error.
 *
 * The key lives in a function-local static buffer, so each call
 * overwrites the previous result; ip_distance() copies the first key
 * out before asking for a second one.  Not reentrant.
 *
 * NOTE(review): the s6_addr byte array is read through a uint32_t
 * pointer cast, which relies on suitable alignment and on the
 * compiler tolerating the aliasing - confirm. */
1363 static uint32_t *ip_key(ctdb_sock_addr *ip)
1365 static uint32_t key[IP_KEYLEN];
1367 bzero(key, sizeof(key));
1369 switch (ip->sa.sa_family) {
1371 key[3] = htonl(ip->ip.sin_addr.s_addr);
1374 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1375 key[0] = htonl(s6_a32[0]);
1376 key[1] = htonl(s6_a32[1]);
1377 key[2] = htonl(s6_a32[2]);
1378 key[3] = htonl(s6_a32[3]);
1382 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
/* Callback for inserting an address into the IP tree.
 *
 * parm: the new struct public_ip_list entry.
 * data: the entry already stored under the same key, or NULL.
 *
 * When an earlier entry exists and the new one has no node assigned
 * (pnn == -1), the earlier entry's pnn is carried over.  The return
 * statements are not visible in this listing. */
1389 static void *add_ip_callback(void *parm, void *data)
1391 struct public_ip_list *this_ip = parm;
1392 struct public_ip_list *prev_ip = data;
1394 if (prev_ip == NULL) {
1397 if (this_ip->pnn == -1) {
1398 this_ip->pnn = prev_ip->pnn;
/* Tree traversal callback used by create_merged_ip_list().
 *
 * param: address of the list head (struct public_ip_list **).
 * data:  the tree entry being visited.
 *
 * The entry's next pointer is set to the current head; updating the
 * head and the return value happen on lines not visible in this
 * listing. */
1404 static int getips_count_callback(void *param, void *data)
1406 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1407 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1409 new_ip->next = *ip_list;
/* Forward declaration; the function is called from
 * ctdb_reload_remote_public_ips() when do_checkpublicip is set.  The
 * rest of the parameter list is not visible in this listing. */
1414 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1415 struct ctdb_public_ip_list_old *ips,
/* Refresh the per-node public IP lists held in ipalloc_state.
 *
 * ctdb:          daemon context.
 * ipalloc_state: receives known_public_ips[] and
 *                available_public_ips[] for each node.
 * nodemap:       nodes to query; its size must equal
 *                ipalloc_state->num or the call is rejected with a
 *                logged error.
 *
 * Nodes flagged NODE_FLAGS_INACTIVE are tested for (presumably
 * skipped - the branch body is not shown).  For each other node the
 * known list is fetched, optionally checked with
 * verify_remote_ip_allocation() when do_checkpublicip is set, and then
 * the available list is fetched with
 * CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE.  Fetch failures are logged with
 * the node number.  Return values are not visible in this listing. */
1418 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1419 struct ipalloc_state *ipalloc_state,
1420 struct ctdb_node_map_old *nodemap)
1425 if (ipalloc_state->num != nodemap->num) {
1428 " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1429 ipalloc_state->num, nodemap->num));
1433 for (j=0; j<nodemap->num; j++) {
1434 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1438 /* Retrieve the list of known public IPs from the node */
1439 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1444 &ipalloc_state->known_public_ips[j]);
1447 ("Failed to read known public IPs from node: %u\n",
1452 if (ctdb->do_checkpublicip) {
1453 verify_remote_ip_allocation(ctdb,
1454 ipalloc_state->known_public_ips[j],
1458 /* Retrieve the list of available public IPs from the node */
1459 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1463 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1464 &ipalloc_state->available_public_ips[j]);
1467 ("Failed to read available public IPs from node: %u\n",
/* Merge every node's known public IPs into one list.
 *
 * ctdb->ip_tree is freed and recreated, then each entry of each
 * node's known_public_ips list is inserted keyed by ip_key().  Nodes
 * flagged NODE_FLAGS_DELETED and nodes without a list are tested for
 * (presumably skipped - the branch bodies are not shown).  An entry's
 * pnn is only trusted when the reporting node claims to host the
 * address itself.  Finally the tree is traversed with
 * getips_count_callback() to build the returned list.
 *
 * The list entries are talloc children of ctdb->ip_tree.  The
 * initialisation of ip_list and the return statement are not visible
 * in this listing. */
1476 static struct public_ip_list *
1477 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1480 struct public_ip_list *ip_list;
1481 struct ctdb_public_ip_list_old *public_ips;
1483 TALLOC_FREE(ctdb->ip_tree);
1484 ctdb->ip_tree = trbt_create(ctdb, 0);
1486 for (i=0; i < ctdb->num_nodes; i++) {
1487 public_ips = ipalloc_state->known_public_ips[i];
1489 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1493 /* there were no public ips for this node */
1494 if (public_ips == NULL) {
1498 for (j=0; j < public_ips->num; j++) {
1499 struct public_ip_list *tmp_ip;
1501 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1502 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503 /* Do not use information about IP addresses hosted
1504 * on other nodes, it may not be accurate */
1505 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506 tmp_ip->pnn = public_ips->ips[j].pnn;
1510 tmp_ip->addr = public_ips->ips[j].addr;
1511 tmp_ip->next = NULL;
1513 trbt_insertarray32_callback(ctdb->ip_tree,
1514 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1521 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1527 * This is the length of the longest common prefix between the IPs.
1528 * It is calculated by XOR-ing the 2 IPs together and counting the
1529 * number of leading zeroes. The implementation means that all
1530 * addresses end up being 128 bits long.
1532 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534 * lots of nodes and IP addresses?
/*
 * Distance metric between two addresses, accumulated over the
 * IP_KEYLEN 32-bit words of their ip_key() representations.
 *
 * Each pair of words is XOR-ed and the leading zero bits of the result
 * are counted by testing bit 31 and shifting.  The mask is written as
 * 1U << 31: shifting a signed int 1 into the sign bit is undefined
 * behaviour in C, whereas the unsigned form yields the same 0x80000000
 * mask portably.
 *
 * The key of ip1 is copied into a local array before the second key is
 * used.
 * NOTE(review): presumably ip_key() hands back shared storage that the
 * second call would overwrite - confirm.
 *
 * Returns the accumulated distance.
 */
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1538 uint32_t ip1_k[IP_KEYLEN];
1543 uint32_t distance = 0;
1545 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1547 for (i=0; i<IP_KEYLEN; i++) {
1548 x = ip1_k[i] ^ t[i];
1552 /* Count number of leading zeroes.
1553 * FIXME? This could be optimised...
1555 while ((x & (1U << 31)) == 0) {
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566 given node. The ips argument is generally the all_ips variable
1567 used in the main part of the algorithm.
/*
 * Sum of squared ip_distance() values between ip and the entries of the
 * list ips, with entries selected by comparing t->pnn against pnn.
 *
 * The entry whose addr member is ip itself (compared by pointer, not by
 * value) does not contribute, so the same function can be used to cost
 * both adding an address to a node and removing one from it.
 *
 * NOTE(review): the third parameter and the return statement are not
 * visible here; presumably the parameter is the node number pnn and
 * the accumulated sum is returned - confirm.
 */
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570 struct public_ip_list *ips,
1573 struct public_ip_list *t;
1578 for (t=ips; t != NULL; t=t->next) {
1579 if (t->pnn != pnn) {
1583 /* Optimisation: We never calculate the distance
1584 * between an address and itself. This allows us to
1585 * calculate the effect of removing an address from a
1586 * node by simply calculating the distance between
1587 * that address and all of the existing addresses.
1588 * Moreover, we assume that we're only ever dealing
1589 * with addresses from all_ips so we can identify an
1590 * address via a pointer rather than doing a more
1591 * expensive address comparison. */
1592 if (&(t->addr) == ip) {
1596 d = ip_distance(ip, &(t->addr));
1597 sum += d * d; /* Cheaper than pulling in math.h :-) */
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
/*
 * LCP2 imbalance of node pnn: for every address in all_ips selected by
 * the pnn comparison, add ip_distance_2_sum() of that address against
 * the remainder of the list (t->next).  Passing only the tail means
 * each pair of addresses is visited once rather than twice.
 *
 * NOTE(review): the return statement is not visible; presumably the
 * accumulated imbalance is returned - confirm.
 */
1606 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1608 struct public_ip_list *t;
1610 uint32_t imbalance = 0;
1612 for (t=all_ips; t!=NULL; t=t->next) {
1613 if (t->pnn != pnn) {
1616 /* Pass the rest of the IPs rather than the whole
1619 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626 * finding the best node for each.
/*
 * Walk all_ips and, for every address whose pnn is -1 (unassigned),
 * ask find_takeover_node() to pick a node for it.  A DEBUG_WARNING
 * naming the address is logged when no node could be found.
 */
1628 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1629 struct ctdb_ipflags *ipflags,
1630 struct public_ip_list *all_ips)
1632 struct public_ip_list *tmp_ip;
1634 /* loop over all ip's and find a physical node to cover for
1637 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638 if (tmp_ip->pnn == -1) {
1639 if (find_takeover_node(ipalloc_state, ipflags,
1641 DEBUG(DEBUG_WARNING,
1642 ("Failed to find node to cover ip %s\n",
1643 ctdb_addr_to_str(&tmp_ip->addr)));
1649 /* Basic non-deterministic rebalancing algorithm.
/*
 * Count-based rebalancing pass over all_ips.
 *
 * For each assigned address, every node that passes
 * can_node_takeover_ip() is examined and its current address count is
 * obtained from node_ip_coverage(); the nodes with the highest and
 * lowest counts are tracked as maxnode/minnode.  When the spread
 * exceeds one (maxnum > minnum+1) and fewer than num_ips + 5 retries
 * have been used, one address currently on maxnode is handed back to
 * find_takeover_node() for reassignment.  A warning is logged when no
 * candidate node exists for an address.
 *
 * The number of nodes is taken from talloc_array_length(ipflags).
 *
 * NOTE(review): the fourth parameter is not visible here; the body uses
 * num_ips, presumably the total number of public addresses - confirm.
 */
1651 static void basic_failback(struct ipalloc_state *ipalloc_state,
1652 struct ctdb_ipflags *ipflags,
1653 struct public_ip_list *all_ips,
1657 int maxnode, maxnum, minnode, minnum, num, retries;
1658 struct public_ip_list *tmp_ip;
1660 numnodes = talloc_array_length(ipflags);
1667 /* for each ip address, loop over all nodes that can serve
1668 this ip and make sure that the difference between the node
1669 serving the most and the node serving the least ip's are
1672 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1673 if (tmp_ip->pnn == -1) {
1677 /* Get the highest and lowest number of ips's served by any
1678 valid node which can serve this ip.
1682 for (i=0; i<numnodes; i++) {
1683 /* only check nodes that can actually serve this ip */
1684 if (!can_node_takeover_ip(ipalloc_state, i,
1685 ipflags[i], tmp_ip)) {
1686 /* no it couldnt so skip to the next node */
1690 num = node_ip_coverage(i, all_ips);
1691 if (maxnode == -1) {
1700 if (minnode == -1) {
1710 if (maxnode == -1) {
1711 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1712 ctdb_addr_to_str(&tmp_ip->addr)));
1717 /* if the spread between the smallest and largest coverage by
1718 a node is >=2 we steal one of the ips from the node with
1719 most coverage to even things out a bit.
1720 try to do this a limited number of times since we dont
1721 want to spend too much time balancing the ip coverage.
1723 if ( (maxnum > minnum+1)
1724 && (retries < (num_ips + 5)) ){
1725 struct public_ip_list *tmp;
1727 /* Reassign one of maxnode's VNNs */
1728 for (tmp=all_ips;tmp;tmp=tmp->next) {
1729 if (tmp->pnn == maxnode) {
1730 (void)find_takeover_node(ipalloc_state,
/*
 * Prepare the per-node working arrays used by the LCP2 algorithm.
 *
 * Two arrays with one slot per node (talloc_array_length(ipflags)) are
 * allocated on tmp_ctx and handed back through the out parameters:
 *   *lcp2_imbalances      - lcp2_imbalance(all_ips, i) for each node i
 *   *rebalance_candidates - whether node i may receive failed-back IPs
 * Both allocations are checked with CTDB_NO_MEMORY_FATAL.
 *
 * Candidate selection happens in three steps: every node starts as a
 * candidate; every node that currently holds at least one address is
 * removed; every node listed in force_rebalance_nodes (when that array
 * is non-NULL) is added back.  Entries of force_rebalance_nodes that
 * are >= the number of nodes are reported as "unknown node".
 */
1742 static void lcp2_init(TALLOC_CTX *tmp_ctx,
1743 struct ctdb_ipflags *ipflags,
1744 struct public_ip_list *all_ips,
1745 uint32_t *force_rebalance_nodes,
1746 uint32_t **lcp2_imbalances,
1747 bool **rebalance_candidates)
1750 struct public_ip_list *tmp_ip;
1752 numnodes = talloc_array_length(ipflags);
1754 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1755 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1756 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1757 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1759 for (i=0; i<numnodes; i++) {
1760 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1761 /* First step: assume all nodes are candidates */
1762 (*rebalance_candidates)[i] = true;
1765 /* 2nd step: if a node has IPs assigned then it must have been
1766 * healthy before, so we remove it from consideration. This
1767 * is overkill but is all we have because we don't maintain
1768 * state between takeover runs. An alternative would be to
1769 * keep state and invalidate it every time the recovery master
1772 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1773 if (tmp_ip->pnn != -1) {
1774 (*rebalance_candidates)[tmp_ip->pnn] = false;
1778 /* 3rd step: if a node is forced to re-balance then
1779 we allow failback onto the node */
1780 if (force_rebalance_nodes == NULL) {
1783 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1784 uint32_t pnn = force_rebalance_nodes[i];
1785 if (pnn >= numnodes) {
1787 (__location__ "unknown node %u\n", pnn));
1792 ("Forcing rebalancing of IPs to node %u\n", pnn));
1793 (*rebalance_candidates)[pnn] = true;
1797 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1798 * the IP/node combination that will cost the least.
/*
 * Assign unassigned addresses (pnn == -1) one at a time using LCP2
 * costs.
 *
 * Each pass of the outer loop considers every unassigned address
 * against every node accepted by can_node_takeover_ip().  The cost of
 * a placement is ip_distance_2_sum() of the address against the
 * addresses already on that node; the placement with the smallest cost
 * seen so far is remembered.  When a placement was found, the address
 * is given to that node and lcp2_imbalances[] for the node is updated
 * to the new imbalance.  The list is then rescanned to see whether any
 * address is still unassigned.
 *
 * The loop runs while have_unassigned and should_loop are both true.
 * Once the loop is done, a warning is logged for every address that
 * remains unassigned.
 *
 * NOTE(review): should_loop is cleared at the top of each pass; the
 * statement that sets it again (presumably after a successful
 * assignment) is not visible here - confirm.
 */
1800 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1801 struct ctdb_ipflags *ipflags,
1802 struct public_ip_list *all_ips,
1803 uint32_t *lcp2_imbalances)
1805 struct public_ip_list *tmp_ip;
1806 int dstnode, numnodes;
1809 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1810 struct public_ip_list *minip;
1812 bool should_loop = true;
1813 bool have_unassigned = true;
1815 numnodes = talloc_array_length(ipflags);
1817 while (have_unassigned && should_loop) {
1818 should_loop = false;
1820 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1821 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1827 /* loop over each unassigned ip. */
1828 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1829 if (tmp_ip->pnn != -1) {
1833 for (dstnode=0; dstnode<numnodes; dstnode++) {
1834 /* only check nodes that can actually takeover this ip */
1835 if (!can_node_takeover_ip(ipalloc_state,
1839 /* no it couldnt so skip to the next node */
1843 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1844 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1845 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1846 ctdb_addr_to_str(&(tmp_ip->addr)),
1848 dstimbl - lcp2_imbalances[dstnode]));
1851 if ((minnode == -1) || (dstdsum < mindsum)) {
1861 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1863 /* If we found one then assign it to the given node. */
1864 if (minnode != -1) {
1865 minip->pnn = minnode;
1866 lcp2_imbalances[minnode] = minimbl;
1867 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1868 ctdb_addr_to_str(&(minip->addr)),
1873 /* There might be a better way but at least this is clear. */
1874 have_unassigned = false;
1875 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876 if (tmp_ip->pnn == -1) {
1877 have_unassigned = true;
1882 /* We know if we have an unassigned addresses so we might as
1885 if (have_unassigned) {
1886 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1887 if (tmp_ip->pnn == -1) {
1888 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1889 ctdb_addr_to_str(&tmp_ip->addr)));
1895 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1896 * to move IPs from, determines the best IP/destination node
1897 * combination to move from the source node.
/*
 * Try to move one address away from srcnode.
 *
 * For every address on srcnode:
 *   srcdsum - what the address costs srcnode (ip_distance_2_sum)
 *   srcimbl - srcnode's imbalance with the address removed
 * and for every node that is flagged in rebalance_candidates and
 * accepted by can_node_takeover_ip():
 *   dstdsum - what the address would cost the destination
 *   dstimbl - the destination's imbalance with the address added
 *
 * A move is only considered when the destination ends up less
 * imbalanced than srcnode currently is (dstimbl <
 * lcp2_imbalances[srcnode]) and the address is cheaper there than on
 * srcnode (dstdsum < srcdsum).  Among such moves the one with the
 * smallest srcimbl + dstimbl wins.
 *
 * When a move was found, both entries of lcp2_imbalances[] are updated
 * and the address's pnn is set to the destination node.
 *
 * NOTE(review): the srcnode parameter and the return statements are
 * not visible here; presumably true is returned when an address was
 * moved and false otherwise - confirm.
 */
1899 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1900 struct ctdb_ipflags *ipflags,
1901 struct public_ip_list *all_ips,
1903 uint32_t *lcp2_imbalances,
1904 bool *rebalance_candidates)
1906 int dstnode, mindstnode, numnodes;
1907 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1908 uint32_t minsrcimbl, mindstimbl;
1909 struct public_ip_list *minip;
1910 struct public_ip_list *tmp_ip;
1912 /* Find an IP and destination node that best reduces imbalance. */
1919 numnodes = talloc_array_length(ipflags);
1921 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1922 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1923 srcnode, lcp2_imbalances[srcnode]));
1925 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1926 /* Only consider addresses on srcnode. */
1927 if (tmp_ip->pnn != srcnode) {
1931 /* What is this IP address costing the source node? */
1932 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1933 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1935 /* Consider this IP address would cost each potential
1936 * destination node. Destination nodes are limited to
1937 * those that are newly healthy, since we don't want
1938 * to do gratuitous failover of IPs just to make minor
1939 * balance improvements.
1941 for (dstnode=0; dstnode<numnodes; dstnode++) {
1942 if (!rebalance_candidates[dstnode]) {
1946 /* only check nodes that can actually takeover this ip */
1947 if (!can_node_takeover_ip(ipalloc_state, dstnode,
1948 ipflags[dstnode], tmp_ip)) {
1949 /* no it couldnt so skip to the next node */
1953 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1954 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1955 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1957 ctdb_addr_to_str(&(tmp_ip->addr)),
1960 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1961 (dstdsum < srcdsum) && \
1962 ((mindstnode == -1) || \
1963 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1966 minsrcimbl = srcimbl;
1967 mindstnode = dstnode;
1968 mindstimbl = dstimbl;
1972 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1974 if (mindstnode != -1) {
1975 /* We found a move that makes things better... */
1976 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1977 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1978 ctdb_addr_to_str(&(minip->addr)),
1979 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1982 lcp2_imbalances[srcnode] = minsrcimbl;
1983 lcp2_imbalances[mindstnode] = mindstimbl;
1984 minip->pnn = mindstnode;
/*
 * Sort record used by lcp2_failback(): carries a node's LCP2 imbalance
 * (member "imbalance", compared by lcp2_cmp_imbalance_pnn) so that an
 * array of these can be ordered with qsort().
 * NOTE(review): the member list is not visible here; presumably it
 * also carries the node number - confirm.
 */
1993 struct lcp2_imbalance_pnn {
/*
 * qsort() comparator for struct lcp2_imbalance_pnn, ordering on the
 * "imbalance" member only.  The "greater than" and "equal" cases are
 * tested explicitly; the remaining case is "less than".
 * NOTE(review): the returned values are not visible here; the caller's
 * log message talks about selecting the most imbalanced node, so
 * presumably larger imbalances sort first - confirm.
 */
1998 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
2000 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
2001 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
2003 if (lipa->imbalance > lipb->imbalance) {
2005 } else if (lipa->imbalance == lipb->imbalance) {
2012 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2012 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2013 * node with the highest LCP2 imbalance, and then determines the best
2014 * IP/destination node combination to move from the source node.
/*
 * Driver for LCP2 failback.
 *
 * The per-node imbalances are copied into a temporary array of struct
 * lcp2_imbalance_pnn (allocated on ipalloc_state) and sorted with
 * lcp2_cmp_imbalance_pnn.  The sorted entries are then tried in order
 * as source nodes for lcp2_failback_candidate().  An entry whose
 * imbalance is 0 is treated specially, since it means no node at or
 * after that position holds more than one address.
 *
 * NOTE(review): what happens after a successful or unsuccessful
 * candidate call, and the release of the temporary array, are not
 * visible here - confirm.
 */
2016 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2017 struct ctdb_ipflags *ipflags,
2018 struct public_ip_list *all_ips,
2019 uint32_t *lcp2_imbalances,
2020 bool *rebalance_candidates)
2023 struct lcp2_imbalance_pnn * lips;
2026 numnodes = talloc_array_length(ipflags);
2029 /* Put the imbalances and nodes into an array, sort them and
2030 * iterate through candidates. Usually the 1st one will be
2031 * used, so this doesn't cost much...
2033 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2034 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2035 lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2036 for (i=0; i<numnodes; i++) {
2037 lips[i].imbalance = lcp2_imbalances[i];
2039 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2041 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2042 lcp2_cmp_imbalance_pnn);
2045 for (i=0; i<numnodes; i++) {
2046 /* This means that all nodes had 0 or 1 addresses, so
2047 * can't be imbalanced.
2049 if (lips[i].imbalance == 0) {
2053 if (lcp2_failback_candidate(ipalloc_state,
2058 rebalance_candidates)) {
/*
 * Drop assignments that are no longer valid.
 *
 * For every address in all_ips that currently has a node (pnn != -1),
 * can_node_host_ip() is asked whether that node may keep hosting it,
 * using the node's own entry in ipflags.  When the answer is no, the
 * unassignment is logged at DEBUG_DEBUG level.
 *
 * The test is written as a plain negation; the former trailing
 * comparison of the negated result against 0 was redundant and only
 * obscured the condition.
 */
2070 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
2071 struct ctdb_ipflags *ipflags,
2072 struct public_ip_list *all_ips)
2074 struct public_ip_list *tmp_ip;
2076 /* verify that the assigned nodes can serve that public ip
2077 and set it to -1 if not
2079 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2080 if (tmp_ip->pnn == -1) {
2083 if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
2084 ipflags[tmp_ip->pnn], tmp_ip)) {
2085 /* this node can not serve this ip. */
2086 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2087 ctdb_addr_to_str(&(tmp_ip->addr)),
/*
 * Deterministic allocation: every address in all_ips is first assigned
 * to node (list position modulo number of nodes), overwriting whatever
 * assignment it had.  Assignments to nodes that cannot host the
 * address are then removed by unassign_unsuitable_ips() and the
 * leftovers are placed by basic_allocate_unassigned().
 *
 * A warning is logged when no_ip_failback is 1, because that setting
 * is ignored by this algorithm.  No failback pass is run.
 *
 * NOTE(review): the warning text opens a quote before DeterministicIPs
 * but never closes it - confirm whether the message should be fixed.
 */
2094 static void ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
2095 struct ctdb_ipflags *ipflags,
2096 struct public_ip_list *all_ips)
2098 struct public_ip_list *tmp_ip;
2101 numnodes = talloc_array_length(ipflags);
2103 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2104 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2105 * always be allocated the same way for a specific set of
2106 * available/unavailable nodes.
2109 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2110 tmp_ip->pnn = i % numnodes;
2113 /* IP failback doesn't make sense with deterministic
2114 * IPs, since the modulo step above implicitly fails
2115 * back IPs to their "home" node.
2117 if (1 == ipalloc_state->no_ip_failback) {
2118 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2121 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2123 basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2125 /* No failback here! */
/*
 * Non-deterministic allocation: remove assignments to unsuitable nodes
 * (unassign_unsuitable_ips), place every unassigned address
 * (basic_allocate_unassigned) and, unless no_ip_failback is 1, even
 * out the per-node address counts with basic_failback().
 *
 * NOTE(review): the body of the first loop over all_ips is not visible
 * here; presumably it counts the addresses into num_ips, which is
 * later passed to basic_failback() - confirm.
 */
2128 static void ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
2129 struct ctdb_ipflags *ipflags,
2130 struct public_ip_list *all_ips)
2132 /* This should be pushed down into basic_failback. */
2133 struct public_ip_list *tmp_ip;
2135 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2139 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2141 basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2143 /* If we don't want IPs to fail back then don't rebalance IPs. */
2144 if (1 == ipalloc_state->no_ip_failback) {
2148 /* Now, try to make sure the ip addresses are evenly distributed
2151 basic_failback(ipalloc_state, ipflags, all_ips, num_ips);
/*
 * LCP2 allocation.
 *
 * Working arrays live on a temporary talloc context that is a child of
 * ipalloc_state and is freed at the end of the function.  The steps
 * are:
 *   1. unassign_unsuitable_ips() - drop invalid assignments
 *   2. lcp2_init()               - imbalances and rebalance candidates
 *   3. lcp2_allocate_unassigned() - place unassigned addresses
 *   4. lcp2_failback()           - rebalance onto candidate nodes
 * Step 4 is not wanted when no_ip_failback is 1 and is pointless when
 * no node is flagged as a rebalance candidate, which is checked with a
 * simple count first.
 *
 * NOTE(review): the bodies of the two early-exit tests are not visible
 * here; presumably they skip ahead to the final talloc_free() so that
 * tmp_ctx is released on every path - confirm.
 */
2154 static void ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
2155 struct ctdb_ipflags *ipflags,
2156 struct public_ip_list *all_ips,
2157 uint32_t *force_rebalance_nodes)
2159 uint32_t *lcp2_imbalances;
2160 bool *rebalance_candidates;
2161 int numnodes, num_rebalance_candidates, i;
2163 TALLOC_CTX *tmp_ctx = talloc_new(ipalloc_state);
2165 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2167 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2168 &lcp2_imbalances, &rebalance_candidates);
2170 lcp2_allocate_unassigned(ipalloc_state, ipflags, all_ips, lcp2_imbalances);
2172 /* If we don't want IPs to fail back then don't rebalance IPs. */
2173 if (1 == ipalloc_state->no_ip_failback) {
2177 /* It is only worth continuing if we have suitable target
2178 * nodes to transfer IPs to. This check is much cheaper than
2181 numnodes = talloc_array_length(ipflags);
2182 num_rebalance_candidates = 0;
2183 for (i=0; i<numnodes; i++) {
2184 if (rebalance_candidates[i]) {
2185 num_rebalance_candidates++;
2188 if (num_rebalance_candidates == 0) {
2192 /* Now, try to make sure the ip addresses are evenly distributed
2195 lcp2_failback(ipalloc_state, ipflags, all_ips,
2196 lcp2_imbalances, rebalance_candidates);
2199 talloc_free(tmp_ctx);
/*
 * Scan nodemap for a node that has neither NODE_FLAGS_INACTIVE nor
 * NODE_FLAGS_DISABLED set, i.e. a completely healthy node.
 * NOTE(review): the return statements are not visible here;
 * presumably false is returned as soon as a healthy node is found and
 * true when none exists - confirm.
 */
2202 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2206 for (i=0;i<nodemap->num;i++) {
2207 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2208 /* Found one completely healthy node */
2216 /* The calculation part of the IP allocation algorithm. */
/*
 * Dispatch to the allocation routine selected by
 * ipalloc_state->algorithm:
 *   LCP2 (first visible case) -> ip_alloc_lcp2()
 *   IPALLOC_DETERMINISTIC     -> ip_alloc_deterministic_ips()
 *   IPALLOC_NONDETERMINISTIC  -> ip_alloc_nondeterministic_ips()
 * Only the LCP2 routine receives force_rebalance_nodes.
 *
 * On return each entry of all_ips carries in ->pnn the node chosen to
 * host it, or -1 when no node can.
 */
2217 static void ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
2218 struct ctdb_ipflags *ipflags,
2219 struct public_ip_list *all_ips,
2220 uint32_t *force_rebalance_nodes)
2222 switch (ipalloc_state->algorithm) {
2224 ip_alloc_lcp2(ipalloc_state, ipflags, all_ips,
2225 force_rebalance_nodes);
2227 case IPALLOC_DETERMINISTIC:
2228 ip_alloc_deterministic_ips(ipalloc_state, ipflags, all_ips);
2230 case IPALLOC_NONDETERMINISTIC:
2231 ip_alloc_nondeterministic_ips(ipalloc_state, ipflags, all_ips);
2235 /* at this point ->pnn is the node which will own each IP
2236 or -1 if there is no node that can cover this ip
/*
 * State shared between get_tunable_from_nodes() and its async
 * callbacks.  "tunable" is the name of the tunable being fetched and
 * is used in log messages.  The code in this file also uses members
 * "out" (per-node result array indexed by pnn) and "fatal" (cleared
 * before the controls are sent, inspected when they fail).
 */
2242 struct get_tunable_callback_data {
2243 const char *tunable;
/*
 * Success callback for the GET_TUNABLE control sent by
 * get_tunable_from_nodes().
 *
 * The reply must be exactly sizeof(uint32_t) bytes; anything else is
 * logged as an error.  The reply is also checked against the length of
 * the cd->out array (talloc_array_length) and a mismatch is logged.
 * A good reply is stored in cd->out[pnn].
 *
 * NOTE(review): the conditions guarding the early exits and whether
 * they set cd->fatal are not visible here - confirm.
 */
2248 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2249 int32_t res, TDB_DATA outdata,
2252 struct get_tunable_callback_data *cd =
2253 (struct get_tunable_callback_data *)callback;
2257 /* Already handled in fail callback */
2261 if (outdata.dsize != sizeof(uint32_t)) {
2262 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2263 cd->tunable, pnn, (int)sizeof(uint32_t),
2264 (int)outdata.dsize));
2269 size = talloc_array_length(cd->out);
2271 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2272 cd->tunable, pnn, size));
2277 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/*
 * Failure callback for the GET_TUNABLE control sent by
 * get_tunable_from_nodes().  Three outcomes are distinguished in the
 * log: the control timed out, the tunable is not implemented on the
 * node (logged at warning level), or an unexpected error occurred.
 * NOTE(review): the mapping from "res" to these outcomes and any
 * update of cd->fatal are not visible here - confirm.
 */
2280 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2281 int32_t res, TDB_DATA outdata,
2284 struct get_tunable_callback_data *cd =
2285 (struct get_tunable_callback_data *)callback;
2290 ("Timed out getting tunable \"%s\" from node %d\n",
2296 DEBUG(DEBUG_WARNING,
2297 ("Tunable \"%s\" not implemented on node %d\n",
2302 ("Unexpected error getting tunable \"%s\" from node %d\n",
/*
 * Fetch the value of one tunable from every connected node.
 *
 * A result array with one slot per nodemap entry is allocated on
 * tmp_ctx and pre-filled with default_value, so nodes that do not
 * answer keep the default.  A GET_TUNABLE request (length-prefixed,
 * NUL-terminated tunable name) is built on tmp_ctx and sent with
 * ctdb_client_async_control() to the nodes returned by
 * list_of_connected_nodes(); get_tunable_callback() stores each reply
 * and get_tunable_fail_callback() reports failures.  The request
 * buffer is freed before returning.
 *
 * The request buffer allocation is checked before it is written to;
 * previously a failed talloc_size() was dereferenced immediately.  On
 * failure NULL is returned through CTDB_NO_MEMORY_NULL, exactly as for
 * the result array; everything allocated so far hangs off tmp_ctx.
 *
 * NOTE(review): the handling of callback_data.fatal after a failed
 * control and the final return statement are not visible here;
 * presumably the result array is returned on success - confirm.
 */
2308 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2309 TALLOC_CTX *tmp_ctx,
2310 struct ctdb_node_map_old *nodemap,
2311 const char *tunable,
2312 uint32_t default_value)
2315 struct ctdb_control_get_tunable *t;
2318 struct get_tunable_callback_data callback_data;
2321 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2322 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2323 for (i=0; i<nodemap->num; i++) {
2324 tvals[i] = default_value;
2327 callback_data.out = tvals;
2328 callback_data.tunable = tunable;
2329 callback_data.fatal = false;
2331 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2332 data.dptr = talloc_size(tmp_ctx, data.dsize);
CTDB_NO_MEMORY_NULL(ctdb, data.dptr);
2333 t = (struct ctdb_control_get_tunable *)data.dptr;
2334 t->length = strlen(tunable)+1;
2335 memcpy(t->name, tunable, t->length);
2336 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2337 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2338 nodes, 0, TAKEOVER_TIMEOUT(),
2340 get_tunable_callback,
2341 get_tunable_fail_callback,
2342 &callback_data) != 0) {
2343 if (callback_data.fatal) {
2349 talloc_free(data.dptr);
2354 /* Set internal flags for IP allocation:
2356 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2357 * Set NOIPHOST ip flag for each INACTIVE node
2358 * if all nodes are disabled:
2359 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2361 * Set NOIPHOST ip flags for disabled nodes
/*
 * Compute the per-node IP allocation flags.
 *
 * A zero-initialised array with one struct ctdb_ipflags per nodemap
 * entry is allocated on tmp_ctx.  Then:
 *   - noiptakeover is set for nodes whose tval_noiptakeover[] entry is
 *     non-zero;
 *   - noiphost is set for nodes flagged NODE_FLAGS_INACTIVE;
 *   - if all_nodes_are_disabled(): noiphost is set for nodes whose
 *     tval_noiphostonalldisabled[] entry is non-zero;
 *   - otherwise: noiphost is set for nodes flagged NODE_FLAGS_DISABLED.
 *
 * Both tunable arrays are indexed like nodemap->nodes[].
 *
 * NOTE(review): the return statements are not visible here; presumably
 * NULL is returned when the allocation fails and the flags array
 * otherwise - confirm.
 */
2363 static struct ctdb_ipflags *
2364 set_ipflags_internal(TALLOC_CTX *tmp_ctx,
2365 struct ctdb_node_map_old *nodemap,
2366 uint32_t *tval_noiptakeover,
2367 uint32_t *tval_noiphostonalldisabled)
2370 struct ctdb_ipflags *ipflags;
2372 /* Clear IP flags - implicit due to talloc_zero */
2373 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2374 if (ipflags == NULL) {
2375 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
2379 for (i=0;i<nodemap->num;i++) {
2380 /* Can not take IPs on node with NoIPTakeover set */
2381 if (tval_noiptakeover[i] != 0) {
2382 ipflags[i].noiptakeover = true;
2385 /* Can not host IPs on INACTIVE node */
2386 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2387 ipflags[i].noiphost = true;
2391 if (all_nodes_are_disabled(nodemap)) {
2392 /* If all nodes are disabled, can not host IPs on node
2393 * with NoIPHostOnAllDisabled set
2395 for (i=0;i<nodemap->num;i++) {
2396 if (tval_noiphostonalldisabled[i] != 0) {
2397 ipflags[i].noiphost = true;
2401 /* If some nodes are not disabled, then can not host
2402 * IPs on DISABLED node
2404 for (i=0;i<nodemap->num;i++) {
2405 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2406 ipflags[i].noiphost = true;
/*
 * Gather the per-node tunables needed for IP allocation and turn them
 * into flags.
 *
 * Two tunables are fetched from the nodes with
 * get_tunable_from_nodes(); the second one visible here is
 * "NoIPHostOnAllDisabled" with default 0.  The results are passed to
 * set_ipflags_internal() and both temporary arrays are freed
 * afterwards.  Everything is allocated on tmp_ctx, which the caller is
 * responsible for freeing, including on the failure paths.
 *
 * NOTE(review): the name and default of the first tunable and the
 * return statements are not visible here; presumably NULL is returned
 * when either fetch fails and the flags array otherwise - confirm.
 */
2414 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2415 TALLOC_CTX *tmp_ctx,
2416 struct ctdb_node_map_old *nodemap)
2418 uint32_t *tval_noiptakeover;
2419 uint32_t *tval_noiphostonalldisabled;
2420 struct ctdb_ipflags *ipflags;
2423 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2425 if (tval_noiptakeover == NULL) {
2429 tval_noiphostonalldisabled =
2430 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2431 "NoIPHostOnAllDisabled", 0);
2432 if (tval_noiphostonalldisabled == NULL) {
2433 /* Caller frees tmp_ctx */
2437 ipflags = set_ipflags_internal(tmp_ctx, nodemap,
2439 tval_noiphostonalldisabled);
2441 talloc_free(tval_noiptakeover);
2442 talloc_free(tval_noiphostonalldisabled);
/*
 * Allocate and initialise the state used for one IP allocation run.
 *
 * The state is allocated zeroed on mem_ctx.  "num" is taken from
 * ctdb->num_nodes and two zeroed pointer arrays of that length
 * (known_public_ips, available_public_ips) are allocated as children
 * of the state; if either array cannot be allocated the state is
 * freed again.
 *
 * The algorithm is chosen from the local tunables: LCP2 when
 * lcp2_public_ip_assignment is 1, otherwise deterministic when
 * deterministic_public_ips is 1, otherwise non-deterministic.
 * no_ip_failback is copied from the tunable of the same name.
 *
 * Returns the new state.
 * NOTE(review): the return statements on the out-of-memory paths are
 * not visible here; presumably NULL is returned - confirm.
 */
2447 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2448 TALLOC_CTX *mem_ctx)
2450 struct ipalloc_state *ipalloc_state =
2451 talloc_zero(mem_ctx, struct ipalloc_state);
2452 if (ipalloc_state == NULL) {
2453 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2457 ipalloc_state->num = ctdb->num_nodes;
2458 ipalloc_state->known_public_ips =
2459 talloc_zero_array(ipalloc_state,
2460 struct ctdb_public_ip_list_old *,
2461 ipalloc_state->num);
2462 if (ipalloc_state->known_public_ips == NULL) {
2463 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2464 talloc_free(ipalloc_state);
2467 ipalloc_state->available_public_ips =
2468 talloc_zero_array(ipalloc_state,
2469 struct ctdb_public_ip_list_old *,
2470 ipalloc_state->num);
2471 if (ipalloc_state->available_public_ips == NULL) {
2472 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2473 talloc_free(ipalloc_state);
2477 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2478 ipalloc_state->algorithm = IPALLOC_LCP2;
2479 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2480 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2482 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2485 ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2487 return ipalloc_state;
/*
 * State handed to iprealloc_fail_callback().  Besides the members
 * shown, the code in this file uses "retry_nodes" (array of bool
 * indexed by pnn, marking nodes that should be retried) and
 * "retry_count".  fail_callback / fail_callback_data are the caller's
 * own failure handler and its argument; nodemap is consulted for node
 * flags.
 */
2490 struct iprealloc_callback_data {
2493 client_async_callback fail_callback;
2494 void *fail_callback_data;
2495 struct ctdb_node_map_old *nodemap;
/*
 * Failure callback for the IPREALLOCATED control sent by
 * ctdb_takeover_run().
 *
 * pnn is validated against the length of cd->retry_nodes before it is
 * used as an index.  Valid indices are 0 .. numnodes-1, so the check
 * rejects pnn >= numnodes; the earlier "greater than" test let
 * pnn == numnodes through and indexed one element past the end of
 * both nodemap->nodes[] and retry_nodes[].
 *
 * Failures on nodes flagged NODE_FLAGS_INACTIVE are logged and
 * ignored, since such a node cannot run the event.  A timeout is
 * treated as a real error and forwarded to the caller's fail_callback
 * when one is registered (otherwise a warning is logged).  Any other
 * failure flags the node in retry_nodes[] so that the caller can retry
 * it with the eventscript control.
 */
2498 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2499 int32_t res, TDB_DATA outdata,
2503 struct iprealloc_callback_data *cd =
2504 (struct iprealloc_callback_data *)callback;
2506 numnodes = talloc_array_length(cd->retry_nodes);
2507 if (pnn >= numnodes) {
2509 ("ipreallocated failure from node %d, "
2510 "but only %d nodes in nodemap\n",
2515 /* Can't run the "ipreallocated" event on a INACTIVE node */
2516 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2517 DEBUG(DEBUG_WARNING,
2518 ("ipreallocated failed on inactive node %d, ignoring\n",
2525 /* If the control timed out then that's a real error,
2526 * so call the real fail callback
2528 if (cd->fail_callback) {
2529 cd->fail_callback(ctdb, pnn, res, outdata,
2530 cd->fail_callback_data);
2532 DEBUG(DEBUG_WARNING,
2533 ("iprealloc timed out but no callback registered\n"));
2537 /* If not a timeout then either the ipreallocated
2538 * eventscript (or some setup) failed. This might
2539 * have failed because the IPREALLOCATED control isn't
2540 * implemented - right now there is no way of knowing
2541 * because the error codes are all folded down to -1.
2542 * Consider retrying using EVENTSCRIPT control...
2544 DEBUG(DEBUG_WARNING,
2545 ("ipreallocated failure from node %d, flagging retry\n",
2547 cd->retry_nodes[pnn] = true;
/*
 * State handed to takeover_run_fail_callback().  Besides the members
 * shown, the code in this file uses "node_failed" (array of bool, one
 * per nodemap entry, recording nodes already reported as failed).
 * fail_callback / fail_callback_data are the caller's own failure
 * handler and its argument; nodemap maps PNNs to array positions.
 */
2552 struct takeover_callback_data {
2554 client_async_callback fail_callback;
2555 void *fail_callback_data;
2556 struct ctdb_node_map_old *nodemap;
/*
 * Failure callback wrapped around the caller's fail_callback for the
 * RELEASE_IP phase of ctdb_takeover_run().
 *
 * node_pnn is looked up in cd->nodemap to find its array position; an
 * unknown PNN is logged as an error.  The caller's callback is invoked
 * only the first time a given node fails, which is tracked in
 * cd->node_failed[], so a node that fails many controls is reported
 * once.
 *
 * callback_data must be a talloc'ed struct takeover_callback_data;
 * talloc_get_type_abort() enforces this.
 */
2559 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2560 uint32_t node_pnn, int32_t res,
2561 TDB_DATA outdata, void *callback_data)
2563 struct takeover_callback_data *cd =
2564 talloc_get_type_abort(callback_data,
2565 struct takeover_callback_data);
2568 for (i = 0; i < cd->nodemap->num; i++) {
2569 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2574 if (i == cd->nodemap->num) {
2575 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2579 if (!cd->node_failed[i]) {
2580 cd->node_failed[i] = true;
/* NOTE(review): fail_callback is called without a NULL check here,
 * unlike in iprealloc_fail_callback - confirm callers always pass one. */
2581 cd->fail_callback(ctdb, node_pnn, res, outdata,
2582 cd->fail_callback_data);
2587 make any IP alias changes for public addresses that are necessary
/*
 * Run one complete IP takeover pass for the cluster.
 *
 * Parameters:
 *   ctdb                  - daemon context
 *   nodemap               - current node map; node flags decide which
 *                           nodes are contacted
 *   force_rebalance_nodes - optional array of PNNs that may receive
 *                           failed-back addresses (passed through to
 *                           the allocation algorithm)
 *   fail_callback         - invoked for nodes whose controls fail
 *   callback_data         - argument for fail_callback
 *
 * Phases, all using a temporary talloc context that is freed before
 * returning:
 *   1. If the disable_ip_failover tunable is non-zero, allocation is
 *      skipped entirely and only the "ipreallocated" event is sent.
 *   2. Allocation state and per-node IP flags are set up and the known
 *      and available public addresses are fetched from the nodes.
 *      When no node reports available addresses, allocation is
 *      short-circuited with a warning.
 *   3. The per-node lists are merged and ctdb_takeover_run_core()
 *      decides the owner of every address.
 *   4. RELEASE_IP is sent to every node that is not flagged
 *      NODE_FLAGS_DISCONNECTED for each address it should not host.
 *      Failures go through takeover_run_fail_callback().
 *   5. TAKEOVER_IP is sent to the chosen node of every assigned
 *      address.  Failures go straight to fail_callback.
 *   6. IPREALLOCATED is sent to the connected nodes.  Nodes flagged by
 *      iprealloc_fail_callback() are retried with RUN_EVENTSCRIPTS
 *      and the "ipreallocated" argument.
 *
 * NOTE(review): the return statements are not visible here;
 * presumably a negative value is returned on the error paths that
 * free tmp_ctx early and the result of the final control otherwise -
 * confirm.
 */
2589 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2590 uint32_t *force_rebalance_nodes,
2591 client_async_callback fail_callback, void *callback_data)
2594 struct ctdb_public_ip ip;
2596 struct public_ip_list *all_ips, *tmp_ip;
2598 struct timeval timeout;
2599 struct client_async_data *async_data;
2600 struct ctdb_client_control_state *state;
2601 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2602 struct ctdb_ipflags *ipflags;
2603 struct ipalloc_state *ipalloc_state;
2604 struct takeover_callback_data *takeover_data;
2605 struct iprealloc_callback_data iprealloc_data;
2610 * ip failover is completely disabled, just send out the
2611 * ipreallocated event.
2613 if (ctdb->tunable.disable_ip_failover != 0) {
2617 ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2618 if (ipalloc_state == NULL) {
2619 talloc_free(tmp_ctx);
2623 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2624 if (ipflags == NULL) {
2625 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2626 talloc_free(tmp_ctx);
2630 /* Fetch known/available public IPs from each active node */
2631 ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2633 talloc_free(tmp_ctx);
2637 /* Short-circuit IP allocation if no node has available IPs */
2638 can_host_ips = false;
2639 for (i=0; i < ipalloc_state->num; i++) {
2640 if (ipalloc_state->available_public_ips[i] != NULL) {
2641 can_host_ips = true;
2644 if (!can_host_ips) {
2645 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2649 /* since nodes only know about those public addresses that
2650 can be served by that particular node, no single node has
2651 a full list of all public addresses that exist in the cluster.
2652 Walk over all node structures and create a merged list of
2653 all public addresses that exist in the cluster.
2655 keep the tree of ips around as ctdb->ip_tree
2657 all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2659 /* Do the IP reassignment calculations */
2660 ctdb_takeover_run_core(ipalloc_state, ipflags,
2661 all_ips, force_rebalance_nodes);
2663 /* Now tell all nodes to release any public IPs they should not
2664 * host. This will be a NOOP on nodes that don't currently
2665 * hold the given IP.
2667 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2668 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2670 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2671 bool, nodemap->num);
2672 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2673 takeover_data->fail_callback = fail_callback;
2674 takeover_data->fail_callback_data = callback_data;
2675 takeover_data->nodemap = nodemap;
2677 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2678 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2680 async_data->fail_callback = takeover_run_fail_callback;
2681 async_data->callback_data = takeover_data;
2683 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2685 /* Send a RELEASE_IP to all nodes that should not be hosting
2686 * each IP. For each IP, all but one of these will be
2687 * redundant. However, the redundant ones are used to tell
2688 * nodes which node should be hosting the IP so that commands
2689 * like "ctdb ip" can display a particular nodes idea of who
2690 * is hosting what. */
2691 for (i=0;i<nodemap->num;i++) {
2692 /* don't talk to unconnected nodes, but do talk to banned nodes */
2693 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2697 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2698 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2699 /* This node should be serving this
2700 vnn so don't tell it to release the ip
2704 ip.pnn = tmp_ip->pnn;
2705 ip.addr = tmp_ip->addr;
2707 timeout = TAKEOVER_TIMEOUT();
2708 data.dsize = sizeof(ip);
2709 data.dptr = (uint8_t *)&ip;
2710 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2711 0, CTDB_CONTROL_RELEASE_IP, 0,
2714 if (state == NULL) {
2715 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2716 talloc_free(tmp_ctx);
2720 ctdb_client_async_add(async_data, state);
2723 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2724 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2725 talloc_free(tmp_ctx);
2728 talloc_free(async_data);
2731 /* For each IP, send a TAKEOVER_IP to the node that should be
2732 * hosting it. Many of these will often be redundant (since
2733 * the allocation won't have changed) but they can be useful
2734 * to recover from inconsistencies. */
2735 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2736 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2738 async_data->fail_callback = fail_callback;
2739 async_data->callback_data = callback_data;
2741 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2742 if (tmp_ip->pnn == -1) {
2743 /* this IP won't be taken over */
2747 ip.pnn = tmp_ip->pnn;
2748 ip.addr = tmp_ip->addr;
2750 timeout = TAKEOVER_TIMEOUT();
2751 data.dsize = sizeof(ip);
2752 data.dptr = (uint8_t *)&ip;
2753 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2754 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2755 data, async_data, &timeout, NULL);
2756 if (state == NULL) {
2757 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2758 talloc_free(tmp_ctx);
2762 ctdb_client_async_add(async_data, state);
2764 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2765 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2766 talloc_free(tmp_ctx);
2772 * Tell all nodes to run eventscripts to process the
2773 * "ipreallocated" event. This can do a lot of things,
2774 * including restarting services to reconfigure them if public
2775 * IPs have moved. Once upon a time this event only used to
2778 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2779 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2780 iprealloc_data.retry_nodes = retry_data;
2781 iprealloc_data.retry_count = 0;
2782 iprealloc_data.fail_callback = fail_callback;
2783 iprealloc_data.fail_callback_data = callback_data;
2784 iprealloc_data.nodemap = nodemap;
2786 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2787 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2788 nodes, 0, TAKEOVER_TIMEOUT(),
2790 NULL, iprealloc_fail_callback,
2793 /* If the control failed then we should retry to any
2794 * nodes flagged by iprealloc_fail_callback using the
2795 * EVENTSCRIPT control. This is a best-effort at
2796 * backward compatibility when running a mixed cluster
2797 * where some nodes have not yet been upgraded to
2798 * support the IPREALLOCATED control.
2800 DEBUG(DEBUG_WARNING,
2801 ("Retry ipreallocated to some nodes using eventscript control\n"));
2803 nodes = talloc_array(tmp_ctx, uint32_t,
2804 iprealloc_data.retry_count);
2805 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2808 for (i=0; i<nodemap->num; i++) {
2809 if (iprealloc_data.retry_nodes[i]) {
2815 data.dptr = discard_const("ipreallocated");
2816 data.dsize = strlen((char *)data.dptr) + 1;
2817 ret = ctdb_client_async_control(ctdb,
2818 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2819 nodes, 0, TAKEOVER_TIMEOUT(),
2821 NULL, fail_callback,
2824 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2828 talloc_free(tmp_ctx);
2834 destroy a ctdb_client_ip structure
2836 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2838 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2839 ctdb_addr_to_str(&ip->addr),
2840 ntohs(ip->addr.ip.sin_port),
2843 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2848 called by a client to inform us of a TCP connection that it is managing
2849 that should tickled with an ACK when IP takeover is done
2851 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2854 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2855 struct ctdb_connection *tcp_sock = NULL;
2856 struct ctdb_tcp_list *tcp;
2857 struct ctdb_connection t;
2860 struct ctdb_client_ip *ip;
2861 struct ctdb_vnn *vnn;
2862 ctdb_sock_addr addr;
2864 /* If we don't have public IPs, tickles are useless */
2865 if (ctdb->vnn == NULL) {
2869 tcp_sock = (struct ctdb_connection *)indata.dptr;
2871 addr = tcp_sock->src;
2872 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2873 addr = tcp_sock->dst;
2874 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2877 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2878 vnn = find_public_ip_vnn(ctdb, &addr);
2880 switch (addr.sa.sa_family) {
2882 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2883 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2884 ctdb_addr_to_str(&addr)));
2888 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2889 ctdb_addr_to_str(&addr)));
2892 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2898 if (vnn->pnn != ctdb->pnn) {
2899 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2900 ctdb_addr_to_str(&addr),
2901 client_id, client->pid));
2902 /* failing this call will tell smbd to die */
2906 ip = talloc(client, struct ctdb_client_ip);
2907 CTDB_NO_MEMORY(ctdb, ip);
2911 ip->client_id = client_id;
2912 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2913 DLIST_ADD(ctdb->client_ip_list, ip);
2915 tcp = talloc(client, struct ctdb_tcp_list);
2916 CTDB_NO_MEMORY(ctdb, tcp);
2918 tcp->connection.src = tcp_sock->src;
2919 tcp->connection.dst = tcp_sock->dst;
2921 DLIST_ADD(client->tcp_list, tcp);
2923 t.src = tcp_sock->src;
2924 t.dst = tcp_sock->dst;
2926 data.dptr = (uint8_t *)&t;
2927 data.dsize = sizeof(t);
2929 switch (addr.sa.sa_family) {
2931 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2932 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2933 ctdb_addr_to_str(&tcp_sock->src),
2934 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2937 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2938 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2939 ctdb_addr_to_str(&tcp_sock->src),
2940 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2943 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2947 /* tell all nodes about this tcp connection */
2948 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2949 CTDB_CONTROL_TCP_ADD,
2950 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2952 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2960 find a tcp address on a list
2962 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2963 struct ctdb_connection *tcp)
2967 if (array == NULL) {
2971 for (i=0;i<array->num;i++) {
2972 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2973 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2974 return &array->connections[i];
2983 called by a daemon to inform us of a TCP connection that one of its
2984 clients managing that should tickled with an ACK when IP takeover is
2987 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2989 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2990 struct ctdb_tcp_array *tcparray;
2991 struct ctdb_connection tcp;
2992 struct ctdb_vnn *vnn;
2994 /* If we don't have public IPs, tickles are useless */
2995 if (ctdb->vnn == NULL) {
2999 vnn = find_public_ip_vnn(ctdb, &p->dst);
3001 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3002 ctdb_addr_to_str(&p->dst)));
3008 tcparray = vnn->tcp_array;
3010 /* If this is the first tickle */
3011 if (tcparray == NULL) {
3012 tcparray = talloc(vnn, struct ctdb_tcp_array);
3013 CTDB_NO_MEMORY(ctdb, tcparray);
3014 vnn->tcp_array = tcparray;
3017 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3018 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3020 tcparray->connections[tcparray->num].src = p->src;
3021 tcparray->connections[tcparray->num].dst = p->dst;
3024 if (tcp_update_needed) {
3025 vnn->tcp_update_needed = true;
3031 /* Do we already have this tickle ?*/
3034 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3035 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3036 ctdb_addr_to_str(&tcp.dst),
3037 ntohs(tcp.dst.ip.sin_port),
3042 /* A new tickle, we must add it to the array */
3043 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3044 struct ctdb_connection,
3046 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3048 tcparray->connections[tcparray->num].src = p->src;
3049 tcparray->connections[tcparray->num].dst = p->dst;
3052 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3053 ctdb_addr_to_str(&tcp.dst),
3054 ntohs(tcp.dst.ip.sin_port),
3057 if (tcp_update_needed) {
3058 vnn->tcp_update_needed = true;
3066 called by a daemon to inform us of a TCP connection that one of its
3067 clients managing that should tickled with an ACK when IP takeover is
3070 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3072 struct ctdb_connection *tcpp;
3073 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3076 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3077 ctdb_addr_to_str(&conn->dst)));
3081 /* if the array is empty we cant remove it
3082 and we don't need to do anything
3084 if (vnn->tcp_array == NULL) {
3085 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3086 ctdb_addr_to_str(&conn->dst),
3087 ntohs(conn->dst.ip.sin_port)));
3092 /* See if we know this connection
3093 if we don't know this connection then we dont need to do anything
3095 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3097 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3098 ctdb_addr_to_str(&conn->dst),
3099 ntohs(conn->dst.ip.sin_port)));
3104 /* We need to remove this entry from the array.
3105 Instead of allocating a new array and copying data to it
3106 we cheat and just copy the last entry in the existing array
3107 to the entry that is to be removed and just shring the
3110 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3111 vnn->tcp_array->num--;
3113 /* If we deleted the last entry we also need to remove the entire array
3115 if (vnn->tcp_array->num == 0) {
3116 talloc_free(vnn->tcp_array);
3117 vnn->tcp_array = NULL;
3120 vnn->tcp_update_needed = true;
3122 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3123 ctdb_addr_to_str(&conn->src),
3124 ntohs(conn->src.ip.sin_port)));
3129 called by a daemon to inform us of a TCP connection that one of its
3130 clients used are no longer needed in the tickle database
3132 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3134 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3136 /* If we don't have public IPs, tickles are useless */
3137 if (ctdb->vnn == NULL) {
3141 ctdb_remove_connection(ctdb, conn);
3148 Called when another daemon starts - causes all tickles for all
3149 public addresses we are serving to be sent to the new node on the
3150 next check. This actually causes the next scheduled call to
3151 tdb_update_tcp_tickles() to update all nodes. This is simple and
3152 doesn't require careful error handling.
3154 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3156 struct ctdb_vnn *vnn;
3158 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3159 (unsigned long) pnn));
3161 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3162 vnn->tcp_update_needed = true;
3170 called when a client structure goes away - hook to remove
3171 elements from the tcp_list in all daemons
3173 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3175 while (client->tcp_list) {
3176 struct ctdb_tcp_list *tcp = client->tcp_list;
3177 DLIST_REMOVE(client->tcp_list, tcp);
3178 ctdb_remove_connection(client->ctdb, &tcp->connection);
/*
 * Release the public IPs this node is hosting.
 *
 * Walks ctdb->vnn.  For each address that passes the checks below, the
 * CTDB_EVENT_RELEASE_IP event is run with "<iface> <addr> <maskbits>",
 * the clients using the address are killed via release_kill_clients()
 * and the interface assignment is dropped.  update_in_flight is held
 * around that sequence; an address that already has it set is skipped
 * with a warning.  The number of released addresses is logged at the
 * end.
 */
3183 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3185 struct ctdb_vnn *vnn;
/* NOTE(review): presumably a no-op when IP failover is disabled - confirm */
3188 if (ctdb->tunable.disable_ip_failover == 1) {
3192 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
/* the address is not configured on this host: only drop the assignment */
3193 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3194 ctdb_vnn_unassign_iface(ctdb, vnn);
3201 /* Don't allow multiple releases at once. Some code,
3202 * particularly ctdb_tickle_sentenced_connections() is
3204 if (vnn->update_in_flight) {
3205 DEBUG(DEBUG_WARNING,
3207 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3208 ctdb_addr_to_str(&vnn->public_address),
3209 vnn->public_netmask_bits,
3210 ctdb_vnn_iface_string(vnn)));
3213 vnn->update_in_flight = true;
3215 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3216 ctdb_addr_to_str(&vnn->public_address),
3217 vnn->public_netmask_bits,
3218 ctdb_vnn_iface_string(vnn)));
3220 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3221 ctdb_vnn_iface_string(vnn),
3222 ctdb_addr_to_str(&vnn->public_address),
3223 vnn->public_netmask_bits);
3224 release_kill_clients(ctdb, &vnn->public_address);
3225 ctdb_vnn_unassign_iface(ctdb, vnn);
3226 vnn->update_in_flight = false;
3230 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3235 get list of public IPs
3237 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3238 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3241 struct ctdb_public_ip_list_old *ips;
3242 struct ctdb_vnn *vnn;
3243 bool only_available = false;
3245 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3246 only_available = true;
3249 /* count how many public ip structures we have */
3251 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3255 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3256 num*sizeof(struct ctdb_public_ip);
3257 ips = talloc_zero_size(outdata, len);
3258 CTDB_NO_MEMORY(ctdb, ips);
3261 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3262 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3265 ips->ips[i].pnn = vnn->pnn;
3266 ips->ips[i].addr = vnn->public_address;
3270 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3271 i*sizeof(struct ctdb_public_ip);
3273 outdata->dsize = len;
3274 outdata->dptr = (uint8_t *)ips;
3280 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3281 struct ctdb_req_control_old *c,
3286 ctdb_sock_addr *addr;
3287 struct ctdb_public_ip_info_old *info;
3288 struct ctdb_vnn *vnn;
3290 addr = (ctdb_sock_addr *)indata.dptr;
3292 vnn = find_public_ip_vnn(ctdb, addr);
3294 /* if it is not a public ip it could be our 'single ip' */
3295 if (ctdb->single_ip_vnn) {
3296 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3297 vnn = ctdb->single_ip_vnn;
3302 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3303 "'%s'not a public address\n",
3304 ctdb_addr_to_str(addr)));
3308 /* count how many public ip structures we have */
3310 for (;vnn->ifaces[num];) {
3314 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3315 num*sizeof(struct ctdb_iface);
3316 info = talloc_zero_size(outdata, len);
3317 CTDB_NO_MEMORY(ctdb, info);
3319 info->ip.addr = vnn->public_address;
3320 info->ip.pnn = vnn->pnn;
3321 info->active_idx = 0xFFFFFFFF;
3323 for (i=0; vnn->ifaces[i]; i++) {
3324 struct ctdb_interface *cur;
3326 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3328 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3332 if (vnn->iface == cur) {
3333 info->active_idx = i;
3335 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3336 info->ifaces[i].link_state = cur->link_up;
3337 info->ifaces[i].references = cur->references;
3340 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3341 i*sizeof(struct ctdb_iface);
3343 outdata->dsize = len;
3344 outdata->dptr = (uint8_t *)info;
3349 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3350 struct ctdb_req_control_old *c,
3354 struct ctdb_iface_list_old *ifaces;
3355 struct ctdb_interface *cur;
3357 /* count how many public ip structures we have */
3359 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3363 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3364 num*sizeof(struct ctdb_iface);
3365 ifaces = talloc_zero_size(outdata, len);
3366 CTDB_NO_MEMORY(ctdb, ifaces);
3369 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3370 strcpy(ifaces->ifaces[i].name, cur->name);
3371 ifaces->ifaces[i].link_state = cur->link_up;
3372 ifaces->ifaces[i].references = cur->references;
3376 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3377 i*sizeof(struct ctdb_iface);
3379 outdata->dsize = len;
3380 outdata->dptr = (uint8_t *)ifaces;
3385 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3386 struct ctdb_req_control_old *c,
3389 struct ctdb_iface *info;
3390 struct ctdb_interface *iface;
3391 bool link_up = false;
3393 info = (struct ctdb_iface *)indata.dptr;
3395 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3396 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3397 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3398 len, len, info->name));
3402 switch (info->link_state) {
3410 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3411 (unsigned int)info->link_state));
3415 if (info->references != 0) {
3416 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3417 (unsigned int)info->references));
3421 iface = ctdb_find_iface(ctdb, info->name);
3422 if (iface == NULL) {
3426 if (link_up == iface->link_up) {
3430 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3431 ("iface[%s] has changed it's link status %s => %s\n",
3433 iface->link_up?"up":"down",
3434 link_up?"up":"down"));
3436 iface->link_up = link_up;
3442 structure containing the listening socket and the list of tcp connections
3443 that the ctdb daemon is to kill
/*
 * State for killing TCP connections on one public address.
 * ctdb_killtcp_add_connection() allocates it as a talloc child of the
 * vnn and stores it in vnn->killtcp.
 */
3445 struct ctdb_kill_tcp {
3446 struct ctdb_vnn *vnn; /* public address this state belongs to */
3447 struct ctdb_context *ctdb; /* daemon context, used for the event loop and vnn list */
3449 struct tevent_fd *fde; /* read event on the capture socket, handled by capture_tcp_handler() */
3450 trbt_tree_t *connections; /* ctdb_killtcp_con entries keyed by killtcp_key() */
3455 a tcp connection that is to be killed
/*
 * One TCP connection that is to be killed.  Entries are talloc
 * children of the owning ctdb_kill_tcp and are stored in its
 * connections tree.
 */
3457 struct ctdb_killtcp_con {
3458 ctdb_sock_addr src_addr; /* canonicalized source address of the connection */
3459 ctdb_sock_addr dst_addr; /* canonicalized destination address of the connection */
3461 struct ctdb_kill_tcp *killtcp; /* back pointer to the owning state */
3464 /* this function is used to create a key to represent this socketpair
3465 in the killtcp tree.
3466 this key is used to insert and lookup matching socketpairs that are
3467 to be tickled and RST
3469 #define KILLTCP_KEYLEN 10
3470 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3472 static uint32_t key[KILLTCP_KEYLEN];
3474 bzero(key, sizeof(key));
3476 if (src->sa.sa_family != dst->sa.sa_family) {
3477 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3481 switch (src->sa.sa_family) {
3483 key[0] = dst->ip.sin_addr.s_addr;
3484 key[1] = src->ip.sin_addr.s_addr;
3485 key[2] = dst->ip.sin_port;
3486 key[3] = src->ip.sin_port;
3489 uint32_t *dst6_addr32 =
3490 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3491 uint32_t *src6_addr32 =
3492 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3493 key[0] = dst6_addr32[3];
3494 key[1] = src6_addr32[3];
3495 key[2] = dst6_addr32[2];
3496 key[3] = src6_addr32[2];
3497 key[4] = dst6_addr32[1];
3498 key[5] = src6_addr32[1];
3499 key[6] = dst6_addr32[0];
3500 key[7] = src6_addr32[0];
3501 key[8] = dst->ip6.sin6_port;
3502 key[9] = src->ip6.sin6_port;
3506 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3514 called when we get a read event on the raw socket
3516 static void capture_tcp_handler(struct tevent_context *ev,
3517 struct tevent_fd *fde,
3518 uint16_t flags, void *private_data)
3520 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3521 struct ctdb_killtcp_con *con;
3522 ctdb_sock_addr src, dst;
3523 uint32_t ack_seq, seq;
3525 if (!(flags & TEVENT_FD_READ)) {
3529 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3530 killtcp->private_data,
3532 &ack_seq, &seq) != 0) {
3533 /* probably a non-tcp ACK packet */
3537 /* check if we have this guy in our list of connections
3540 con = trbt_lookuparray32(killtcp->connections,
3541 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3543 /* no this was some other packet we can just ignore */
3547 /* This one has been tickled !
3548 now reset him and remove him from the list.
3550 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3551 ntohs(con->dst_addr.ip.sin_port),
3552 ctdb_addr_to_str(&con->src_addr),
3553 ntohs(con->src_addr.ip.sin_port)));
3555 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3560 /* when traversing the list of all tcp connections to send tickle acks to
3561 (so that we can capture the ack coming back and kill the connection
3563 this callback is called for each connection we are currently trying to kill
3565 static int tickle_connection_traverse(void *param, void *data)
3567 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3569 /* have tried too many times, just give up */
3570 if (con->count >= 5) {
3571 /* can't delete in traverse: reparent to delete_cons */
3572 talloc_steal(param, con);
3576 /* othervise, try tickling it again */
3579 (ctdb_sock_addr *)&con->dst_addr,
3580 (ctdb_sock_addr *)&con->src_addr,
3587 called every second until all sentenced connections have been reset
3589 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3590 struct tevent_timer *te,
3591 struct timeval t, void *private_data)
3593 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3594 void *delete_cons = talloc_new(NULL);
3596 /* loop over all connections sending tickle ACKs */
3597 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3599 /* now we've finished traverse, it's safe to do deletion. */
3600 talloc_free(delete_cons);
3602 /* If there are no more connections to kill we can remove the
3603 entire killtcp structure
3605 if ( (killtcp->connections == NULL) ||
3606 (killtcp->connections->root == NULL) ) {
3607 talloc_free(killtcp);
3611 /* try tickling them again in a seconds time
3613 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3614 timeval_current_ofs(1, 0),
3615 ctdb_tickle_sentenced_connections, killtcp);
3619 destroy the killtcp structure
3621 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3623 struct ctdb_vnn *tmpvnn;
3625 /* verify that this vnn is still active */
3626 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3627 if (tmpvnn == killtcp->vnn) {
3632 if (tmpvnn == NULL) {
3636 if (killtcp->vnn->killtcp != killtcp) {
3640 killtcp->vnn->killtcp = NULL;
/*
 * Insert callback for the killtcp tree: unconditionally replace any
 * existing connection structure with the new one.
 *
 * The old entry (data) is deliberately not freed here; it is
 * talloc_stolen by the same node in the tree and goes away with it.
 * Returns the new entry (parm).
 */
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3659 add a tcp socket to the list of connections we want to RST
/*
 * Queue one TCP connection for killing.
 *
 * s and d are the two endpoints of the connection; both are
 * canonicalized into local copies before use.  The owning vnn is
 * searched by destination, then by source, then against the
 * 'single ip'.
 *
 * The per-vnn ctdb_kill_tcp state is created on first use, with a
 * destructor attached.  The connection is inserted into its tree under
 * killtcp_key(dst, src).  A capture socket is opened on the vnn's
 * interface if none exists yet, and a read handler plus a one second
 * tickle timer are installed when there is no fd event yet.  Finally
 * the connection is tickled once straight away.
 *
 * NOTE(review): the results of trbt_create() and tevent_add_fd() are
 * used without a NULL check in the lines visible here - confirm.
 */
3661 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3665 ctdb_sock_addr src, dst;
3666 struct ctdb_kill_tcp *killtcp;
3667 struct ctdb_killtcp_con *con;
3668 struct ctdb_vnn *vnn;
3670 ctdb_canonicalize_ip(s, &src);
3671 ctdb_canonicalize_ip(d, &dst);
3673 vnn = find_public_ip_vnn(ctdb, &dst);
3675 vnn = find_public_ip_vnn(ctdb, &src);
3678 /* if it is not a public ip it could be our 'single ip' */
3679 if (ctdb->single_ip_vnn) {
3680 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3681 vnn = ctdb->single_ip_vnn;
3686 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3690 killtcp = vnn->killtcp;
3692 /* If this is the first connection to kill we must allocate
3695 if (killtcp == NULL) {
3696 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3697 CTDB_NO_MEMORY(ctdb, killtcp);
3700 killtcp->ctdb = ctdb;
3701 killtcp->capture_fd = -1;
3702 killtcp->connections = trbt_create(killtcp, 0);
3704 vnn->killtcp = killtcp;
3705 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3710 /* create a structure that describes this connection we want to
3711 RST and store it in killtcp->connections
3713 con = talloc(killtcp, struct ctdb_killtcp_con);
3714 CTDB_NO_MEMORY(ctdb, con);
3715 con->src_addr = src;
3716 con->dst_addr = dst;
3718 con->killtcp = killtcp;
3721 trbt_insertarray32_callback(killtcp->connections,
3722 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3723 add_killtcp_callback, con);
3726 If we don't have a socket to listen on yet we must create it
3728 if (killtcp->capture_fd == -1) {
3729 const char *iface = ctdb_vnn_iface_string(vnn);
3730 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3731 if (killtcp->capture_fd == -1) {
3732 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3733 "socket on iface '%s' for killtcp (%s)\n",
3734 iface, strerror(errno)));
3740 if (killtcp->fde == NULL) {
3741 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3742 killtcp->capture_fd,
3744 capture_tcp_handler, killtcp);
3745 tevent_fd_set_auto_close(killtcp->fde);
3747 /* We also need to set up some events to tickle all these connections
3748 until they are all reset
3750 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3751 ctdb_tickle_sentenced_connections, killtcp);
3754 /* tickle him once now */
3763 talloc_free(vnn->killtcp);
3764 vnn->killtcp = NULL;
3769 kill a TCP connection.
3771 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3773 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3775 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3779 called by a daemon to inform us of the entire list of TCP tickles for
3780 a particular public address.
3781 this control should only be sent by the node that is currently serving
3782 that public address.
/*
 * Replace the entire tickle list of one public address.
 *
 * indata carries a struct ctdb_tickle_list_old.  Its size is checked
 * first against the fixed header and then against header plus
 * list->num connections.  The vnn is looked up by list->addr; its old
 * tcp_array is freed and a new one holding a copy of the received
 * connections is attached.
 *
 * NOTE(review): a list with num == 0 appears to install an empty array
 * rather than NULL, and a failed connections allocation appears to
 * leave tcparray allocated on the vnn - confirm.
 */
3784 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3786 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3787 struct ctdb_tcp_array *tcparray;
3788 struct ctdb_vnn *vnn;
3790 /* We must at least have tickles.num or else we cant verify the size
3791 of the received data blob
3793 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3794 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3798 /* verify that the size of data matches what we expect */
3799 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3800 + sizeof(struct ctdb_connection) * list->num) {
3801 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3805 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3806 ctdb_addr_to_str(&list->addr)));
3808 vnn = find_public_ip_vnn(ctdb, &list->addr);
3810 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3811 ctdb_addr_to_str(&list->addr)));
3816 /* remove any old ticklelist we might have */
3817 talloc_free(vnn->tcp_array);
3818 vnn->tcp_array = NULL;
3820 tcparray = talloc(vnn, struct ctdb_tcp_array);
3821 CTDB_NO_MEMORY(ctdb, tcparray);
3823 tcparray->num = list->num;
3825 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3826 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3828 memcpy(tcparray->connections, &list->connections[0],
3829 sizeof(struct ctdb_connection)*tcparray->num);
3831 /* We now have a new fresh tickle list array for this vnn */
3832 vnn->tcp_array = tcparray;
3838 called to return the full list of tickles for the public address associated
3839 with the provided vnn
/*
 * Return the full tickle list of one public address.
 *
 * indata carries the ctdb_sock_addr to look up.  The reply, a struct
 * ctdb_tickle_list_old sized for num connections, is allocated as a
 * talloc child of outdata and the connections of the vnn's tcp_array
 * are copied into it.
 */
3841 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3843 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3844 struct ctdb_tickle_list_old *list;
3845 struct ctdb_tcp_array *tcparray;
3847 struct ctdb_vnn *vnn;
3849 vnn = find_public_ip_vnn(ctdb, addr);
3851 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3852 ctdb_addr_to_str(addr)));
3857 tcparray = vnn->tcp_array;
3859 num = tcparray->num;
/* reply size: fixed header followed by num connection records */
3864 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3865 + sizeof(struct ctdb_connection) * num;
3867 outdata->dptr = talloc_size(outdata, outdata->dsize);
3868 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3869 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3874 memcpy(&list->connections[0], tcparray->connections,
3875 sizeof(struct ctdb_connection) * num);
3883 set the list of all tcp tickles for a public address
3885 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3886 ctdb_sock_addr *addr,
3887 struct ctdb_tcp_array *tcparray)
3891 struct ctdb_tickle_list_old *list;
3894 num = tcparray->num;
3899 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3900 sizeof(struct ctdb_connection) * num;
3901 data.dptr = talloc_size(ctdb, data.dsize);
3902 CTDB_NO_MEMORY(ctdb, data.dptr);
3904 list = (struct ctdb_tickle_list_old *)data.dptr;
3908 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3911 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3912 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3913 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3915 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3919 talloc_free(data.dptr);
3926 perform tickle updates if required
3928 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3929 struct tevent_timer *te,
3930 struct timeval t, void *private_data)
3932 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3934 struct ctdb_vnn *vnn;
3936 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3937 /* we only send out updates for public addresses that
3940 if (ctdb->pnn != vnn->pnn) {
3943 /* We only send out the updates if we need to */
3944 if (!vnn->tcp_update_needed) {
3947 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3948 &vnn->public_address,
3951 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3952 ctdb_addr_to_str(&vnn->public_address)));
3955 ("Sent tickle update for public address %s\n",
3956 ctdb_addr_to_str(&vnn->public_address)));
3957 vnn->tcp_update_needed = false;
3961 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3962 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3963 ctdb_update_tcp_tickles, ctdb);
3967 start periodic update of tcp tickles
3969 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3971 ctdb->tickle_update_context = talloc_new(ctdb);
3973 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3974 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3975 ctdb_update_tcp_tickles, ctdb);
/*
 * State for one series of gratuitous ARPs, allocated by
 * ctdb_control_send_gratious_arp() and consumed by send_gratious_arp().
 */
3981 struct control_gratious_arp {
3982 struct ctdb_context *ctdb; /* daemon context, used to schedule the next send */
3983 ctdb_sock_addr addr; /* address passed to ctdb_sys_send_arp() */
3989 send a control_gratuitous arp
3991 static void send_gratious_arp(struct tevent_context *ev,
3992 struct tevent_timer *te,
3993 struct timeval t, void *private_data)
3996 struct control_gratious_arp *arp = talloc_get_type(private_data,
3997 struct control_gratious_arp);
3999 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4001 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4002 arp->iface, strerror(errno)));
4007 if (arp->count == CTDB_ARP_REPEAT) {
4012 tevent_add_timer(arp->ctdb->ev, arp,
4013 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4014 send_gratious_arp, arp);
4021 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4023 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4024 struct control_gratious_arp *arp;
4026 /* verify the size of indata */
4027 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4028 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4029 (unsigned)indata.dsize,
4030 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4034 ( offsetof(struct ctdb_addr_info_old, iface)
4035 + gratious_arp->len ) ){
4037 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4038 "but should be %u bytes\n",
4039 (unsigned)indata.dsize,
4040 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4045 arp = talloc(ctdb, struct control_gratious_arp);
4046 CTDB_NO_MEMORY(ctdb, arp);
4049 arp->addr = gratious_arp->addr;
4050 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4051 CTDB_NO_MEMORY(ctdb, arp->iface);
4054 tevent_add_timer(arp->ctdb->ev, arp,
4055 timeval_zero(), send_gratious_arp, arp);
4060 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4062 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4065 /* verify the size of indata */
4066 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4067 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4071 ( offsetof(struct ctdb_addr_info_old, iface)
4074 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4075 "but should be %u bytes\n",
4076 (unsigned)indata.dsize,
4077 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4081 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4083 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4086 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
 * State handed to delete_ip_callback() while an IP release started by
 * ctdb_control_del_public_address() is in progress.
 */
4093 struct delete_ip_callback_state {
4094 struct ctdb_req_control_old *c; /* the control request that is answered by the callback */
4098 called when releaseip event finishes for del_public_address
4100 static void delete_ip_callback(struct ctdb_context *ctdb,
4101 int32_t status, TDB_DATA data,
4102 const char *errormsg,
4105 struct delete_ip_callback_state *state =
4106 talloc_get_type(private_data, struct delete_ip_callback_state);
4108 /* If release failed then fail. */
4109 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4110 talloc_free(private_data);
/*
 * Delete a public address from this node.
 *
 * indata carries a struct ctdb_addr_info_old; its size is validated
 * against the fixed header plus pub->len.  The vnn list is searched
 * for the address.
 *
 * If this node currently hosts the address, the vnn is marked
 * delete_pending and a CTDB_CONTROL_RELEASE_IP control is sent with
 * delete_ip_callback() as completion handler; the request c is then
 * stolen onto the callback state and *async_reply is set, so the reply
 * is sent from the callback.  Otherwise the address is deleted
 * immediately with do_delete_ip().
 *
 * An address that is not found is logged as an error.
 */
4113 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4114 struct ctdb_req_control_old *c,
4115 TDB_DATA indata, bool *async_reply)
4117 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4118 struct ctdb_vnn *vnn;
4120 /* verify the size of indata */
4121 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4122 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4126 ( offsetof(struct ctdb_addr_info_old, iface)
4129 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4130 "but should be %u bytes\n",
4131 (unsigned)indata.dsize,
4132 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4136 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4138 /* walk over all public addresses until we find a match */
4139 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4140 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
/* hosted here: it has to be released before it can be deleted */
4141 if (vnn->pnn == ctdb->pnn) {
4142 struct delete_ip_callback_state *state;
4143 struct ctdb_public_ip *ip;
4147 vnn->delete_pending = true;
4149 state = talloc(ctdb,
4150 struct delete_ip_callback_state);
4151 CTDB_NO_MEMORY(ctdb, state);
/* the release request payload lives on state and is freed with it */
4154 ip = talloc(state, struct ctdb_public_ip);
4157 (__location__ " Out of memory\n"));
4162 ip->addr = pub->addr;
4164 data.dsize = sizeof(struct ctdb_public_ip);
4165 data.dptr = (unsigned char *)ip;
4167 ret = ctdb_daemon_send_control(ctdb,
4170 CTDB_CONTROL_RELEASE_IP,
4177 (__location__ "Unable to send "
4178 "CTDB_CONTROL_RELEASE_IP\n"));
/* keep the request alive until delete_ip_callback() replies to it */
4183 state->c = talloc_steal(state, c);
4184 *async_reply = true;
4186 /* This IP is not hosted on the
4187 * current node so just delete it
4189 do_delete_ip(ctdb, vnn);
4196 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4197 ctdb_addr_to_str(&pub->addr)));
/*
 * State passed to ctdb_ipreallocated_callback(). c is the control
 * request that is answered when the "ipreallocated" event has finished.
 */
4202 struct ipreallocated_callback_state {
4203 struct ctdb_req_control_old *c;
/*
 * Completion callback for the "ipreallocated" event script.
 *
 * p is the struct ipreallocated_callback_state registered by
 * ctdb_control_ipreallocated(). A failure of the script is logged, a
 * status of -ETIME additionally bans this node via ctdb_ban_self(),
 * and the status is sent back to the requester stored in state->c.
 *
 * NOTE(review): confirm the condition guarding the failure message and
 * whether the state is freed after the reply.
 */
4206 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4207 int status, void *p)
4209 struct ipreallocated_callback_state *state =
4210 talloc_get_type(p, struct ipreallocated_callback_state);
4214 (" \"ipreallocated\" event script failed (status %d)\n",
/* -ETIME is treated specially: the node bans itself */
4216 if (status == -ETIME) {
4217 ctdb_ban_self(ctdb);
/* the event status is passed through unchanged as the control status */
4221 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4225 /* A control to run the ipreallocated event */
/*
 * A callback state is allocated on ctdb and the CTDB_EVENT_IPREALLOCATED
 * event script is started with ctdb_ipreallocated_callback() as its
 * completion callback. On success the request c is moved onto the state
 * and *async_reply is set, so the reply is sent from the callback.
 *
 * NOTE(review): confirm the return values and the cleanup of state when
 * starting the event fails.
 */
4226 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4227 struct ctdb_req_control_old *c,
4231 struct ipreallocated_callback_state *state;
4233 state = talloc(ctdb, struct ipreallocated_callback_state);
4234 CTDB_NO_MEMORY(ctdb, state);
4236 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
/* state is passed twice: once as the first argument after ctdb and
 * once as the private data handed to the callback */
4238 ret = ctdb_event_script_callback(ctdb, state,
4239 ctdb_ipreallocated_callback, state,
4240 CTDB_EVENT_IPREALLOCATED,
4244 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4249 /* tell the control that we will reply asynchronously */
4250 state->c = talloc_steal(state, c);
4251 *async_reply = true;
4257 /* This function is called from the recovery daemon to verify that a remote
4258 node has the expected ip allocation.
4259 This is verified against ctdb->ip_tree
/*
 * Compare the public IP list reported by a node (ips) with the
 * assignments recorded in ctdb->ip_tree.
 *
 * When ctdb->ip_tree is NULL the expected allocation is not known yet
 * and no comparison is made. Otherwise every reported address is looked
 * up in the tree by ip_key():
 *  - an address without a tree entry is logged as new or unknown
 *  - entries where either side has pnn == -1 are special-cased
 *  - a pnn that differs from the tree entry is logged as an
 *    inconsistent allocation.
 *
 * NOTE(review): confirm the return value for each of these cases.
 */
4261 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4262 struct ctdb_public_ip_list_old *ips,
4265 struct public_ip_list *tmp_ip;
4268 if (ctdb->ip_tree == NULL) {
4269 /* don't know the expected allocation yet, assume remote node
4278 for (i=0; i<ips->num; i++) {
4279 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4280 if (tmp_ip == NULL) {
4281 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4285 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4289 if (tmp_ip->pnn != ips->ips[i].pnn) {
4291 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4293 ctdb_addr_to_str(&ips->ips[i].addr),
4294 ips->ips[i].pnn, tmp_ip->pnn));
/*
 * Record in ctdb->ip_tree that the address ip->addr is now assigned to
 * node ip->pnn.
 *
 * Nothing is updated when the DisableIPFailover tunable is set, when
 * the tree has not been built yet, or when the address has no entry in
 * the tree; the latter two cases are logged as errors.
 *
 * NOTE(review): confirm the return value for each of these cases.
 */
4302 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4304 struct public_ip_list *tmp_ip;
4306 /* IP tree is never built if DisableIPFailover is set */
4307 if (ctdb->tunable.disable_ip_failover != 0) {
4311 if (ctdb->ip_tree == NULL) {
4312 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4316 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4317 if (tmp_ip == NULL) {
4318 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
/* the message is logged before the assignment so that it can still
 * print the previous owner from tmp_ip->pnn */
4322 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4323 tmp_ip->pnn = ip->pnn;
/*
 * Discard the IP assignment tree by applying TALLOC_FREE to
 * ctdb->ip_tree. update_ip_assignment_tree() and
 * verify_remote_ip_allocation() check ctdb->ip_tree against NULL
 * before using it.
 */
4328 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4330 TALLOC_FREE(ctdb->ip_tree);
/*
 * Bookkeeping for one "reload public IPs" request handled by a child
 * process. The reloadips functions below also use the members status,
 * fd and child of this structure.
 */
4333 struct ctdb_reloadips_handle {
4334 struct ctdb_context *ctdb;
/* the control request that is answered from the destructor */
4335 struct ctdb_req_control_old *c;
/* fd event watching the read end of the pipe from the child */
4339 struct tevent_fd *fde;
/*
 * talloc destructor for a reload handle, installed by
 * ctdb_control_reload_public_ips().
 *
 * Clears ctdb->reload_ips if it still refers to this handle, answers
 * the pending control request with h->status and sends SIGKILL to the
 * child process.
 */
4342 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
/* only reset the pointer if it has not been replaced by a newer handle */
4344 if (h == h->ctdb->reload_ips) {
4345 h->ctdb->reload_ips = NULL;
4348 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4351 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
 * Timer callback registered by ctdb_control_reload_public_ips() with a
 * 120 second timeout. private_data is the struct ctdb_reloadips_handle
 * of the reload in progress.
 *
 * NOTE(review): presumably the handle is released here so that its
 * destructor answers the request - TODO confirm.
 */
4355 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4356 struct tevent_timer *te,
4357 struct timeval t, void *private_data)
4359 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
 * fd event callback for the read end of the pipe shared with the
 * reloadips child; private_data is the struct ctdb_reloadips_handle.
 *
 * A single result byte is read from h->fd[0]. A failed read or a
 * non-zero byte is logged as an error returned by the child.
 *
 * NOTE(review): confirm how h->status is updated and when the handle
 * is freed.
 */
4364 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4365 struct tevent_fd *fde,
4366 uint16_t flags, void *private_data)
4368 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/* the child writes exactly one byte holding its result */
4373 ret = sys_read(h->fd[0], &res, 1);
4374 if (ret < 1 || res != 0) {
4375 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
 * Body of the reloadips child process.
 *
 * The public IPs currently known to the local node are fetched with
 * ctdb_ctrl_get_public_ips(), then the public addresses file is re-read
 * with ctdb_set_public_addresses() so that ctdb->vnn reflects the file.
 * The two sets are compared in both directions:
 *  - addresses the node knows but the file no longer lists are removed
 *    with CTDB_CONTROL_DEL_PUBLIC_IP
 *  - addresses listed in the file but unknown to the node are added
 *    with CTDB_CONTROL_ADD_PUBLIC_IP.
 * All controls are sent to CTDB_CURRENT_NODE, tracked in async_data and
 * waited for at the end. Temporary allocations hang off mem_ctx.
 *
 * NOTE(review): confirm the value returned on each path.
 */
4383 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4385 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4386 struct ctdb_public_ip_list_old *ips;
4387 struct ctdb_vnn *vnn;
4388 struct client_async_data *async_data;
4389 struct timeval timeout;
4391 struct ctdb_client_control_state *state;
4395 CTDB_NO_MEMORY(ctdb, mem_ctx);
4397 /* Read IPs from local node */
4398 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4399 CTDB_CURRENT_NODE, mem_ctx, &ips);
4402 ("Unable to fetch public IPs from local node\n"));
4403 talloc_free(mem_ctx);
4407 /* Read IPs file - this is safe since this is a child process */
4409 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4410 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4411 talloc_free(mem_ctx);
/* every control sent below is added to async_data and waited for in
 * ctdb_client_async_wait() at the end of the function */
4415 async_data = talloc_zero(mem_ctx, struct client_async_data);
4416 CTDB_NO_MEMORY(ctdb, async_data);
4418 /* Compare IPs between node and file for IPs to be deleted */
4419 for (i = 0; i < ips->num; i++) {
4421 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4422 if (ctdb_same_ip(&vnn->public_address,
4423 &ips->ips[i].addr)) {
4424 /* IP is still in file */
4430 /* Delete IP ips->ips[i] */
4431 struct ctdb_addr_info_old *pub;
4434 ("IP %s no longer configured, deleting it\n",
4435 ctdb_addr_to_str(&ips->ips[i].addr)));
/* zero-initialised request carrying the address to delete */
4437 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4438 CTDB_NO_MEMORY(ctdb, pub);
4440 pub->addr = ips->ips[i].addr;
4444 timeout = TAKEOVER_TIMEOUT();
4446 data.dsize = offsetof(struct ctdb_addr_info_old,
4448 data.dptr = (uint8_t *)pub;
4450 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4451 CTDB_CONTROL_DEL_PUBLIC_IP,
4452 0, data, async_data,
4454 if (state == NULL) {
4457 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4461 ctdb_client_async_add(async_data, state);
4465 /* Compare IPs between node and file for IPs to be added */
4467 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4468 for (i = 0; i < ips->num; i++) {
4469 if (ctdb_same_ip(&vnn->public_address,
4470 &ips->ips[i].addr)) {
4471 /* IP already on node */
/* i reaches ips->num only when the inner loop found no match */
4475 if (i == ips->num) {
4476 /* Add IP ips->ips[i] */
4477 struct ctdb_addr_info_old *pub;
4478 const char *ifaces = NULL;
4483 ("New IP %s configured, adding it\n",
4484 ctdb_addr_to_str(&vnn->public_address)));
/* broadcast this node's pnn with CTDB_SRVID_REBALANCE_NODE to the
 * connected nodes; a failure to send is logged as a warning.
 * NOTE(review): confirm the condition under which this is sent */
4486 uint32_t pnn = ctdb_get_pnn(ctdb);
4488 data.dsize = sizeof(pnn);
4489 data.dptr = (uint8_t *)&pnn;
4491 ret = ctdb_client_send_message(
4493 CTDB_BROADCAST_CONNECTED,
4494 CTDB_SRVID_REBALANCE_NODE,
4497 DEBUG(DEBUG_WARNING,
4498 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/* build a comma-separated list of the interface names of this vnn */
4504 ifaces = vnn->ifaces[0];
4506 while (vnn->ifaces[iface] != NULL) {
/* NOTE(review): the string is allocated on vnn rather than mem_ctx and
 * reaches strlen() below with no NULL check in between - confirm that
 * an allocation failure cannot get there */
4507 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4508 vnn->ifaces[iface]);
/* len includes the terminating NUL; the request is the fixed part of
 * struct ctdb_addr_info_old followed by len bytes of interface names */
4512 len = strlen(ifaces) + 1;
4513 pub = talloc_zero_size(mem_ctx,
4514 offsetof(struct ctdb_addr_info_old, iface) + len);
4515 CTDB_NO_MEMORY(ctdb, pub);
4517 pub->addr = vnn->public_address;
4518 pub->mask = vnn->public_netmask_bits;
4520 memcpy(&pub->iface[0], ifaces, pub->len);
4522 timeout = TAKEOVER_TIMEOUT();
4524 data.dsize = offsetof(struct ctdb_addr_info_old,
4526 data.dptr = (uint8_t *)pub;
4528 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4529 CTDB_CONTROL_ADD_PUBLIC_IP,
4530 0, data, async_data,
4532 if (state == NULL) {
4535 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4539 ctdb_client_async_add(async_data, state);
/* wait for all add/delete controls queued above */
4543 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4544 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4548 talloc_free(mem_ctx);
4552 talloc_free(mem_ctx);
4556 /* This control is sent to force the node to re-read the public addresses file
4557 and drop any addresses we should no longer host, and add new addresses
4558 that we are now able to host
/*
 * Control handler that reloads the public addresses in a child process.
 *
 * A handle is allocated on ctdb, a pipe is created and a child is
 * forked. The child switches to client mode, runs
 * ctdb_reloadips_child() and writes its one-byte result to the pipe.
 * The parent moves the request c onto the handle, installs
 * ctdb_reloadips_destructor(), watches the read end of the pipe with
 * ctdb_reloadips_child_handler() and arms a 120 second timer. The
 * reply is sent later, so *async_reply is set to true.
 *
 * NOTE(review): confirm the value returned on each path.
 */
4560 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4562 struct ctdb_reloadips_handle *h;
/* remembered before the fork so that the child can probe the parent */
4563 pid_t parent = getpid();
/* a reload that is still in progress is dropped by freeing its handle */
4565 if (ctdb->reload_ips != NULL) {
4566 talloc_free(ctdb->reload_ips);
4567 ctdb->reload_ips = NULL;
4570 h = talloc(ctdb, struct ctdb_reloadips_handle);
4571 CTDB_NO_MEMORY(ctdb, h);
/* NOTE(review): the message below names ctdb_freeze_lock although the
 * pipe is created for reloadips - looks copy-pasted, confirm */
4576 if (pipe(h->fd) == -1) {
4577 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4582 h->child = ctdb_fork(ctdb);
4583 if (h->child == (pid_t)-1) {
4584 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
/* child process */
4592 if (h->child == 0) {
4593 signed char res = 0;
4596 debug_extra = talloc_asprintf(NULL, "reloadips:");
4598 prctl_set_comment("ctdb_reloadips");
4599 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4600 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4603 res = ctdb_reloadips_child(ctdb);
4605 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
/* report the one-byte result to the parent through the pipe */
4609 sys_write(h->fd[1], &res, 1);
4610 /* make sure we die when our parent dies */
/* signal 0 only probes the parent; the loop continues until the probe
 * fails with ESRCH */
4611 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* parent process: the request is moved onto the handle */
4617 h->c = talloc_steal(h, c);
4620 set_close_on_exec(h->fd[0]);
4622 talloc_set_destructor(h, ctdb_reloadips_destructor);
/* both the fd event and the timer are allocated on the handle h */
4625 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4626 ctdb_reloadips_child_handler, (void *)h);
4627 tevent_fd_set_auto_close(h->fde);
4629 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4630 ctdb_reloadips_timeout_event, h);
4632 /* we reply later */
4633 *async_reply = true;