4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29 #include "common/reqid.h"
30 #include "common/system.h"
33 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
35 #define CTDB_ARP_INTERVAL 1
36 #define CTDB_ARP_REPEAT 3
38 /* Flags used in IP allocation algorithms. */
42 enum ctdb_runstate runstate;
46 struct ctdb_iface *prev, *next;
52 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
55 return vnn->iface->name;
61 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
65 /* Verify that we don't have an entry for this interface yet */
66 for (i=ctdb->ifaces;i;i=i->next) {
67 if (strcmp(i->name, iface) == 0) {
72 /* create a new structure for this interface */
73 i = talloc_zero(ctdb, struct ctdb_iface);
74 CTDB_NO_MEMORY_FATAL(ctdb, i);
75 i->name = talloc_strdup(i, iface);
76 CTDB_NO_MEMORY(ctdb, i->name);
80 DLIST_ADD(ctdb->ifaces, i);
85 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
90 for (n = 0; vnn->ifaces[n] != NULL; n++) {
91 if (strcmp(name, vnn->ifaces[n]) == 0) {
99 /* If any interfaces now have no possible IPs then delete them. This
100 * implementation is naive (i.e. simple) rather than clever
101 * (i.e. complex). Given that this is run on delip and that operation
102 * is rare, this doesn't need to be efficient - it needs to be
103 * foolproof. One alternative is reference counting, where the logic
104 * is distributed and can, therefore, be broken in multiple places.
105 * Another alternative is to build a red-black tree of interfaces that
106 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
107 * once) and then walking ctdb->ifaces once and deleting those not in
108 * the tree. Let's go to one of those if the naive implementation
109 * causes problems... :-)
111 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
112 struct ctdb_vnn *vnn)
114 struct ctdb_iface *i, *next;
116 /* For each interface, check if there's an IP using it. */
117 for (i = ctdb->ifaces; i != NULL; i = next) {
122 /* Only consider interfaces named in the given VNN. */
123 if (!vnn_has_interface_with_name(vnn, i->name)) {
127 /* Is the "single IP" on this interface? */
128 if ((ctdb->single_ip_vnn != NULL) &&
129 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
130 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
131 /* Found, next interface please... */
134 /* Search for a vnn with this interface. */
136 for (tv=ctdb->vnn; tv; tv=tv->next) {
137 if (vnn_has_interface_with_name(tv, i->name)) {
144 /* None of the VNNs are using this interface. */
145 DLIST_REMOVE(ctdb->ifaces, i);
152 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
155 struct ctdb_iface *i;
157 for (i=ctdb->ifaces;i;i=i->next) {
158 if (strcmp(i->name, iface) == 0) {
166 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
167 struct ctdb_vnn *vnn)
170 struct ctdb_iface *cur = NULL;
171 struct ctdb_iface *best = NULL;
173 for (i=0; vnn->ifaces[i]; i++) {
175 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
189 if (cur->references < best->references) {
198 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
199 struct ctdb_vnn *vnn)
201 struct ctdb_iface *best = NULL;
204 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
205 "still assigned to iface '%s'\n",
206 ctdb_addr_to_str(&vnn->public_address),
207 ctdb_vnn_iface_string(vnn)));
211 best = ctdb_vnn_best_iface(ctdb, vnn);
213 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
214 "cannot assign to iface any iface\n",
215 ctdb_addr_to_str(&vnn->public_address)));
221 vnn->pnn = ctdb->pnn;
223 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
224 "now assigned to iface '%s' refs[%d]\n",
225 ctdb_addr_to_str(&vnn->public_address),
226 ctdb_vnn_iface_string(vnn),
231 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
232 struct ctdb_vnn *vnn)
234 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235 "now unassigned (old iface '%s' refs[%d])\n",
236 ctdb_addr_to_str(&vnn->public_address),
237 ctdb_vnn_iface_string(vnn),
238 vnn->iface?vnn->iface->references:0));
240 vnn->iface->references--;
243 if (vnn->pnn == ctdb->pnn) {
248 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
249 struct ctdb_vnn *vnn)
253 if (vnn->delete_pending) {
257 if (vnn->iface && vnn->iface->link_up) {
261 for (i=0; vnn->ifaces[i]; i++) {
262 struct ctdb_iface *cur;
264 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
277 struct ctdb_takeover_arp {
278 struct ctdb_context *ctdb;
281 struct ctdb_tcp_array *tcparray;
282 struct ctdb_vnn *vnn;
287 lists of tcp endpoints
289 struct ctdb_tcp_list {
290 struct ctdb_tcp_list *prev, *next;
291 struct ctdb_tcp_connection connection;
295 list of clients to kill on IP release
297 struct ctdb_client_ip {
298 struct ctdb_client_ip *prev, *next;
299 struct ctdb_context *ctdb;
306 send a gratuitous arp
308 static void ctdb_control_send_arp(struct tevent_context *ev,
309 struct tevent_timer *te,
310 struct timeval t, void *private_data)
312 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
313 struct ctdb_takeover_arp);
315 struct ctdb_tcp_array *tcparray;
316 const char *iface = ctdb_vnn_iface_string(arp->vnn);
318 ret = ctdb_sys_send_arp(&arp->addr, iface);
320 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321 iface, strerror(errno)));
324 tcparray = arp->tcparray;
326 for (i=0;i<tcparray->num;i++) {
327 struct ctdb_tcp_connection *tcon;
329 tcon = &tcparray->connections[i];
330 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
332 ctdb_addr_to_str(&tcon->src_addr),
333 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334 ret = ctdb_sys_send_tcp(
339 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340 ctdb_addr_to_str(&tcon->src_addr)));
347 if (arp->count == CTDB_ARP_REPEAT) {
352 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
353 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
354 ctdb_control_send_arp, arp);
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358 struct ctdb_vnn *vnn)
360 struct ctdb_takeover_arp *arp;
361 struct ctdb_tcp_array *tcparray;
363 if (!vnn->takeover_ctx) {
364 vnn->takeover_ctx = talloc_new(vnn);
365 if (!vnn->takeover_ctx) {
370 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376 arp->addr = vnn->public_address;
379 tcparray = vnn->tcp_array;
381 /* add all of the known tcp connections for this IP to the
382 list of tcp connections to send tickle acks for */
383 arp->tcparray = talloc_steal(arp, tcparray);
385 vnn->tcp_array = NULL;
386 vnn->tcp_update_needed = true;
389 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
390 timeval_zero(), ctdb_control_send_arp, arp);
395 struct takeover_callback_state {
396 struct ctdb_req_control *c;
397 ctdb_sock_addr *addr;
398 struct ctdb_vnn *vnn;
401 struct ctdb_do_takeip_state {
402 struct ctdb_req_control *c;
403 struct ctdb_vnn *vnn;
407 called when takeip event finishes
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
412 struct ctdb_do_takeip_state *state =
413 talloc_get_type(private_data, struct ctdb_do_takeip_state);
418 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
420 if (status == -ETIME) {
423 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424 ctdb_addr_to_str(&state->vnn->public_address),
425 ctdb_vnn_iface_string(state->vnn)));
426 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
428 node->flags |= NODE_FLAGS_UNHEALTHY;
433 if (ctdb->do_checkpublicip) {
435 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
437 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
444 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445 data.dsize = strlen((char *)data.dptr) + 1;
446 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
448 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
451 /* the control succeeded */
452 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
459 state->vnn->update_in_flight = false;
464 take over an ip address
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467 struct ctdb_req_control *c,
468 struct ctdb_vnn *vnn)
471 struct ctdb_do_takeip_state *state;
473 if (vnn->update_in_flight) {
474 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475 "update for this IP already in flight\n",
476 ctdb_addr_to_str(&vnn->public_address),
477 vnn->public_netmask_bits));
481 ret = ctdb_vnn_assign_iface(ctdb, vnn);
483 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484 "assign a usable interface\n",
485 ctdb_addr_to_str(&vnn->public_address),
486 vnn->public_netmask_bits));
490 state = talloc(vnn, struct ctdb_do_takeip_state);
491 CTDB_NO_MEMORY(ctdb, state);
493 state->c = talloc_steal(ctdb, c);
496 vnn->update_in_flight = true;
497 talloc_set_destructor(state, ctdb_takeip_destructor);
499 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500 ctdb_addr_to_str(&vnn->public_address),
501 vnn->public_netmask_bits,
502 ctdb_vnn_iface_string(vnn)));
504 ret = ctdb_event_script_callback(ctdb,
506 ctdb_do_takeip_callback,
510 ctdb_vnn_iface_string(vnn),
511 ctdb_addr_to_str(&vnn->public_address),
512 vnn->public_netmask_bits);
515 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
516 ctdb_addr_to_str(&vnn->public_address),
517 ctdb_vnn_iface_string(vnn)));
525 struct ctdb_do_updateip_state {
526 struct ctdb_req_control *c;
527 struct ctdb_iface *old;
528 struct ctdb_vnn *vnn;
532 called when updateip event finishes
534 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
537 struct ctdb_do_updateip_state *state =
538 talloc_get_type(private_data, struct ctdb_do_updateip_state);
542 if (status == -ETIME) {
545 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
546 ctdb_addr_to_str(&state->vnn->public_address),
548 ctdb_vnn_iface_string(state->vnn)));
551 * All we can do is reset the old interface
552 * and let the next run fix it
554 ctdb_vnn_unassign_iface(ctdb, state->vnn);
555 state->vnn->iface = state->old;
556 state->vnn->iface->references++;
558 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
563 if (ctdb->do_checkpublicip) {
565 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574 /* the control succeeded */
575 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
580 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 state->vnn->update_in_flight = false;
587 update (move) an ip address
589 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
590 struct ctdb_req_control *c,
591 struct ctdb_vnn *vnn)
594 struct ctdb_do_updateip_state *state;
595 struct ctdb_iface *old = vnn->iface;
596 const char *new_name;
598 if (vnn->update_in_flight) {
599 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
600 "update for this IP already in flight\n",
601 ctdb_addr_to_str(&vnn->public_address),
602 vnn->public_netmask_bits));
606 ctdb_vnn_unassign_iface(ctdb, vnn);
607 ret = ctdb_vnn_assign_iface(ctdb, vnn);
609 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
610 "assin a usable interface (old iface '%s')\n",
611 ctdb_addr_to_str(&vnn->public_address),
612 vnn->public_netmask_bits,
617 new_name = ctdb_vnn_iface_string(vnn);
618 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
619 /* A benign update from one interface onto itself.
620 * no need to run the eventscripts in this case, just return
623 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
627 state = talloc(vnn, struct ctdb_do_updateip_state);
628 CTDB_NO_MEMORY(ctdb, state);
630 state->c = talloc_steal(ctdb, c);
634 vnn->update_in_flight = true;
635 talloc_set_destructor(state, ctdb_updateip_destructor);
637 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
638 "interface %s to %s\n",
639 ctdb_addr_to_str(&vnn->public_address),
640 vnn->public_netmask_bits,
644 ret = ctdb_event_script_callback(ctdb,
646 ctdb_do_updateip_callback,
648 CTDB_EVENT_UPDATE_IP,
652 ctdb_addr_to_str(&vnn->public_address),
653 vnn->public_netmask_bits);
655 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
656 ctdb_addr_to_str(&vnn->public_address),
657 old->name, new_name));
666 Find the vnn that holds the given public ip address
667 returns NULL if the address is not known as a public address
669 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
671 struct ctdb_vnn *vnn;
673 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
674 if (ctdb_same_ip(&vnn->public_address, addr)) {
683 take over an ip address
685 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
686 struct ctdb_req_control *c,
691 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
692 struct ctdb_vnn *vnn;
693 bool have_ip = false;
694 bool do_updateip = false;
695 bool do_takeip = false;
696 struct ctdb_iface *best_iface = NULL;
698 if (pip->pnn != ctdb->pnn) {
699 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
700 "with pnn %d, but we're node %d\n",
701 ctdb_addr_to_str(&pip->addr),
702 pip->pnn, ctdb->pnn));
706 /* update our vnn list */
707 vnn = find_public_ip_vnn(ctdb, &pip->addr);
709 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
710 ctdb_addr_to_str(&pip->addr)));
714 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
715 have_ip = ctdb_sys_have_ip(&pip->addr);
717 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
718 if (best_iface == NULL) {
719 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
720 "a usable interface (old %s, have_ip %d)\n",
721 ctdb_addr_to_str(&vnn->public_address),
722 vnn->public_netmask_bits,
723 ctdb_vnn_iface_string(vnn),
728 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
729 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
734 if (vnn->iface == NULL && have_ip) {
735 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
736 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
737 ctdb_addr_to_str(&vnn->public_address)));
741 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
742 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743 "and we have it on iface[%s], but it was assigned to node %d"
744 "and we are node %d, banning ourself\n",
745 ctdb_addr_to_str(&vnn->public_address),
746 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
751 if (vnn->pnn == -1 && have_ip) {
752 vnn->pnn = ctdb->pnn;
753 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
754 "and we already have it on iface[%s], update local daemon\n",
755 ctdb_addr_to_str(&vnn->public_address),
756 ctdb_vnn_iface_string(vnn)));
761 if (vnn->iface != best_iface) {
762 if (!vnn->iface->link_up) {
764 } else if (vnn->iface->references > (best_iface->references + 1)) {
765 /* only move when the rebalance gains something */
773 ctdb_vnn_unassign_iface(ctdb, vnn);
780 ret = ctdb_do_takeip(ctdb, c, vnn);
784 } else if (do_updateip) {
785 ret = ctdb_do_updateip(ctdb, c, vnn);
791 * The interface is up and the kernel knows the ip
794 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
795 ctdb_addr_to_str(&pip->addr),
796 vnn->public_netmask_bits,
797 ctdb_vnn_iface_string(vnn)));
801 /* tell ctdb_control.c that we will be replying asynchronously */
808 kill any clients that are registered with an IP that is being released
810 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
812 struct ctdb_client_ip *ip;
814 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
815 ctdb_addr_to_str(addr)));
817 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
818 ctdb_sock_addr tmp_addr;
821 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
823 ctdb_addr_to_str(&ip->addr)));
825 if (ctdb_same_ip(&tmp_addr, addr)) {
826 struct ctdb_client *client = reqid_find(ctdb->idr,
829 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
831 ctdb_addr_to_str(&ip->addr),
834 if (client->pid != 0) {
835 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
836 (unsigned)client->pid,
837 ctdb_addr_to_str(addr),
839 kill(client->pid, SIGKILL);
845 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
847 DLIST_REMOVE(ctdb->vnn, vnn);
848 ctdb_vnn_unassign_iface(ctdb, vnn);
849 ctdb_remove_orphaned_ifaces(ctdb, vnn);
854 called when releaseip event finishes
856 static void release_ip_callback(struct ctdb_context *ctdb, int status,
859 struct takeover_callback_state *state =
860 talloc_get_type(private_data, struct takeover_callback_state);
863 if (status == -ETIME) {
867 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
868 if (ctdb_sys_have_ip(state->addr)) {
870 ("IP %s still hosted during release IP callback, failing\n",
871 ctdb_addr_to_str(state->addr)));
872 ctdb_request_control_reply(ctdb, state->c,
879 /* send a message to all clients of this node telling them
880 that the cluster has been reconfigured and they should
881 release any sockets on this IP */
882 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884 data.dsize = strlen((char *)data.dptr)+1;
886 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
888 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
890 /* kill clients that have registered with this IP */
891 release_kill_clients(ctdb, state->addr);
893 ctdb_vnn_unassign_iface(ctdb, state->vnn);
895 /* Process the IP if it has been marked for deletion */
896 if (state->vnn->delete_pending) {
897 do_delete_ip(ctdb, state->vnn);
901 /* the control succeeded */
902 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
906 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
908 if (state->vnn != NULL) {
909 state->vnn->update_in_flight = false;
915 release an ip address
917 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
918 struct ctdb_req_control *c,
923 struct takeover_callback_state *state;
924 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
925 struct ctdb_vnn *vnn;
928 /* update our vnn list */
929 vnn = find_public_ip_vnn(ctdb, &pip->addr);
931 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
932 ctdb_addr_to_str(&pip->addr)));
937 /* stop any previous arps */
938 talloc_free(vnn->takeover_ctx);
939 vnn->takeover_ctx = NULL;
941 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
942 * lazy multicast to drop an IP from any node that isn't the
943 * intended new node. The following makes ctdbd ignore
944 * a release for any address it doesn't host.
946 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
947 if (!ctdb_sys_have_ip(&pip->addr)) {
948 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
949 ctdb_addr_to_str(&pip->addr),
950 vnn->public_netmask_bits,
951 ctdb_vnn_iface_string(vnn)));
952 ctdb_vnn_unassign_iface(ctdb, vnn);
956 if (vnn->iface == NULL) {
957 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
958 ctdb_addr_to_str(&pip->addr),
959 vnn->public_netmask_bits));
964 /* There is a potential race between take_ip and us because we
965 * update the VNN via a callback that runs when the
966 * eventscripts have been run. Avoid the race by allowing one
967 * update to be in flight at a time.
969 if (vnn->update_in_flight) {
970 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
971 "update for this IP already in flight\n",
972 ctdb_addr_to_str(&vnn->public_address),
973 vnn->public_netmask_bits));
977 iface = strdup(ctdb_vnn_iface_string(vnn));
979 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
980 ctdb_addr_to_str(&pip->addr),
981 vnn->public_netmask_bits,
985 state = talloc(ctdb, struct takeover_callback_state);
987 ctdb_set_error(ctdb, "Out of memory at %s:%d",
993 state->c = talloc_steal(state, c);
994 state->addr = talloc(state, ctdb_sock_addr);
995 if (state->addr == NULL) {
996 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1002 *state->addr = pip->addr;
1005 vnn->update_in_flight = true;
1006 talloc_set_destructor(state, ctdb_releaseip_destructor);
1008 ret = ctdb_event_script_callback(ctdb,
1009 state, release_ip_callback, state,
1010 CTDB_EVENT_RELEASE_IP,
1013 ctdb_addr_to_str(&pip->addr),
1014 vnn->public_netmask_bits);
1017 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1018 ctdb_addr_to_str(&pip->addr),
1019 ctdb_vnn_iface_string(vnn)));
1024 /* tell the control that we will be replying asynchronously */
1025 *async_reply = true;
1029 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1030 ctdb_sock_addr *addr,
1031 unsigned mask, const char *ifaces,
1034 struct ctdb_vnn *vnn;
1041 tmp = strdup(ifaces);
1042 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1043 if (!ctdb_sys_check_iface_exists(iface)) {
1044 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1051 /* Verify that we don't have an entry for this ip yet */
1052 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1053 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1054 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1055 ctdb_addr_to_str(addr)));
1060 /* create a new vnn structure for this ip address */
1061 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1062 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1063 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1064 tmp = talloc_strdup(vnn, ifaces);
1065 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1066 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1067 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1068 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1069 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1070 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1074 vnn->ifaces[num] = NULL;
1075 vnn->public_address = *addr;
1076 vnn->public_netmask_bits = mask;
1078 if (check_address) {
1079 if (ctdb_sys_have_ip(addr)) {
1080 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1081 vnn->pnn = ctdb->pnn;
1085 for (i=0; vnn->ifaces[i]; i++) {
1086 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1088 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1089 "for public_address[%s]\n",
1090 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1096 DLIST_ADD(ctdb->vnn, vnn);
1102 setup the public address lists from a file
1104 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1110 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1111 if (lines == NULL) {
1112 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1115 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1119 for (i=0;i<nlines;i++) {
1121 ctdb_sock_addr addr;
1122 const char *addrstr;
1127 while ((*line == ' ') || (*line == '\t')) {
1133 if (strcmp(line, "") == 0) {
1136 tok = strtok(line, " \t");
1138 tok = strtok(NULL, " \t");
1140 if (NULL == ctdb->default_public_interface) {
1141 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1146 ifaces = ctdb->default_public_interface;
1151 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1152 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1156 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1157 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1168 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1172 struct ctdb_vnn *svnn;
1173 struct ctdb_iface *cur = NULL;
1177 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1178 CTDB_NO_MEMORY(ctdb, svnn);
1180 svnn->ifaces = talloc_array(svnn, const char *, 2);
1181 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1182 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1183 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1184 svnn->ifaces[1] = NULL;
1186 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1192 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1194 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1195 "for single_ip[%s]\n",
1197 ctdb_addr_to_str(&svnn->public_address)));
1202 /* assume the single public ip interface is initially "good" */
1203 cur = ctdb_find_iface(ctdb, iface);
1205 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1208 cur->link_up = true;
1210 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1216 ctdb->single_ip_vnn = svnn;
1220 struct ctdb_public_ip_list {
1221 struct ctdb_public_ip_list *next;
1223 ctdb_sock_addr addr;
1226 /* Given a physical node, return the number of
1227 public addresses that are currently assigned to this node.
1229 static int node_ip_coverage(struct ctdb_context *ctdb,
1231 struct ctdb_public_ip_list *ips)
1235 for (;ips;ips=ips->next) {
1236 if (ips->pnn == pnn) {
1244 /* Can the given node host the given IP: is the public IP known to the
1245 * node and is NOIPHOST unset?
1247 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1248 struct ctdb_ipflags ipflags,
1249 struct ctdb_public_ip_list *ip)
1251 struct ctdb_all_public_ips *public_ips;
1254 if (ipflags.noiphost) {
1258 public_ips = ctdb->nodes[pnn]->available_public_ips;
1260 if (public_ips == NULL) {
1264 for (i=0; i<public_ips->num; i++) {
1265 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1266 /* yes, this node can serve this public ip */
1274 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1275 struct ctdb_ipflags ipflags,
1276 struct ctdb_public_ip_list *ip)
1278 if (ipflags.noiptakeover) {
1282 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1285 /* search the node list for a node to take over this ip.
1286 pick the node that is currently serving the least number of ips
1287 so that the ips get spread out evenly.
1289 static int find_takeover_node(struct ctdb_context *ctdb,
1290 struct ctdb_ipflags *ipflags,
1291 struct ctdb_public_ip_list *ip,
1292 struct ctdb_public_ip_list *all_ips)
1294 int pnn, min=0, num;
1297 numnodes = talloc_array_length(ipflags);
1299 for (i=0; i<numnodes; i++) {
1300 /* verify that this node can serve this ip */
1301 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1302 /* no it couldn't so skip to the next node */
1306 num = node_ip_coverage(ctdb, i, all_ips);
1307 /* was this the first node we checked ? */
1319 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1320 ctdb_addr_to_str(&ip->addr)));
1330 static uint32_t *ip_key(ctdb_sock_addr *ip)
1332 static uint32_t key[IP_KEYLEN];
1334 bzero(key, sizeof(key));
1336 switch (ip->sa.sa_family) {
1338 key[3] = htonl(ip->ip.sin_addr.s_addr);
1341 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1342 key[0] = htonl(s6_a32[0]);
1343 key[1] = htonl(s6_a32[1]);
1344 key[2] = htonl(s6_a32[2]);
1345 key[3] = htonl(s6_a32[3]);
1349 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1356 static void *add_ip_callback(void *parm, void *data)
1358 struct ctdb_public_ip_list *this_ip = parm;
1359 struct ctdb_public_ip_list *prev_ip = data;
1361 if (prev_ip == NULL) {
1364 if (this_ip->pnn == -1) {
1365 this_ip->pnn = prev_ip->pnn;
1371 static int getips_count_callback(void *param, void *data)
1373 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1374 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1376 new_ip->next = *ip_list;
1381 static struct ctdb_public_ip_list *
1382 create_merged_ip_list(struct ctdb_context *ctdb)
1385 struct ctdb_public_ip_list *ip_list;
1386 struct ctdb_all_public_ips *public_ips;
1388 if (ctdb->ip_tree != NULL) {
1389 talloc_free(ctdb->ip_tree);
1390 ctdb->ip_tree = NULL;
1392 ctdb->ip_tree = trbt_create(ctdb, 0);
1394 for (i=0;i<ctdb->num_nodes;i++) {
1395 public_ips = ctdb->nodes[i]->known_public_ips;
1397 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1401 /* there were no public ips for this node */
1402 if (public_ips == NULL) {
1406 for (j=0;j<public_ips->num;j++) {
1407 struct ctdb_public_ip_list *tmp_ip;
1409 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1410 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1411 /* Do not use information about IP addresses hosted
1412 * on other nodes, it may not be accurate */
1413 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1414 tmp_ip->pnn = public_ips->ips[j].pnn;
1418 tmp_ip->addr = public_ips->ips[j].addr;
1419 tmp_ip->next = NULL;
1421 trbt_insertarray32_callback(ctdb->ip_tree,
1422 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1429 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1435 * This is the length of the longest common prefix between the IPs.
1436 * It is calculated by XOR-ing the 2 IPs together and counting the
1437 * number of leading zeroes. The implementation means that all
1438 * addresses end up being 128 bits long.
1440 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1441 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1442 * lots of nodes and IP addresses?
1444 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1446 uint32_t ip1_k[IP_KEYLEN];
1451 uint32_t distance = 0;
1453 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1455 for (i=0; i<IP_KEYLEN; i++) {
1456 x = ip1_k[i] ^ t[i];
1460 /* Count number of leading zeroes.
1461 * FIXME? This could be optimised...
1463 while ((x & (1 << 31)) == 0) {
1473 /* Calculate the IP distance for the given IP relative to IPs on the
1474 given node. The ips argument is generally the all_ips variable
1475 used in the main part of the algorithm.
1477 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1478 struct ctdb_public_ip_list *ips,
1481 struct ctdb_public_ip_list *t;
1486 for (t=ips; t != NULL; t=t->next) {
1487 if (t->pnn != pnn) {
1491 /* Optimisation: We never calculate the distance
1492 * between an address and itself. This allows us to
1493 * calculate the effect of removing an address from a
1494 * node by simply calculating the distance between
1495 * that address and all of the existing addresses.
1496 * Moreover, we assume that we're only ever dealing
1497 * with addresses from all_ips so we can identify an
1498 * address via a pointer rather than doing a more
1499 * expensive address comparison. */
1500 if (&(t->addr) == ip) {
1504 d = ip_distance(ip, &(t->addr));
1505 sum += d * d; /* Cheaper than pulling in math.h :-) */
1511 /* Return the LCP2 imbalance metric for addresses currently assigned
1514 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1516 struct ctdb_public_ip_list *t;
1518 uint32_t imbalance = 0;
1520 for (t=all_ips; t!=NULL; t=t->next) {
1521 if (t->pnn != pnn) {
1524 /* Pass the rest of the IPs rather than the whole
1527 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1533 /* Allocate any unassigned IPs just by looping through the IPs and
1534 * finding the best node for each.
1536 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1537 struct ctdb_ipflags *ipflags,
1538 struct ctdb_public_ip_list *all_ips)
1540 struct ctdb_public_ip_list *tmp_ip;
1542 /* loop over all ip's and find a physical node to cover for
1545 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1546 if (tmp_ip->pnn == -1) {
1547 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1548 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1549 ctdb_addr_to_str(&tmp_ip->addr)));
1555 /* Basic non-deterministic rebalancing algorithm.
1557 static void basic_failback(struct ctdb_context *ctdb,
1558 struct ctdb_ipflags *ipflags,
1559 struct ctdb_public_ip_list *all_ips,
1563 int maxnode, maxnum, minnode, minnum, num, retries;
1564 struct ctdb_public_ip_list *tmp_ip;
1566 numnodes = talloc_array_length(ipflags);
1573 /* for each ip address, loop over all nodes that can serve
1574 this ip and make sure that the difference between the node
1575 serving the most and the node serving the least ip's are
1578 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1579 if (tmp_ip->pnn == -1) {
1583 /* Get the highest and lowest number of IPs served by any
1584 valid node which can serve this ip.
1588 for (i=0; i<numnodes; i++) {
1589 /* only check nodes that can actually serve this ip */
1590 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1591 /* no it couldnt so skip to the next node */
1595 num = node_ip_coverage(ctdb, i, all_ips);
1596 if (maxnode == -1) {
1605 if (minnode == -1) {
1615 if (maxnode == -1) {
1616 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1617 ctdb_addr_to_str(&tmp_ip->addr)));
1622 /* if the spread between the smallest and largest coverage by
1623 a node is >=2 we steal one of the ips from the node with
1624 most coverage to even things out a bit.
1625 try to do this a limited number of times since we don't
1626 want to spend too much time balancing the ip coverage.
1628 if ( (maxnum > minnum+1)
1629 && (retries < (num_ips + 5)) ){
1630 struct ctdb_public_ip_list *tmp;
1632 /* Reassign one of maxnode's VNNs */
1633 for (tmp=all_ips;tmp;tmp=tmp->next) {
1634 if (tmp->pnn == maxnode) {
1635 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1644 static void lcp2_init(struct ctdb_context *tmp_ctx,
1645 struct ctdb_ipflags *ipflags,
1646 struct ctdb_public_ip_list *all_ips,
1647 uint32_t *force_rebalance_nodes,
1648 uint32_t **lcp2_imbalances,
1649 bool **rebalance_candidates)
1652 struct ctdb_public_ip_list *tmp_ip;
1654 numnodes = talloc_array_length(ipflags);
1656 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1657 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1658 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1659 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1661 for (i=0; i<numnodes; i++) {
1662 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1663 /* First step: assume all nodes are candidates */
1664 (*rebalance_candidates)[i] = true;
1667 /* 2nd step: if a node has IPs assigned then it must have been
1668 * healthy before, so we remove it from consideration. This
1669 * is overkill but is all we have because we don't maintain
1670 * state between takeover runs. An alternative would be to
1671 * keep state and invalidate it every time the recovery master
1674 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1675 if (tmp_ip->pnn != -1) {
1676 (*rebalance_candidates)[tmp_ip->pnn] = false;
1680 /* 3rd step: if a node is forced to re-balance then
1681 we allow failback onto the node */
1682 if (force_rebalance_nodes == NULL) {
1685 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1686 uint32_t pnn = force_rebalance_nodes[i];
1687 if (pnn >= numnodes) {
1689 (__location__ "unknown node %u\n", pnn));
1694 ("Forcing rebalancing of IPs to node %u\n", pnn));
1695 (*rebalance_candidates)[pnn] = true;
1699 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1700 * the IP/node combination that will cost the least.
1702 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1703 struct ctdb_ipflags *ipflags,
1704 struct ctdb_public_ip_list *all_ips,
1705 uint32_t *lcp2_imbalances)
1707 struct ctdb_public_ip_list *tmp_ip;
1708 int dstnode, numnodes;
1711 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1712 struct ctdb_public_ip_list *minip;
1714 bool should_loop = true;
1715 bool have_unassigned = true;
1717 numnodes = talloc_array_length(ipflags);
1719 while (have_unassigned && should_loop) {
1720 should_loop = false;
1722 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1723 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1729 /* loop over each unassigned ip. */
1730 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1731 if (tmp_ip->pnn != -1) {
1735 for (dstnode=0; dstnode<numnodes; dstnode++) {
1736 /* only check nodes that can actually takeover this ip */
1737 if (!can_node_takeover_ip(ctdb, dstnode,
1740 /* no it couldnt so skip to the next node */
1744 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1745 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1746 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1747 ctdb_addr_to_str(&(tmp_ip->addr)),
1749 dstimbl - lcp2_imbalances[dstnode]));
1752 if ((minnode == -1) || (dstdsum < mindsum)) {
1762 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1764 /* If we found one then assign it to the given node. */
1765 if (minnode != -1) {
1766 minip->pnn = minnode;
1767 lcp2_imbalances[minnode] = minimbl;
1768 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1769 ctdb_addr_to_str(&(minip->addr)),
1774 /* There might be a better way but at least this is clear. */
1775 have_unassigned = false;
1776 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1777 if (tmp_ip->pnn == -1) {
1778 have_unassigned = true;
1783 /* We know if we have any unassigned addresses so we might as
1786 if (have_unassigned) {
1787 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1788 if (tmp_ip->pnn == -1) {
1789 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1790 ctdb_addr_to_str(&tmp_ip->addr)));
1796 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1797 * to move IPs from, determines the best IP/destination node
1798 * combination to move from the source node.
1800 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1801 struct ctdb_ipflags *ipflags,
1802 struct ctdb_public_ip_list *all_ips,
1804 uint32_t *lcp2_imbalances,
1805 bool *rebalance_candidates)
1807 int dstnode, mindstnode, numnodes;
1808 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1809 uint32_t minsrcimbl, mindstimbl;
1810 struct ctdb_public_ip_list *minip;
1811 struct ctdb_public_ip_list *tmp_ip;
1813 /* Find an IP and destination node that best reduces imbalance. */
1820 numnodes = talloc_array_length(ipflags);
1822 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1823 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1824 srcnode, lcp2_imbalances[srcnode]));
1826 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1827 /* Only consider addresses on srcnode. */
1828 if (tmp_ip->pnn != srcnode) {
1832 /* What is this IP address costing the source node? */
1833 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1834 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1836 /* Consider what this IP address would cost each potential
1837 * destination node. Destination nodes are limited to
1838 * those that are newly healthy, since we don't want
1839 * to do gratuitous failover of IPs just to make minor
1840 * balance improvements.
1842 for (dstnode=0; dstnode<numnodes; dstnode++) {
1843 if (!rebalance_candidates[dstnode]) {
1847 /* only check nodes that can actually takeover this ip */
1848 if (!can_node_takeover_ip(ctdb, dstnode,
1849 ipflags[dstnode], tmp_ip)) {
1850 /* no it couldnt so skip to the next node */
1854 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1855 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1856 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1858 ctdb_addr_to_str(&(tmp_ip->addr)),
1861 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1862 (dstdsum < srcdsum) && \
1863 ((mindstnode == -1) || \
1864 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1867 minsrcimbl = srcimbl;
1868 mindstnode = dstnode;
1869 mindstimbl = dstimbl;
1873 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1875 if (mindstnode != -1) {
1876 /* We found a move that makes things better... */
1877 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1878 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1879 ctdb_addr_to_str(&(minip->addr)),
1880 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1883 lcp2_imbalances[srcnode] = minsrcimbl;
1884 lcp2_imbalances[mindstnode] = mindstimbl;
1885 minip->pnn = mindstnode;
1894 struct lcp2_imbalance_pnn {
1899 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1901 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1902 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1904 if (lipa->imbalance > lipb->imbalance) {
1906 } else if (lipa->imbalance == lipb->imbalance) {
1913 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1914 * node with the highest LCP2 imbalance, and then determines the best
1915 * IP/destination node combination to move from the source node.
1917 static void lcp2_failback(struct ctdb_context *ctdb,
1918 struct ctdb_ipflags *ipflags,
1919 struct ctdb_public_ip_list *all_ips,
1920 uint32_t *lcp2_imbalances,
1921 bool *rebalance_candidates)
1924 struct lcp2_imbalance_pnn * lips;
1927 numnodes = talloc_array_length(ipflags);
1930 /* Put the imbalances and nodes into an array, sort them and
1931 * iterate through candidates. Usually the 1st one will be
1932 * used, so this doesn't cost much...
1934 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1935 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1936 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1937 for (i=0; i<numnodes; i++) {
1938 lips[i].imbalance = lcp2_imbalances[i];
1940 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1942 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1943 lcp2_cmp_imbalance_pnn);
1946 for (i=0; i<numnodes; i++) {
1947 /* This means that all nodes had 0 or 1 addresses, so
1948 * can't be imbalanced.
1950 if (lips[i].imbalance == 0) {
1954 if (lcp2_failback_candidate(ctdb,
1959 rebalance_candidates)) {
1971 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1972 struct ctdb_ipflags *ipflags,
1973 struct ctdb_public_ip_list *all_ips)
1975 struct ctdb_public_ip_list *tmp_ip;
1977 /* verify that the assigned nodes can serve that public ip
1978 and set it to -1 if not
1980 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1981 if (tmp_ip->pnn == -1) {
1984 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1985 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1986 /* this node can not serve this ip. */
1987 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1988 ctdb_addr_to_str(&(tmp_ip->addr)),
1995 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
1996 struct ctdb_ipflags *ipflags,
1997 struct ctdb_public_ip_list *all_ips)
1999 struct ctdb_public_ip_list *tmp_ip;
2002 numnodes = talloc_array_length(ipflags);
2004 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2005 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2006 * always be allocated the same way for a specific set of
2007 * available/unavailable nodes.
2010 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2011 tmp_ip->pnn = i % numnodes;
2014 /* IP failback doesn't make sense with deterministic
2015 * IPs, since the modulo step above implicitly fails
2016 * back IPs to their "home" node.
2018 if (1 == ctdb->tunable.no_ip_failback) {
2019 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2022 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2024 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2026 /* No failback here! */
2029 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2030 struct ctdb_ipflags *ipflags,
2031 struct ctdb_public_ip_list *all_ips)
2033 /* This should be pushed down into basic_failback. */
2034 struct ctdb_public_ip_list *tmp_ip;
2036 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2040 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2042 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2044 /* If we don't want IPs to fail back then don't rebalance IPs. */
2045 if (1 == ctdb->tunable.no_ip_failback) {
2049 /* Now, try to make sure the IP addresses are evenly distributed
2052 basic_failback(ctdb, ipflags, all_ips, num_ips);
2055 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2056 struct ctdb_ipflags *ipflags,
2057 struct ctdb_public_ip_list *all_ips,
2058 uint32_t *force_rebalance_nodes)
2060 uint32_t *lcp2_imbalances;
2061 bool *rebalance_candidates;
2062 int numnodes, num_rebalance_candidates, i;
2064 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2066 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2068 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2069 &lcp2_imbalances, &rebalance_candidates);
2071 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2073 /* If we don't want IPs to fail back then don't rebalance IPs. */
2074 if (1 == ctdb->tunable.no_ip_failback) {
2078 /* It is only worth continuing if we have suitable target
2079 * nodes to transfer IPs to. This check is much cheaper than
2082 numnodes = talloc_array_length(ipflags);
2083 num_rebalance_candidates = 0;
2084 for (i=0; i<numnodes; i++) {
2085 if (rebalance_candidates[i]) {
2086 num_rebalance_candidates++;
2089 if (num_rebalance_candidates == 0) {
2093 /* Now, try to make sure the IP addresses are evenly distributed
2096 lcp2_failback(ctdb, ipflags, all_ips,
2097 lcp2_imbalances, rebalance_candidates);
2100 talloc_free(tmp_ctx);
2103 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2107 for (i=0;i<nodemap->num;i++) {
2108 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2109 /* Found one completely healthy node */
2117 /* The calculation part of the IP allocation algorithm. */
2118 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2119 struct ctdb_ipflags *ipflags,
2120 struct ctdb_public_ip_list **all_ips_p,
2121 uint32_t *force_rebalance_nodes)
2123 /* since nodes only know about those public addresses that
2124 can be served by that particular node, no single node has
2125 a full list of all public addresses that exist in the cluster.
2126 Walk over all node structures and create a merged list of
2127 all public addresses that exist in the cluster.
2129 keep the tree of ips around as ctdb->ip_tree
2131 *all_ips_p = create_merged_ip_list(ctdb);
2133 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2134 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2135 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2136 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2138 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2141 /* at this point ->pnn is the node which will own each IP
2142 or -1 if there is no node that can cover this ip
2148 struct get_tunable_callback_data {
2149 const char *tunable;
2154 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2155 int32_t res, TDB_DATA outdata,
2158 struct get_tunable_callback_data *cd =
2159 (struct get_tunable_callback_data *)callback;
2163 /* Already handled in fail callback */
2167 if (outdata.dsize != sizeof(uint32_t)) {
2168 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2169 cd->tunable, pnn, (int)sizeof(uint32_t),
2170 (int)outdata.dsize));
2175 size = talloc_array_length(cd->out);
2177 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2178 cd->tunable, pnn, size));
2183 cd->out[pnn] = *(uint32_t *)outdata.dptr;
2186 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2187 int32_t res, TDB_DATA outdata,
2190 struct get_tunable_callback_data *cd =
2191 (struct get_tunable_callback_data *)callback;
2196 ("Timed out getting tunable \"%s\" from node %d\n",
2202 DEBUG(DEBUG_WARNING,
2203 ("Tunable \"%s\" not implemented on node %d\n",
2208 ("Unexpected error getting tunable \"%s\" from node %d\n",
2214 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2215 TALLOC_CTX *tmp_ctx,
2216 struct ctdb_node_map *nodemap,
2217 const char *tunable,
2218 uint32_t default_value)
2221 struct ctdb_control_get_tunable *t;
2224 struct get_tunable_callback_data callback_data;
2227 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2228 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2229 for (i=0; i<nodemap->num; i++) {
2230 tvals[i] = default_value;
2233 callback_data.out = tvals;
2234 callback_data.tunable = tunable;
2235 callback_data.fatal = false;
2237 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2238 data.dptr = talloc_size(tmp_ctx, data.dsize);
2239 t = (struct ctdb_control_get_tunable *)data.dptr;
2240 t->length = strlen(tunable)+1;
2241 memcpy(t->name, tunable, t->length);
2242 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2243 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2244 nodes, 0, TAKEOVER_TIMEOUT(),
2246 get_tunable_callback,
2247 get_tunable_fail_callback,
2248 &callback_data) != 0) {
2249 if (callback_data.fatal) {
2255 talloc_free(data.dptr);
2260 struct get_runstate_callback_data {
2261 enum ctdb_runstate *out;
2265 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2266 int32_t res, TDB_DATA outdata,
2267 void *callback_data)
2269 struct get_runstate_callback_data *cd =
2270 (struct get_runstate_callback_data *)callback_data;
2274 /* Already handled in fail callback */
2278 if (outdata.dsize != sizeof(uint32_t)) {
2279 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2280 pnn, (int)sizeof(uint32_t),
2281 (int)outdata.dsize));
2286 size = talloc_array_length(cd->out);
2288 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2293 cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2296 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2297 int32_t res, TDB_DATA outdata,
2300 struct get_runstate_callback_data *cd =
2301 (struct get_runstate_callback_data *)callback;
2306 ("Timed out getting runstate from node %d\n", pnn));
2310 DEBUG(DEBUG_WARNING,
2311 ("Error getting runstate from node %d - assuming runstates not supported\n",
2316 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2317 TALLOC_CTX *tmp_ctx,
2318 struct ctdb_node_map *nodemap,
2319 enum ctdb_runstate default_value)
2322 enum ctdb_runstate *rs;
2323 struct get_runstate_callback_data callback_data;
2326 rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2327 CTDB_NO_MEMORY_NULL(ctdb, rs);
2328 for (i=0; i<nodemap->num; i++) {
2329 rs[i] = default_value;
2332 callback_data.out = rs;
2333 callback_data.fatal = false;
2335 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2336 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2337 nodes, 0, TAKEOVER_TIMEOUT(),
2339 get_runstate_callback,
2340 get_runstate_fail_callback,
2341 &callback_data) != 0) {
2342 if (callback_data.fatal) {
2352 /* Set internal flags for IP allocation:
2354 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2355 * Set NOIPHOST ip flag for each INACTIVE node
2356 * if all nodes are disabled:
2357 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2359 * Set NOIPHOST ip flags for disabled nodes
2361 static struct ctdb_ipflags *
2362 set_ipflags_internal(struct ctdb_context *ctdb,
2363 TALLOC_CTX *tmp_ctx,
2364 struct ctdb_node_map *nodemap,
2365 uint32_t *tval_noiptakeover,
2366 uint32_t *tval_noiphostonalldisabled,
2367 enum ctdb_runstate *runstate)
2370 struct ctdb_ipflags *ipflags;
2372 /* Clear IP flags - implicit due to talloc_zero */
2373 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2374 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2376 for (i=0;i<nodemap->num;i++) {
2377 /* Can not take IPs on node with NoIPTakeover set */
2378 if (tval_noiptakeover[i] != 0) {
2379 ipflags[i].noiptakeover = true;
2382 /* Can not host IPs on node not in RUNNING state */
2383 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2384 ipflags[i].noiphost = true;
2387 /* Can not host IPs on INACTIVE node */
2388 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2389 ipflags[i].noiphost = true;
2391 /* Remember the runstate */
2392 ipflags[i].runstate = runstate[i];
2395 if (all_nodes_are_disabled(nodemap)) {
2396 /* If all nodes are disabled, can not host IPs on node
2397 * with NoIPHostOnAllDisabled set
2399 for (i=0;i<nodemap->num;i++) {
2400 if (tval_noiphostonalldisabled[i] != 0) {
2401 ipflags[i].noiphost = true;
2405 /* If some nodes are not disabled, then can not host
2406 * IPs on DISABLED node
2408 for (i=0;i<nodemap->num;i++) {
2409 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2410 ipflags[i].noiphost = true;
2418 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2419 TALLOC_CTX *tmp_ctx,
2420 struct ctdb_node_map *nodemap)
2422 uint32_t *tval_noiptakeover;
2423 uint32_t *tval_noiphostonalldisabled;
2424 struct ctdb_ipflags *ipflags;
2425 enum ctdb_runstate *runstate;
2428 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2430 if (tval_noiptakeover == NULL) {
2434 tval_noiphostonalldisabled =
2435 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2436 "NoIPHostOnAllDisabled", 0);
2437 if (tval_noiphostonalldisabled == NULL) {
2438 /* Caller frees tmp_ctx */
2442 /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2443 * will default to CTDB_RUNSTATE_RUNNING. This ensures
2444 * reasonable behaviour on a mixed cluster during upgrade.
2446 runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2447 CTDB_RUNSTATE_RUNNING);
2448 if (runstate == NULL) {
2449 /* Caller frees tmp_ctx */
2453 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2455 tval_noiphostonalldisabled,
2458 talloc_free(tval_noiptakeover);
2459 talloc_free(tval_noiphostonalldisabled);
2460 talloc_free(runstate);
2465 struct iprealloc_callback_data {
2468 client_async_callback fail_callback;
2469 void *fail_callback_data;
2470 struct ctdb_node_map *nodemap;
2473 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2474 int32_t res, TDB_DATA outdata,
2478 struct iprealloc_callback_data *cd =
2479 (struct iprealloc_callback_data *)callback;
2481 numnodes = talloc_array_length(cd->retry_nodes);
2482 if (pnn > numnodes) {
2484 ("ipreallocated failure from node %d, "
2485 "but only %d nodes in nodemap\n",
2490 /* Can't run the "ipreallocated" event on an INACTIVE node */
2491 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2492 DEBUG(DEBUG_WARNING,
2493 ("ipreallocated failed on inactive node %d, ignoring\n",
2500 /* If the control timed out then that's a real error,
2501 * so call the real fail callback
2503 if (cd->fail_callback) {
2504 cd->fail_callback(ctdb, pnn, res, outdata,
2505 cd->fail_callback_data);
2507 DEBUG(DEBUG_WARNING,
2508 ("iprealloc timed out but no callback registered\n"));
2512 /* If not a timeout then either the ipreallocated
2513 * eventscript (or some setup) failed. This might
2514 * have failed because the IPREALLOCATED control isn't
2515 * implemented - right now there is no way of knowing
2516 * because the error codes are all folded down to -1.
2517 * Consider retrying using EVENTSCRIPT control...
2519 DEBUG(DEBUG_WARNING,
2520 ("ipreallocated failure from node %d, flagging retry\n",
2522 cd->retry_nodes[pnn] = true;
2527 struct takeover_callback_data {
2529 client_async_callback fail_callback;
2530 void *fail_callback_data;
2531 struct ctdb_node_map *nodemap;
2534 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2535 uint32_t node_pnn, int32_t res,
2536 TDB_DATA outdata, void *callback_data)
2538 struct takeover_callback_data *cd =
2539 talloc_get_type_abort(callback_data,
2540 struct takeover_callback_data);
2543 for (i = 0; i < cd->nodemap->num; i++) {
2544 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2549 if (i == cd->nodemap->num) {
2550 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2554 if (!cd->node_failed[i]) {
2555 cd->node_failed[i] = true;
2556 cd->fail_callback(ctdb, node_pnn, res, outdata,
2557 cd->fail_callback_data);
2562 make any IP alias changes for public addresses that are necessary
2564 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2565 uint32_t *force_rebalance_nodes,
2566 client_async_callback fail_callback, void *callback_data)
2569 struct ctdb_public_ip ip;
2571 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2573 struct timeval timeout;
2574 struct client_async_data *async_data;
2575 struct ctdb_client_control_state *state;
2576 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2577 struct ctdb_ipflags *ipflags;
2578 struct takeover_callback_data *takeover_data;
2579 struct iprealloc_callback_data iprealloc_data;
2584 * ip failover is completely disabled, just send out the
2585 * ipreallocated event.
2587 if (ctdb->tunable.disable_ip_failover != 0) {
2591 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2592 if (ipflags == NULL) {
2593 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2594 talloc_free(tmp_ctx);
2598 /* Short-circuit IP allocation if no nodes are in the RUNNING
2599 * runstate yet, since no nodes will be able to host IPs */
2600 can_host_ips = false;
2601 for (i=0; i<nodemap->num; i++) {
2602 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2603 can_host_ips = true;
2606 if (!can_host_ips) {
2607 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2611 /* Do the IP reassignment calculations */
2612 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2614 /* Now tell all nodes to release any public IPs they should not
2615 * host. This will be a NOOP on nodes that don't currently
2616 * hold the given IP.
2618 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2619 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2621 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2622 bool, nodemap->num);
2623 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2624 takeover_data->fail_callback = fail_callback;
2625 takeover_data->fail_callback_data = callback_data;
2626 takeover_data->nodemap = nodemap;
2628 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2629 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2631 async_data->fail_callback = takeover_run_fail_callback;
2632 async_data->callback_data = takeover_data;
2634 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2636 /* Send a RELEASE_IP to all nodes that should not be hosting
2637 * each IP. For each IP, all but one of these will be
2638 * redundant. However, the redundant ones are used to tell
2639 * nodes which node should be hosting the IP so that commands
2640 * like "ctdb ip" can display a particular node's idea of who
2641 * is hosting what. */
2642 for (i=0;i<nodemap->num;i++) {
2643 /* don't talk to unconnected nodes, but do talk to banned nodes */
2644 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2648 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2649 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2650 /* This node should be serving this
2651 vnn so dont tell it to release the ip
2655 ip.pnn = tmp_ip->pnn;
2656 ip.addr = tmp_ip->addr;
2658 timeout = TAKEOVER_TIMEOUT();
2659 data.dsize = sizeof(ip);
2660 data.dptr = (uint8_t *)&ip;
2661 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2662 0, CTDB_CONTROL_RELEASE_IP, 0,
2665 if (state == NULL) {
2666 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2667 talloc_free(tmp_ctx);
2671 ctdb_client_async_add(async_data, state);
2674 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2675 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2676 talloc_free(tmp_ctx);
2679 talloc_free(async_data);
2682 /* For each IP, send a TAKEOVER_IP to the node that should be
2683 * hosting it. Many of these will often be redundant (since
2684 * the allocation won't have changed) but they can be useful
2685 * to recover from inconsistencies. */
2686 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2687 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2689 async_data->fail_callback = fail_callback;
2690 async_data->callback_data = callback_data;
2692 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2693 if (tmp_ip->pnn == -1) {
2694 /* this IP won't be taken over */
2698 ip.pnn = tmp_ip->pnn;
2699 ip.addr = tmp_ip->addr;
2701 timeout = TAKEOVER_TIMEOUT();
2702 data.dsize = sizeof(ip);
2703 data.dptr = (uint8_t *)&ip;
2704 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2705 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2706 data, async_data, &timeout, NULL);
2707 if (state == NULL) {
2708 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2709 talloc_free(tmp_ctx);
2713 ctdb_client_async_add(async_data, state);
2715 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2716 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2717 talloc_free(tmp_ctx);
2723 * Tell all nodes to run eventscripts to process the
2724 * "ipreallocated" event. This can do a lot of things,
2725 * including restarting services to reconfigure them if public
2726 * IPs have moved. Once upon a time this event only used to
2729 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2730 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2731 iprealloc_data.retry_nodes = retry_data;
2732 iprealloc_data.retry_count = 0;
2733 iprealloc_data.fail_callback = fail_callback;
2734 iprealloc_data.fail_callback_data = callback_data;
2735 iprealloc_data.nodemap = nodemap;
2737 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2738 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2739 nodes, 0, TAKEOVER_TIMEOUT(),
2741 NULL, iprealloc_fail_callback,
2744 /* If the control failed then we should retry to any
2745 * nodes flagged by iprealloc_fail_callback using the
2746 * EVENTSCRIPT control. This is a best-effort at
2747 * backward compatibility when running a mixed cluster
2748 * where some nodes have not yet been upgraded to
2749 * support the IPREALLOCATED control.
2751 DEBUG(DEBUG_WARNING,
2752 ("Retry ipreallocated to some nodes using eventscript control\n"));
2754 nodes = talloc_array(tmp_ctx, uint32_t,
2755 iprealloc_data.retry_count);
2756 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2759 for (i=0; i<nodemap->num; i++) {
2760 if (iprealloc_data.retry_nodes[i]) {
2766 data.dptr = discard_const("ipreallocated");
2767 data.dsize = strlen((char *)data.dptr) + 1;
2768 ret = ctdb_client_async_control(ctdb,
2769 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2770 nodes, 0, TAKEOVER_TIMEOUT(),
2772 NULL, fail_callback,
2775 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2779 talloc_free(tmp_ctx);
2785 destroy a ctdb_client_ip structure
// Talloc destructor for a ctdb_client_ip record; it is installed by
// ctdb_control_tcp_client() through talloc_set_destructor().
// It emits a debug line describing the record and then unlinks the
// record from ctdb->client_ip_list.
2787 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
// The port is read through the IPv4 member of the address union.
2789 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2790 ctdb_addr_to_str(&ip->addr),
2791 ntohs(ip->addr.ip.sin_port),
// Remove the record from the list it was added to at registration time.
2794 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2799 called by a client to inform us of a TCP connection that it is managing
2800 that should be tickled with an ACK when IP takeover is done
// Control handler: a local client registers a TCP connection it manages.
// Visible steps:
//   - look up the ctdb_client for client_id via reqid_find()
//   - nothing to do when this node has no public IPs (ctdb->vnn == NULL)
//   - canonicalize the source and destination addresses in place
//   - find the vnn for the destination address and refuse registration
//     when that public IP is not held by this node (vnn->pnn != ctdb->pnn)
//   - record a ctdb_client_ip and a ctdb_tcp_list entry, both talloc
//     children of the client
//   - broadcast CTDB_CONTROL_TCP_ADD to all connected nodes
// NOTE(review): no NULL check on the result of reqid_find() is visible
// here, yet client is dereferenced below (client->pid, client->tcp_list).
// Confirm whether callers guarantee a valid client_id.
2802 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2805 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2806 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2807 struct ctdb_tcp_list *tcp;
2808 struct ctdb_tcp_connection t;
2811 struct ctdb_client_ip *ip;
2812 struct ctdb_vnn *vnn;
2813 ctdb_sock_addr addr;
2815 /* If we don't have public IPs, tickles are useless */
2816 if (ctdb->vnn == NULL) {
2820 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
// Canonicalize both endpoints: addr is used as a scratch copy of the
// input and the canonical form is written back into tcp_sock.
2822 addr = tcp_sock->src;
2823 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2824 addr = tcp_sock->dest;
2825 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
// From here on addr holds the canonical destination address.
2828 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2829 vnn = find_public_ip_vnn(ctdb, &addr);
// Diagnostics for a destination that is not a public address. For IPv4
// the message is suppressed when the destination is the loopback address.
2831 switch (addr.sa.sa_family) {
2833 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2834 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2835 ctdb_addr_to_str(&addr)));
2839 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2840 ctdb_addr_to_str(&addr)));
2843 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
// The public IP must currently be hosted by this node.
2849 if (vnn->pnn != ctdb->pnn) {
2850 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2851 ctdb_addr_to_str(&addr),
2852 client_id, client->pid));
2853 /* failing this call will tell smbd to die */
// Both records below are allocated as talloc children of the client.
2857 ip = talloc(client, struct ctdb_client_ip);
2858 CTDB_NO_MEMORY(ctdb, ip);
2862 ip->client_id = client_id;
2863 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2864 DLIST_ADD(ctdb->client_ip_list, ip);
2866 tcp = talloc(client, struct ctdb_tcp_list);
2867 CTDB_NO_MEMORY(ctdb, tcp);
2869 tcp->connection.src_addr = tcp_sock->src;
2870 tcp->connection.dst_addr = tcp_sock->dest;
2872 DLIST_ADD(client->tcp_list, tcp);
// t is the payload of the TCP_ADD broadcast; data points at this
// stack variable.
2874 t.src_addr = tcp_sock->src;
2875 t.dst_addr = tcp_sock->dest;
2877 data.dptr = (uint8_t *)&t;
2878 data.dsize = sizeof(t);
// Per-family logging so the port is read from the matching union member.
2880 switch (addr.sa.sa_family) {
2882 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2883 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
2884 ctdb_addr_to_str(&tcp_sock->src),
2885 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2888 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2889 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
2890 ctdb_addr_to_str(&tcp_sock->src),
2891 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2894 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2898 /* tell all nodes about this tcp connection */
2899 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2900 CTDB_CONTROL_TCP_ADD,
2901 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2903 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2911 find a tcp address on a list
// Linear search of a tickle array for a connection whose source and
// destination both match tcp according to ctdb_same_sockaddr().
// Returns a pointer to the matching element inside array->connections,
// so the result is only valid while that array is neither freed nor
// reallocated. A NULL array is checked before the loop.
2913 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2914 struct ctdb_tcp_connection *tcp)
2918 if (array == NULL) {
2922 for (i=0;i<array->num;i++) {
2923 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2924 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2925 return &array->connections[i];
2934 called by a daemon to inform us of a TCP connection that one of its
2935 clients is managing and that should be tickled with an ACK when IP takeover is
// Control handler: record one TCP connection (a tickle) against the
// public address given by its destination.
//   ctdb              - daemon context
//   indata            - payload, interpreted as a struct ctdb_tcp_connection
//   tcp_update_needed - when true, vnn->tcp_update_needed is set after the
//                       tickle has been stored
// Visible flow: find the vnn for p->dst_addr; if the vnn has no tickle
// array yet, allocate one and store the first entry; otherwise skip
// duplicates (ctdb_tcp_find) and append to the array after growing it.
2938 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2940 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2941 struct ctdb_tcp_array *tcparray;
2942 struct ctdb_tcp_connection tcp;
2943 struct ctdb_vnn *vnn;
2945 /* If we don't have public IPs, tickles are useless */
2946 if (ctdb->vnn == NULL) {
2950 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2952 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2953 ctdb_addr_to_str(&p->dst_addr)));
2959 tcparray = vnn->tcp_array;
2961 /* If this is the first tickle */
2962 if (tcparray == NULL) {
// The array header is a talloc child of the vnn and the element storage
// is a child of the header, so freeing the header releases both.
2963 tcparray = talloc(vnn, struct ctdb_tcp_array);
2964 CTDB_NO_MEMORY(ctdb, tcparray);
2965 vnn->tcp_array = tcparray;
2968 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2969 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2971 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2972 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2975 if (tcp_update_needed) {
2976 vnn->tcp_update_needed = true;
2982 /* Do we already have this tickle ?*/
2983 tcp.src_addr = p->src_addr;
2984 tcp.dst_addr = p->dst_addr;
2985 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2986 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2987 ctdb_addr_to_str(&tcp.dst_addr),
2988 ntohs(tcp.dst_addr.ip.sin_port),
2993 /* A new tickle, we must add it to the array */
// NOTE(review): the talloc_realloc() result is assigned straight back to
// tcparray->connections, so a failed reallocation overwrites the only
// pointer to the existing elements. Confirm that the error path taken by
// CTDB_NO_MEMORY leaves num and connections consistent.
2994 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2995 struct ctdb_tcp_connection,
2997 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2999 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3000 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3003 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3004 ctdb_addr_to_str(&tcp.dst_addr),
3005 ntohs(tcp.dst_addr.ip.sin_port),
3008 if (tcp_update_needed) {
3009 vnn->tcp_update_needed = true;
3017 called by a daemon to inform us of a TCP connection that one of its
3018 clients is managing and that should be tickled with an ACK when IP takeover is
// Remove one connection from the tickle array of the public address
// that matches conn->dst_addr.
// Visible flow: look up the vnn; log and stop when the vnn has no tickle
// array or when the connection is not in it; otherwise overwrite the
// matching slot with the last element and decrement num. When num
// reaches zero the whole array is freed and vnn->tcp_array reset to
// NULL. vnn->tcp_update_needed is then set to true.
// Element order in the array is not preserved by this removal.
// NOTE(review): the final "Removed tickle info" message logs
// conn->src_addr whereas the earlier messages in this function log
// conn->dst_addr - confirm which one is intended.
3021 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3023 struct ctdb_tcp_connection *tcpp;
3024 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3027 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3028 ctdb_addr_to_str(&conn->dst_addr)));
3032 /* if the array is empty we cant remove it
3033 and we dont need to do anything
3035 if (vnn->tcp_array == NULL) {
3036 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3037 ctdb_addr_to_str(&conn->dst_addr),
3038 ntohs(conn->dst_addr.ip.sin_port)));
3043 /* See if we know this connection
3044 if we dont know this connection then we dont need to do anything
3046 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3048 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3049 ctdb_addr_to_str(&conn->dst_addr),
3050 ntohs(conn->dst_addr.ip.sin_port)));
3055 /* We need to remove this entry from the array.
3056 Instead of allocating a new array and copying data to it
3057 we cheat and just copy the last entry in the existing array
3058 to the entry that is to be removed and just shring the
3061 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3062 vnn->tcp_array->num--;
3064 /* If we deleted the last entry we also need to remove the entire array
3066 if (vnn->tcp_array->num == 0) {
3067 talloc_free(vnn->tcp_array);
3068 vnn->tcp_array = NULL;
3071 vnn->tcp_update_needed = true;
3073 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3074 ctdb_addr_to_str(&conn->src_addr),
3075 ntohs(conn->src_addr.ip.sin_port)));
3080 called by a daemon to inform us of a TCP connection that one of its
3081 clients used are no longer needed in the tickle database
// Control handler: drop one connection from the tickle database.
// indata is interpreted as a struct ctdb_tcp_connection and handed to
// ctdb_remove_tcp_connection(), unless this node has no public IPs.
3083 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3085 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3087 /* If we don't have public IPs, tickles are useless */
3088 if (ctdb->vnn == NULL) {
3092 ctdb_remove_tcp_connection(ctdb, conn);
3099 Called when another daemon starts - causes all tickles for all
3100 public addresses we are serving to be sent to the new node on the
3101 next check. This actually causes the next scheduled call to
3102 ctdb_update_tcp_tickles() to update all nodes. This is simple and
3103 doesn't require careful error handling.
// Control handler invoked when node pnn announces that it has started.
// Logs the sender and sets tcp_update_needed on every vnn. That flag is
// the one ctdb_update_tcp_tickles() tests before sending a tickle list.
3105 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3107 struct ctdb_vnn *vnn;
3109 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3110 (unsigned long) pnn));
3112 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3113 vnn->tcp_update_needed = true;
3121 called when a client structure goes away - hook to remove
3122 elements from the tcp_list in all daemons
// Drains client->tcp_list: each entry is unlinked from the list and its
// connection is removed from the tickle array through
// ctdb_remove_tcp_connection(). The loop always takes the current list
// head, so it ends once the list is empty.
3124 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3126 while (client->tcp_list) {
3127 struct ctdb_tcp_list *tcp = client->tcp_list;
3128 DLIST_REMOVE(client->tcp_list, tcp);
3129 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
// Walks every public address (vnn) of this node and releases it.
// Visible per-vnn flow:
//   - when ctdb_sys_have_ip() reports the address is not configured on
//     this system, only the interface assignment is dropped
//   - a vnn with update_in_flight already set is reported and not
//     released again
//   - otherwise update_in_flight is set, the CTDB_EVENT_RELEASE_IP event
//     script is run with the arguments "<iface> <address> <netmask bits>",
//     clients on that address are killed via release_kill_clients(), the
//     interface is unassigned and update_in_flight is cleared
// The disable_ip_failover tunable is tested before the loop. A count of
// released addresses is logged at the end.
// The return value of ctdb_event_script_args() is not used here.
3134 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3136 struct ctdb_vnn *vnn;
3139 if (ctdb->tunable.disable_ip_failover == 1) {
3143 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3144 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3145 ctdb_vnn_unassign_iface(ctdb, vnn);
3152 /* Don't allow multiple releases at once. Some code,
3153 * particularly ctdb_tickle_sentenced_connections() is
3155 if (vnn->update_in_flight) {
3156 DEBUG(DEBUG_WARNING,
3158 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3159 ctdb_addr_to_str(&vnn->public_address),
3160 vnn->public_netmask_bits,
3161 ctdb_vnn_iface_string(vnn)));
3164 vnn->update_in_flight = true;
3166 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3167 ctdb_addr_to_str(&vnn->public_address),
3168 vnn->public_netmask_bits,
3169 ctdb_vnn_iface_string(vnn)));
3171 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3172 ctdb_vnn_iface_string(vnn),
3173 ctdb_addr_to_str(&vnn->public_address),
3174 vnn->public_netmask_bits);
3175 release_kill_clients(ctdb, &vnn->public_address);
3176 ctdb_vnn_unassign_iface(ctdb, vnn);
3177 vnn->update_in_flight = false;
3181 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3186 get list of public IPs
// Control handler: return the public IPs known to this node as a
// struct ctdb_all_public_ips blob.
//   c       - request; the CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE flag
//             restricts the result to vnns for which
//             ctdb_vnn_available() is true
//   outdata - receives the blob; the buffer is a zeroed talloc child of
//             outdata
// The buffer is sized for every vnn; dsize is then recomputed from the
// number of entries actually written (i), which can be smaller when
// filtering is on.
3188 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3189 struct ctdb_req_control *c, TDB_DATA *outdata)
3192 struct ctdb_all_public_ips *ips;
3193 struct ctdb_vnn *vnn;
3194 bool only_available = false;
3196 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3197 only_available = true;
3200 /* count how many public ip structures we have */
3202 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3206 len = offsetof(struct ctdb_all_public_ips, ips) +
3207 num*sizeof(struct ctdb_public_ip);
3208 ips = talloc_zero_size(outdata, len);
3209 CTDB_NO_MEMORY(ctdb, ips);
// Second pass: fill in one entry per (optionally filtered) vnn.
3212 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3213 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3216 ips->ips[i].pnn = vnn->pnn;
3217 ips->ips[i].addr = vnn->public_address;
// Report only the bytes covering the entries that were filled in.
3221 len = offsetof(struct ctdb_all_public_ips, ips) +
3222 i*sizeof(struct ctdb_public_ip);
3224 outdata->dsize = len;
3225 outdata->dptr = (uint8_t *)ips;
// Control handler: describe one public address and the interfaces it
// can be hosted on, as a struct ctdb_control_public_ip_info blob.
//   indata  - interpreted as a ctdb_sock_addr naming the address
//   outdata - receives the blob; the buffer is a zeroed talloc child of
//             outdata
// The address is looked up among the public vnns and, failing that,
// compared with the single_ip_vnn address. info->active_idx starts at
// 0xFFFFFFFF and is set to the index of the interface currently
// assigned to the vnn (vnn->iface) when that interface is in the list.
3231 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3232 struct ctdb_req_control *c,
3237 ctdb_sock_addr *addr;
3238 struct ctdb_control_public_ip_info *info;
3239 struct ctdb_vnn *vnn;
3241 addr = (ctdb_sock_addr *)indata.dptr;
3243 vnn = find_public_ip_vnn(ctdb, addr);
3245 /* if it is not a public ip it could be our 'single ip' */
3246 if (ctdb->single_ip_vnn) {
3247 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3248 vnn = ctdb->single_ip_vnn;
3253 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3254 "'%s'not a public address\n",
3255 ctdb_addr_to_str(addr)));
3259 /* count how many interface names this public ip lists */
3261 for (;vnn->ifaces[num];) {
3265 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3266 num*sizeof(struct ctdb_control_iface_info);
3267 info = talloc_zero_size(outdata, len);
3268 CTDB_NO_MEMORY(ctdb, info);
3270 info->ip.addr = vnn->public_address;
3271 info->ip.pnn = vnn->pnn;
3272 info->active_idx = 0xFFFFFFFF;
// vnn->ifaces is a NULL-terminated list of names; each one is resolved
// to its ctdb_iface through ctdb_find_iface().
3274 for (i=0; vnn->ifaces[i]; i++) {
3275 struct ctdb_iface *cur;
3277 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3279 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3283 if (vnn->iface == cur) {
3284 info->active_idx = i;
// Bounded copy: at most sizeof(name)-1 bytes are written, and the buffer
// was zeroed by talloc_zero_size(), so the name stays NUL-terminated.
3286 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3287 info->ifaces[i].link_state = cur->link_up;
3288 info->ifaces[i].references = cur->references;
3291 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3292 i*sizeof(struct ctdb_control_iface_info);
3294 outdata->dsize = len;
3295 outdata->dptr = (uint8_t *)info;
// Control handler: return every interface known to this node as a
// struct ctdb_control_get_ifaces blob.
//   outdata - receives the blob; the buffer is a zeroed talloc child of
//             outdata
// The list ctdb->ifaces is walked twice: once to size the buffer and
// once to fill in name, link state and reference count per interface.
3300 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3301 struct ctdb_req_control *c,
3305 struct ctdb_control_get_ifaces *ifaces;
3306 struct ctdb_iface *cur;
3308 /* count how many interfaces we have */
3310 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3314 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3315 num*sizeof(struct ctdb_control_iface_info);
3316 ifaces = talloc_zero_size(outdata, len);
3317 CTDB_NO_MEMORY(ctdb, ifaces);
3320 for (cur=ctdb->ifaces;cur;cur=cur->next) {
// Bounded copy, matching ctdb_control_get_public_ip_info(): at most
// sizeof(name)-1 bytes are written so an over-long interface name cannot
// overrun the fixed-size field. The buffer was zeroed by
// talloc_zero_size(), so the copied name stays NUL-terminated.
3321 strncpy(ifaces->ifaces[i].name, cur->name, sizeof(ifaces->ifaces[i].name)-1);
3322 ifaces->ifaces[i].link_state = cur->link_up;
3323 ifaces->ifaces[i].references = cur->references;
3327 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3328 i*sizeof(struct ctdb_control_iface_info);
3330 outdata->dsize = len;
3331 outdata->dptr = (uint8_t *)ifaces;
// Control handler: set the link state of one interface.
// indata is interpreted as a struct ctdb_control_iface_info. Visible
// validation: the byte at name[CTDB_IFACE_SIZE] must be NUL, link_state
// must be one of the values accepted by the switch, and references must
// be 0. The interface is then looked up by name and iface->link_up is
// updated when it differs from the requested state.
3336 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3337 struct ctdb_req_control *c,
3340 struct ctdb_control_iface_info *info;
3341 struct ctdb_iface *iface;
3342 bool link_up = false;
3344 info = (struct ctdb_control_iface_info *)indata.dptr;
// The name is logged with an explicit precision because it is not
// guaranteed to be terminated on this path.
3346 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3347 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3348 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3349 len, len, info->name));
3353 switch (info->link_state) {
3361 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3362 (unsigned int)info->link_state));
3366 if (info->references != 0) {
3367 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3368 (unsigned int)info->references));
3372 iface = ctdb_find_iface(ctdb, info->name);
3373 if (iface == NULL) {
// Nothing to log or change when the state is already as requested.
3377 if (link_up == iface->link_up) {
// Logged at DEBUG_ERR when the link was up before the change (so it is
// going down), and at DEBUG_NOTICE otherwise.
3381 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3382 ("iface[%s] has changed it's link status %s => %s\n",
3384 iface->link_up?"up":"down",
3385 link_up?"up":"down"));
3387 iface->link_up = link_up;
3393 structure containing the listening socket and the list of tcp connections
3394 that the ctdb daemon is to kill
// Per-vnn state for killing TCP connections; one instance hangs off
// vnn->killtcp (see ctdb_killtcp_add_connection()).
3396 struct ctdb_kill_tcp {
// Public address this state belongs to.
3397 struct ctdb_vnn *vnn;
3398 struct ctdb_context *ctdb;
// Read-event registration for the capture socket.
3400 struct tevent_fd *fde;
// Tree of struct ctdb_killtcp_con, keyed by killtcp_key().
3401 trbt_tree_t *connections;
3406 a tcp connection that is to be killed
// One connection queued for reset; stored in ctdb_kill_tcp.connections.
3408 struct ctdb_killtcp_con {
// Canonicalized endpoints of the connection.
3409 ctdb_sock_addr src_addr;
3410 ctdb_sock_addr dst_addr;
// Back pointer to the owning per-vnn state.
3412 struct ctdb_kill_tcp *killtcp;
3415 /* this function is used to create a key to represent this socketpair
3416 in the killtcp tree.
3417 this key is used to insert and lookup matching socketpairs that are
3418 to be tickled and RST
// Number of 32-bit words in a killtcp tree key. The IPv6 case below
// fills all ten words; the IPv4 case fills words 0-3 and the rest stay
// zero.
3420 #define KILLTCP_KEYLEN 10
// Builds the tree key for a (src, dst) socket pair.
// The key lives in a function-local static array that is zeroed and
// refilled on every call, so the contents are overwritten by the next
// call. Both addresses must have the same family; a mismatch or an
// unknown family is logged at DEBUG_ERR.
// NOTE(review): the IPv6 branch reads s6_addr through a uint32_t
// pointer cast - confirm alignment and aliasing are acceptable on all
// supported targets.
3421 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3423 static uint32_t key[KILLTCP_KEYLEN];
3425 bzero(key, sizeof(key));
3427 if (src->sa.sa_family != dst->sa.sa_family) {
3428 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3432 switch (src->sa.sa_family) {
// IPv4: destination address, source address, then both ports, all left
// in network byte order.
3434 key[0] = dst->ip.sin_addr.s_addr;
3435 key[1] = src->ip.sin_addr.s_addr;
3436 key[2] = dst->ip.sin_port;
3437 key[3] = src->ip.sin_port;
// IPv6: the four address words are interleaved dst/src, taken from word
// 3 down to word 0, followed by both ports.
3440 uint32_t *dst6_addr32 =
3441 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3442 uint32_t *src6_addr32 =
3443 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3444 key[0] = dst6_addr32[3];
3445 key[1] = src6_addr32[3];
3446 key[2] = dst6_addr32[2];
3447 key[3] = src6_addr32[2];
3448 key[4] = dst6_addr32[1];
3449 key[5] = src6_addr32[1];
3450 key[6] = dst6_addr32[0];
3451 key[7] = src6_addr32[0];
3452 key[8] = dst->ip6.sin6_port;
3453 key[9] = src->ip6.sin6_port;
3457 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3465 called when we get a read event on the raw socket
// tevent fd handler for the capture socket of a ctdb_kill_tcp.
// Visible flow: ignore events without TEVENT_FD_READ; parse one packet
// with ctdb_sys_read_tcp_packet(); look the (src, dst) pair up in
// killtcp->connections; for a known connection, log it and send a TCP
// packet back using the captured ack_seq and seq values.
//   private_data - the struct ctdb_kill_tcp registered with the fd event
// The return value of ctdb_sys_send_tcp() is not used here.
3467 static void capture_tcp_handler(struct tevent_context *ev,
3468 struct tevent_fd *fde,
3469 uint16_t flags, void *private_data)
3471 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3472 struct ctdb_killtcp_con *con;
3473 ctdb_sock_addr src, dst;
3474 uint32_t ack_seq, seq;
3476 if (!(flags & TEVENT_FD_READ)) {
3480 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3481 killtcp->private_data,
3483 &ack_seq, &seq) != 0) {
3484 /* probably a non-tcp ACK packet */
3488 /* check if we have this guy in our list of connections
3491 con = trbt_lookuparray32(killtcp->connections,
3492 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3494 /* no this was some other packet we can just ignore */
3498 /* This one has been tickled !
3499 now reset him and remove him from the list.
3501 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3502 ntohs(con->dst_addr.ip.sin_port),
3503 ctdb_addr_to_str(&con->src_addr),
3504 ntohs(con->src_addr.ip.sin_port)));
3506 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3511 /* when traversing the list of all tcp connections to send tickle acks to
3512 (so that we can capture the ack coming back and kill the connection
3514 this callback is called for each connection we are currently trying to kill
// Tree-traversal callback used by ctdb_tickle_sentenced_connections().
//   param - talloc context collecting connections that are given up on
//   data  - the struct ctdb_killtcp_con being visited
// A connection whose count has reached 5 is reparented onto param so the
// caller can free it once the traversal is over; other connections are
// tickled again.
3516 static int tickle_connection_traverse(void *param, void *data)
3518 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3520 /* have tried too many times, just give up */
3521 if (con->count >= 5) {
3522 /* can't delete in traverse: reparent to delete_cons */
3523 talloc_steal(param, con);
3527 /* otherwise, try tickling it again */
3530 (ctdb_sock_addr *)&con->dst_addr,
3531 (ctdb_sock_addr *)&con->src_addr,
3538 called every second until all sentenced connections have been reset
// Timer callback that re-tickles every connection still queued for
// reset on one vnn.
//   private_data - the struct ctdb_kill_tcp the timer was armed for
// Visible flow: traverse killtcp->connections with
// tickle_connection_traverse(), which moves abandoned connections onto
// the temporary delete_cons context; free that context after the
// traversal; free the whole killtcp state when the tree is empty;
// otherwise re-arm this timer one second ahead. The timer is a talloc
// child of killtcp.
3540 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3541 struct tevent_timer *te,
3542 struct timeval t, void *private_data)
3544 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3545 void *delete_cons = talloc_new(NULL);
3547 /* loop over all connections sending tickle ACKs */
3548 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3550 /* now we've finished traverse, it's safe to do deletion. */
3551 talloc_free(delete_cons);
3553 /* If there are no more connections to kill we can remove the
3554 entire killtcp structure
3556 if ( (killtcp->connections == NULL) ||
3557 (killtcp->connections->root == NULL) ) {
3558 talloc_free(killtcp);
3562 /* try tickling them again in a seconds time
3564 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3565 timeval_current_ofs(1, 0),
3566 ctdb_tickle_sentenced_connections, killtcp);
3570 destroy the killtcp structure
// Talloc destructor for a struct ctdb_kill_tcp.
// It clears the vnn->killtcp back pointer, after two visible checks:
// killtcp->vnn must still be present in the ctdb->vnn list, and that
// vnn must still point at this killtcp instance.
3572 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3574 struct ctdb_vnn *tmpvnn;
3576 /* verify that this vnn is still active */
3577 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3578 if (tmpvnn == killtcp->vnn) {
// tmpvnn is NULL here when the loop ran off the end without a match.
3583 if (tmpvnn == NULL) {
3587 if (killtcp->vnn->killtcp != killtcp) {
3591 killtcp->vnn->killtcp = NULL;
3597 /* nothing fancy here, just unconditionally replace any existing
3598 connection structure with the new one.
3600 dont even free the old one if it did exist, that one is talloc_stolen
3601 by the same node in the tree anyway and will be deleted when the new data
3604 static void *add_killtcp_callback(void *parm, void *data)
3610 add a tcp socket to the list of connections we want to RST
// Queue one TCP connection for reset.
// Visible flow:
//   - canonicalize both endpoints into the locals src and dst
//   - find the owning vnn by dst, then by src, then by comparing dst with
//     the single_ip_vnn address; fail with a log message otherwise
//   - on first use for this vnn, allocate the ctdb_kill_tcp state as a
//     talloc child of the vnn, create its connection tree and install
//     ctdb_killtcp_destructor()
//   - allocate a ctdb_killtcp_con as a child of killtcp and insert it
//     into the tree
//   - open the capture socket on the interface of the vnn if not open yet
//   - register capture_tcp_handler() on that socket and arm the
//     one-second ctdb_tickle_sentenced_connections() timer if no fd event
//     exists yet
// NOTE(review): the results of trbt_create() and tevent_add_fd() are
// used without a visible NULL check - confirm.
3612 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3616 ctdb_sock_addr src, dst;
3617 struct ctdb_kill_tcp *killtcp;
3618 struct ctdb_killtcp_con *con;
3619 struct ctdb_vnn *vnn;
3621 ctdb_canonicalize_ip(s, &src);
3622 ctdb_canonicalize_ip(d, &dst);
3624 vnn = find_public_ip_vnn(ctdb, &dst);
3626 vnn = find_public_ip_vnn(ctdb, &src);
3629 /* if it is not a public ip it could be our 'single ip' */
3630 if (ctdb->single_ip_vnn) {
3631 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3632 vnn = ctdb->single_ip_vnn;
3637 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3641 killtcp = vnn->killtcp;
3643 /* If this is the first connection to kill we must allocate
3646 if (killtcp == NULL) {
3647 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3648 CTDB_NO_MEMORY(ctdb, killtcp);
3651 killtcp->ctdb = ctdb;
3652 killtcp->capture_fd = -1;
3653 killtcp->connections = trbt_create(killtcp, 0);
3655 vnn->killtcp = killtcp;
3656 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3661 /* create a structure that describes this connection we want to
3662 RST and store it in killtcp->connections
3664 con = talloc(killtcp, struct ctdb_killtcp_con);
3665 CTDB_NO_MEMORY(ctdb, con);
3666 con->src_addr = src;
3667 con->dst_addr = dst;
3669 con->killtcp = killtcp;
// The key is built with the endpoints swapped (dst first), whereas
// capture_tcp_handler() builds its lookup key from the packet as
// (src, dst).
3672 trbt_insertarray32_callback(killtcp->connections,
3673 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3674 add_killtcp_callback, con);
3677 If we dont have a socket to listen on yet we must create it
3679 if (killtcp->capture_fd == -1) {
3680 const char *iface = ctdb_vnn_iface_string(vnn);
3681 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3682 if (killtcp->capture_fd == -1) {
3683 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3684 "socket on iface '%s' for killtcp (%s)\n",
3685 iface, strerror(errno)));
3691 if (killtcp->fde == NULL) {
3692 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3693 killtcp->capture_fd,
3695 capture_tcp_handler, killtcp);
3696 tevent_fd_set_auto_close(killtcp->fde);
3698 /* We also need to set up some events to tickle all these connections
3699 until they are all reset
3701 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3702 ctdb_tickle_sentenced_connections, killtcp);
3705 /* tickle him once now */
3714 talloc_free(vnn->killtcp);
3715 vnn->killtcp = NULL;
3720 kill a TCP connection.
// Control handler: indata is interpreted as a struct
// ctdb_tcp_connection and its two endpoints are passed to
// ctdb_killtcp_add_connection(), whose result is returned unchanged.
3722 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3724 struct ctdb_tcp_connection *killtcp = (struct ctdb_tcp_connection *)indata.dptr;
3726 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3730 called by a daemon to inform us of the entire list of TCP tickles for
3731 a particular public address.
3732 this control should only be sent by the node that is currently serving
3733 that public address.
// Control handler: replace the whole tickle list of one public address.
// indata is interpreted as a struct ctdb_control_tcp_tickle_list. Its
// size is validated in two steps: first that the fixed header up to
// tickles.connections is present, then that the blob is large enough
// for tickles.num connection entries. The old array of the vnn is freed
// and a new one (a talloc child of the vnn) is filled by copying the
// connections out of the blob.
3735 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3737 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3738 struct ctdb_tcp_array *tcparray;
3739 struct ctdb_vnn *vnn;
3741 /* We must at least have tickles.num or else we cant verify the size
3742 of the received data blob
3744 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3745 tickles.connections)) {
3746 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3750 /* verify that the size of data matches what we expect */
3751 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3752 tickles.connections)
3753 + sizeof(struct ctdb_tcp_connection)
3754 * list->tickles.num) {
3755 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3759 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3760 ctdb_addr_to_str(&list->addr)));
3762 vnn = find_public_ip_vnn(ctdb, &list->addr);
3764 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3765 ctdb_addr_to_str(&list->addr)));
3770 /* remove any old ticklelist we might have */
3771 talloc_free(vnn->tcp_array);
3772 vnn->tcp_array = NULL;
3774 tcparray = talloc(vnn, struct ctdb_tcp_array);
3775 CTDB_NO_MEMORY(ctdb, tcparray);
3777 tcparray->num = list->tickles.num;
3779 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3780 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3782 memcpy(tcparray->connections, &list->tickles.connections[0],
3783 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3785 /* We now have a new fresh tickle list array for this vnn */
3786 vnn->tcp_array = tcparray;
3792 called to return the full list of tickles for the public address associated
3793 with the provided vnn
// Control handler: return the tickle list of one public address.
//   indata  - interpreted as a ctdb_sock_addr naming the address
//   outdata - receives a struct ctdb_control_tcp_tickle_list blob sized
//             for num connections; the buffer is a talloc child of
//             outdata and is not zero-initialised (talloc_size)
// The connections are copied out of the tickle array of the vnn.
3795 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3797 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3798 struct ctdb_control_tcp_tickle_list *list;
3799 struct ctdb_tcp_array *tcparray;
3801 struct ctdb_vnn *vnn;
3803 vnn = find_public_ip_vnn(ctdb, addr);
3805 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3806 ctdb_addr_to_str(addr)));
3811 tcparray = vnn->tcp_array;
3813 num = tcparray->num;
3818 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3819 tickles.connections)
3820 + sizeof(struct ctdb_tcp_connection) * num;
3822 outdata->dptr = talloc_size(outdata, outdata->dsize);
3823 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3824 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3827 list->tickles.num = num;
3829 memcpy(&list->tickles.connections[0], tcparray->connections,
3830 sizeof(struct ctdb_tcp_connection) * num);
3838 set the list of all tcp tickles for a public address
// Broadcast the tickle list of one public address to all nodes.
//   addr     - the public address the list belongs to
//   tcparray - tickle array to send
// A struct ctdb_control_tcp_tickle_list blob is built in a temporary
// buffer (a talloc child of ctdb), sent with
// CTDB_CONTROL_SET_TCP_TICKLE_LIST to CTDB_BROADCAST_ALL using
// CTDB_CTRL_FLAG_NOREPLY, and the buffer is freed afterwards.
// NOTE(review): confirm the buffer is also released when
// ctdb_daemon_send_control() reports failure.
3840 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3841 ctdb_sock_addr *addr,
3842 struct ctdb_tcp_array *tcparray)
3846 struct ctdb_control_tcp_tickle_list *list;
3849 num = tcparray->num;
3854 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3855 tickles.connections) +
3856 sizeof(struct ctdb_tcp_connection) * num;
3857 data.dptr = talloc_size(ctdb, data.dsize);
3858 CTDB_NO_MEMORY(ctdb, data.dptr);
3860 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3862 list->tickles.num = num;
3864 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3867 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3868 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3869 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3871 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3875 talloc_free(data.dptr);
3882 perform tickle updates if required
// Periodic timer callback that pushes pending tickle-list updates.
//   private_data - the struct ctdb_context the timer was armed with
// For every vnn hosted by this node (vnn->pnn == ctdb->pnn) whose
// tcp_update_needed flag is set, the tickle list is sent through
// ctdb_send_set_tcp_tickles_for_ip(); the flag is cleared after a
// successful send. The timer then re-arms itself on
// ctdb->tickle_update_context using the tickle_update_interval tunable.
3884 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3885 struct tevent_timer *te,
3886 struct timeval t, void *private_data)
3888 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3890 struct ctdb_vnn *vnn;
3892 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3893 /* we only send out updates for public addresses that
3896 if (ctdb->pnn != vnn->pnn) {
3899 /* We only send out the updates if we need to */
3900 if (!vnn->tcp_update_needed) {
3903 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3904 &vnn->public_address,
3907 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3908 ctdb_addr_to_str(&vnn->public_address)));
3911 ("Sent tickle update for public address %s\n",
3912 ctdb_addr_to_str(&vnn->public_address)));
3913 vnn->tcp_update_needed = false;
3917 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3918 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3919 ctdb_update_tcp_tickles, ctdb);
3923 start periodic update of tcp tickles
// Starts the periodic tickle update: creates
// ctdb->tickle_update_context as a talloc child of ctdb and arms the
// first ctdb_update_tcp_tickles() timer on it, tickle_update_interval
// seconds ahead. The timer is owned by that context.
3925 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3927 ctdb->tickle_update_context = talloc_new(ctdb);
3929 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3930 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3931 ctdb_update_tcp_tickles, ctdb);
// State for one series of gratuitous ARPs; allocated by
// ctdb_control_send_gratious_arp() and passed to send_gratious_arp().
3937 struct control_gratious_arp {
3938 struct ctdb_context *ctdb;
// Address announced by the ARP packets.
3939 ctdb_sock_addr addr;
3945 send a control_gratuitous arp
// Timer callback that sends one gratuitous ARP for arp->addr on
// arp->iface through ctdb_sys_send_arp(); a failure is logged with
// strerror(errno). arp->count is compared with CTDB_ARP_REPEAT, and the
// timer is re-armed CTDB_ARP_INTERVAL seconds ahead as a talloc child
// of arp.
//   private_data - the struct control_gratious_arp for this series
3947 static void send_gratious_arp(struct tevent_context *ev,
3948 struct tevent_timer *te,
3949 struct timeval t, void *private_data)
3952 struct control_gratious_arp *arp = talloc_get_type(private_data,
3953 struct control_gratious_arp);
3955 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3957 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3958 arp->iface, strerror(errno)));
3963 if (arp->count == CTDB_ARP_REPEAT) {
3968 tevent_add_timer(arp->ctdb->ev, arp,
3969 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3970 send_gratious_arp, arp);
// Control handler: start sending gratuitous ARPs for an address.
// indata is interpreted as a struct ctdb_control_gratious_arp. Its size
// is validated against the fixed header and against header plus
// gratious_arp->len. A struct control_gratious_arp is then allocated as
// a talloc child of ctdb, filled in with the address and a copy of the
// interface name, and a zero-delay timer is armed to run
// send_gratious_arp().
// NOTE(review): talloc_strdup() reads gratious_arp->iface as a C string;
// confirm the sender always includes the terminating NUL within len.
3977 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3979 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3980 struct control_gratious_arp *arp;
3982 /* verify the size of indata */
3983 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3984 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3985 (unsigned)indata.dsize,
3986 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3990 ( offsetof(struct ctdb_control_gratious_arp, iface)
3991 + gratious_arp->len ) ){
3993 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3994 "but should be %u bytes\n",
3995 (unsigned)indata.dsize,
3996 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4001 arp = talloc(ctdb, struct control_gratious_arp);
4002 CTDB_NO_MEMORY(ctdb, arp);
4005 arp->addr = gratious_arp->addr;
// The interface name is duplicated as a talloc child of arp.
4006 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4007 CTDB_NO_MEMORY(ctdb, arp->iface);
4010 tevent_add_timer(arp->ctdb->ev, arp,
4011 timeval_zero(), send_gratious_arp, arp);
// Control handler: add a public address at run time.
// indata is interpreted as a struct ctdb_control_ip_iface. Its size is
// validated against the fixed header and against header plus pub->len.
// The address, mask and interface name are then passed to
// ctdb_add_public_address(); a failure of that call is logged.
4016 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4018 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4021 /* verify the size of indata */
4022 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4023 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4027 ( offsetof(struct ctdb_control_ip_iface, iface)
4030 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4031 "but should be %u bytes\n",
4032 (unsigned)indata.dsize,
4033 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4037 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4039 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4042 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
// State carried from ctdb_control_del_public_address() to
// delete_ip_callback().
4049 struct delete_ip_callback_state {
// The original control request, answered once the release has finished.
4050 struct ctdb_req_control *c;
4054 called when releaseip event finishes for del_public_address
// Completion callback for the CTDB_CONTROL_RELEASE_IP sent by
// ctdb_control_del_public_address().
// It answers the stored control request with the status and error
// message of the release, then frees the callback state.
4056 static void delete_ip_callback(struct ctdb_context *ctdb,
4057 int32_t status, TDB_DATA data,
4058 const char *errormsg,
4061 struct delete_ip_callback_state *state =
4062 talloc_get_type(private_data, struct delete_ip_callback_state);
4064 /* If release failed then fail. */
4065 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4066 talloc_free(private_data);
// Control handler: delete a public address at run time.
//   c           - the control request; kept (talloc_steal) for the
//                 deferred reply when the release is asynchronous
//   indata      - interpreted as a struct ctdb_control_ip_iface
//   async_reply - set to true when the reply will be sent later
// After validating the size of indata, the vnn list is searched for the
// address. When the address is hosted by this node the vnn is marked
// delete_pending and a CTDB_CONTROL_RELEASE_IP is sent, with state
// allocated here as a talloc child of ctdb. Otherwise the address is
// deleted immediately with do_delete_ip(). An unknown address is logged.
4069 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4070 struct ctdb_req_control *c,
4071 TDB_DATA indata, bool *async_reply)
4073 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4074 struct ctdb_vnn *vnn;
4076 /* verify the size of indata */
4077 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4078 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4082 ( offsetof(struct ctdb_control_ip_iface, iface)
4085 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4086 "but should be %u bytes\n",
4087 (unsigned)indata.dsize,
4088 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4092 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4094 /* walk over all public addresses until we find a match */
4095 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4096 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4097 if (vnn->pnn == ctdb->pnn) {
4098 struct delete_ip_callback_state *state;
4099 struct ctdb_public_ip *ip;
// NOTE(review): delete_pending is set before the allocations below;
// confirm it is meant to stay set when one of them fails.
4103 vnn->delete_pending = true;
4105 state = talloc(ctdb,
4106 struct delete_ip_callback_state);
4107 CTDB_NO_MEMORY(ctdb, state);
// The RELEASE_IP payload is a talloc child of state.
4110 ip = talloc(state, struct ctdb_public_ip);
4113 (__location__ " Out of memory\n"));
4118 ip->addr = pub->addr;
4120 data.dsize = sizeof(struct ctdb_public_ip);
4121 data.dptr = (unsigned char *)ip;
4123 ret = ctdb_daemon_send_control(ctdb,
4126 CTDB_CONTROL_RELEASE_IP,
4133 (__location__ "Unable to send "
4134 "CTDB_CONTROL_RELEASE_IP\n"));
// Keep the request alive under state until the deferred reply is sent.
4139 state->c = talloc_steal(state, c);
4140 *async_reply = true;
4142 /* This IP is not hosted on the
4143 * current node so just delete it
4145 do_delete_ip(ctdb, vnn);
4152 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4153 ctdb_addr_to_str(&pub->addr)));
/*
 * State passed to ctdb_ipreallocated_callback(): remembers the control
 * request that is answered when the "ipreallocated" event has finished.
 */
4158 struct ipreallocated_callback_state {
/* pending control request; the callback replies to it */
4159 struct ctdb_req_control *c;
/*
 * Completion callback for the "ipreallocated" event script.
 *
 * p is a struct ipreallocated_callback_state.  A failure is logged
 * together with the status, and a status of -ETIME makes the node ban
 * itself via ctdb_ban_self().  The status is then sent as the reply to
 * the control request stored in state->c.
 *
 * NOTE(review): the condition guarding the failure log and anything that
 * follows the reply are not visible in this view.
 */
4162 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4163 int status, void *p)
4165 struct ipreallocated_callback_state *state =
4166 talloc_get_type(p, struct ipreallocated_callback_state);
4170 (" \"ipreallocated\" event script failed (status %d)\n",
/* -ETIME is treated more severely than other failures: ban this node */
4172 if (status == -ETIME) {
4173 ctdb_ban_self(ctdb);
/* report the script status back to whoever sent the control */
4177 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4181 /* A control to run the ipreallocated event */
/*
 * Control handler: start the "ipreallocated" event.
 *
 * Allocates an ipreallocated_callback_state on ctdb and starts
 * CTDB_EVENT_IPREALLOCATED through ctdb_event_script_callback() with
 * ctdb_ipreallocated_callback() as completion callback.  When the event
 * could be started, the request c is moved under the state and
 * *async_reply is set to true, so the reply is sent from the callback.
 *
 * NOTE(review): the remaining parameters, the error-path cleanup and the
 * return statements are not visible in this view.
 */
4182 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4183 struct ctdb_req_control *c,
4187 struct ipreallocated_callback_state *state;
4189 state = talloc(ctdb, struct ipreallocated_callback_state);
4190 CTDB_NO_MEMORY(ctdb, state);
4192 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
/* state is used both as talloc context and as callback argument */
4194 ret = ctdb_event_script_callback(ctdb, state,
4195 ctdb_ipreallocated_callback, state,
4196 CTDB_EVENT_IPREALLOCATED,
4200 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4205 /* tell the control that we will reply asynchronously */
4206 state->c = talloc_steal(state, c);
4207 *async_reply = true;
4213 /* This function is called from the recovery daemon to verify that a remote
4214 node has the expected ip allocation.
4215 This is verified against ctdb->ip_tree
/*
 * Compare the public IP list reported by a node against ctdb->ip_tree.
 *
 * ctdb - context holding the expected allocation in ctdb->ip_tree
 * ips  - list of public IPs, with their holder, as reported by the node
 *
 * Each entry of ips is looked up in ctdb->ip_tree by ip_key() of its
 * address.  An address that is not in the tree is logged as "new or
 * unknown".  Entries where either the tree or the reported pnn is -1 are
 * special-cased (the action taken is not visible in this view).  A
 * reported pnn that differs from the one in the tree is logged as an
 * inconsistent allocation.
 *
 * NOTE(review): a third parameter (pnn, used in the log messages) and all
 * return statements are not visible in this view - confirm the return
 * convention against the complete source.
 */
4217 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4218 struct ctdb_all_public_ips *ips,
4221 struct ctdb_public_ip_list *tmp_ip;
/* without a tree there is nothing to compare against */
4224 if (ctdb->ip_tree == NULL) {
4225 /* dont know the expected allocation yet, assume remote node
4234 for (i=0; i<ips->num; i++) {
4235 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4236 if (tmp_ip == NULL) {
4237 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4241 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4245 if (tmp_ip->pnn != ips->ips[i].pnn) {
4247 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4249 ctdb_addr_to_str(&ips->ips[i].addr),
4250 ips->ips[i].pnn, tmp_ip->pnn));
/*
 * Record a new holder for one public IP in ctdb->ip_tree.
 *
 * ctdb - context owning the tree
 * ip   - address (ip->addr) and the node that now holds it (ip->pnn)
 *
 * Nothing is updated when the disable_ip_failover tunable is non-zero.
 * A missing tree or a missing record for the address is logged as an
 * error.  Otherwise the change is logged and the pnn stored in the tree
 * entry is overwritten with ip->pnn.
 *
 * NOTE(review): the return statements are not visible in this view.
 */
4258 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4260 struct ctdb_public_ip_list *tmp_ip;
4262 /* IP tree is never built if DisableIPFailover is set */
4263 if (ctdb->tunable.disable_ip_failover != 0) {
4267 if (ctdb->ip_tree == NULL) {
4268 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
/* the tree is keyed by ip_key() of the address */
4272 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4273 if (tmp_ip == NULL) {
4274 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
/* log old and new holder before overwriting the old one */
4278 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4279 tmp_ip->pnn = ip->pnn;
/*
 * Discard the IP assignment tree held in ctdb->ip_tree.
 */
4284 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
/* TALLOC_FREE releases the tree and resets the pointer to NULL */
4286 TALLOC_FREE(ctdb->ip_tree);
/*
 * Bookkeeping for one "reload public IPs" run that is executed in a
 * child process.
 *
 * NOTE(review): further members used elsewhere in this file (status, fd,
 * child) are not visible in this view.
 */
4289 struct ctdb_reloadips_handle {
/* owning daemon context */
4290 struct ctdb_context *ctdb;
/* control request that is answered when the handle is destroyed */
4291 struct ctdb_req_control *c;
/* fd event watching the read end of the pipe to the child */
4295 struct tevent_fd *fde;
/*
 * talloc destructor for a ctdb_reloadips_handle.
 *
 * Clears ctdb->reload_ips if it still refers to this handle, sends the
 * reply for the stored control request using h->status, and sends
 * SIGKILL to the child process.
 *
 * NOTE(review): the condition guarding the reply and the return
 * statement are not visible in this view.
 */
4298 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
/* do not leave a dangling pointer behind in the context */
4300 if (h == h->ctdb->reload_ips) {
4301 h->ctdb->reload_ips = NULL;
4304 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
/* the child is no longer needed once the handle goes away */
4307 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
 * Timer callback armed by ctdb_control_reload_public_ips() for a reload
 * that takes too long.
 *
 * private_data is the ctdb_reloadips_handle of the run.
 *
 * NOTE(review): what is done with the handle is not visible in this
 * view; presumably it is freed so that its destructor replies and kills
 * the child - confirm.
 */
4311 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4312 struct tevent_timer *te,
4313 struct timeval t, void *private_data)
4315 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
 * fd event callback: the reloadips child has written its result.
 *
 * private_data is the ctdb_reloadips_handle of the run.  One byte is read
 * from the read end of the pipe.  A failed or empty read, or a non-zero
 * result byte, is logged as an error from the child.
 *
 * NOTE(review): how the result is stored and how the handle is released
 * afterwards is not visible in this view.
 */
4320 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4321 struct tevent_fd *fde,
4322 uint16_t flags, void *private_data)
4324 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/* the child reports a single status byte, 0 meaning success */
4329 ret = sys_read(h->fd[0], &res, 1);
4330 if (ret < 1 || res != 0) {
4331 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
 * Body of the reloadips child process: bring the public IPs known to the
 * local daemon in line with the public addresses file.
 *
 * Steps:
 *  1. Fetch the public IPs the local node currently knows about.
 *  2. Re-read the public addresses file into ctdb->vnn.
 *  3. For every IP known to the node but absent from ctdb->vnn, send
 *     CTDB_CONTROL_DEL_PUBLIC_IP to the local node.
 *  4. For every entry in ctdb->vnn the node does not know yet, send
 *     CTDB_CONTROL_ADD_PUBLIC_IP to the local node, with the entry's
 *     interfaces joined into one comma-separated string.  A
 *     CTDB_SRVID_REBALANCE_NODE message carrying this node's pnn is
 *     broadcast to the connected nodes in this branch.
 *  5. Wait for all controls sent in steps 3 and 4 to complete.
 *
 * All temporary allocations are made on mem_ctx, which is freed on the
 * visible exit paths.
 *
 * NOTE(review): return statements, closing braces, several declarations
 * and the condition guarding the rebalance message are not visible in
 * this view.
 */
4339 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4341 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4342 struct ctdb_all_public_ips *ips;
4343 struct ctdb_vnn *vnn;
4344 struct client_async_data *async_data;
4345 struct timeval timeout;
4347 struct ctdb_client_control_state *state;
4351 CTDB_NO_MEMORY(ctdb, mem_ctx);
4353 /* Read IPs from local node */
4354 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4355 CTDB_CURRENT_NODE, mem_ctx, &ips);
4358 ("Unable to fetch public IPs from local node\n"));
4359 talloc_free(mem_ctx);
4363 /* Read IPs file - this is safe since this is a child process */
4365 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4366 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4367 talloc_free(mem_ctx);
/* collects every control sent below so they can be awaited together */
4371 async_data = talloc_zero(mem_ctx, struct client_async_data);
4372 CTDB_NO_MEMORY(ctdb, async_data);
4374 /* Compare IPs between node and file for IPs to be deleted */
4375 for (i = 0; i < ips->num; i++) {
4377 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4378 if (ctdb_same_ip(&vnn->public_address,
4379 &ips->ips[i].addr)) {
4380 /* IP is still in file */
4386 /* Delete IP ips->ips[i] */
4387 struct ctdb_control_ip_iface *pub;
4390 ("IP %s no longer configured, deleting it\n",
4391 ctdb_addr_to_str(&ips->ips[i].addr)));
/* zeroed request: only the address is filled in for a delete */
4393 pub = talloc_zero(mem_ctx,
4394 struct ctdb_control_ip_iface);
4395 CTDB_NO_MEMORY(ctdb, pub);
4397 pub->addr = ips->ips[i].addr;
4401 timeout = TAKEOVER_TIMEOUT();
4403 data.dsize = offsetof(struct ctdb_control_ip_iface,
4405 data.dptr = (uint8_t *)pub;
4407 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4408 CTDB_CONTROL_DEL_PUBLIC_IP,
4409 0, data, async_data,
4411 if (state == NULL) {
4414 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4418 ctdb_client_async_add(async_data, state);
4422 /* Compare IPs between node and file for IPs to be added */
4424 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4425 for (i = 0; i < ips->num; i++) {
4426 if (ctdb_same_ip(&vnn->public_address,
4427 &ips->ips[i].addr)) {
4428 /* IP already on node */
/* the inner loop ran to the end, so no node IP matched this entry */
4432 if (i == ips->num) {
4433 /* Add IP vnn->public_address: it is not in the node's list */
4434 struct ctdb_control_ip_iface *pub;
4435 const char *ifaces = NULL;
4440 ("New IP %s configured, adding it\n",
4441 ctdb_addr_to_str(&vnn->public_address)));
/* the rebalance message carries the pnn of this node */
4443 uint32_t pnn = ctdb_get_pnn(ctdb);
4445 data.dsize = sizeof(pnn);
4446 data.dptr = (uint8_t *)&pnn;
4448 ret = ctdb_client_send_message(
4450 CTDB_BROADCAST_CONNECTED,
4451 CTDB_SRVID_REBALANCE_NODE,
/* a failed rebalance request is only warned about, not treated as fatal */
4454 DEBUG(DEBUG_WARNING,
4455 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/* build "if0,if1,..." from the NULL-terminated vnn->ifaces array */
4461 ifaces = vnn->ifaces[0];
4463 while (vnn->ifaces[iface] != NULL) {
/* NOTE(review): the joined string is allocated on vnn, not mem_ctx, and
 * no NULL check of the talloc_asprintf() result is visible before it is
 * passed to strlen() below - confirm against the complete source. */
4464 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4465 vnn->ifaces[iface]);
/* len includes the terminating NUL of the interface string */
4469 len = strlen(ifaces) + 1;
4470 pub = talloc_zero_size(mem_ctx,
4471 offsetof(struct ctdb_control_ip_iface, iface) + len);
4472 CTDB_NO_MEMORY(ctdb, pub);
4474 pub->addr = vnn->public_address;
4475 pub->mask = vnn->public_netmask_bits;
/* presumably pub->len was set to len on a line not visible here */
4477 memcpy(&pub->iface[0], ifaces, pub->len);
4479 timeout = TAKEOVER_TIMEOUT();
4481 data.dsize = offsetof(struct ctdb_control_ip_iface,
4483 data.dptr = (uint8_t *)pub;
4485 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4486 CTDB_CONTROL_ADD_PUBLIC_IP,
4487 0, data, async_data,
4489 if (state == NULL) {
4492 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4496 ctdb_client_async_add(async_data, state);
/* block until every add/delete control queued above has been answered */
4500 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4501 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4505 talloc_free(mem_ctx);
4509 talloc_free(mem_ctx);
4513 /* This control is sent to force the node to re-read the public addresses file
4514 and drop any addresses we should no longer host, and add new addresses
4515 that we are now able to host
/*
 * Control handler: reload the public addresses file in a child process.
 *
 * ctdb        - daemon context
 * c           - control request; answered later, when the handle is freed
 * async_reply - set to true once the child has been started
 *
 * Any handle still recorded in ctdb->reload_ips is freed first.  A new
 * ctdb_reloadips_handle is allocated, a pipe is created and a child is
 * forked.
 *
 * Child: switches to client mode, runs ctdb_reloadips_child(), writes a
 * one-byte result to the write end of the pipe and then keeps polling
 * for the parent with signal 0.
 *
 * Parent: moves c under the handle, installs
 * ctdb_reloadips_destructor(), watches the read end of the pipe with
 * ctdb_reloadips_child_handler() and arms a 120 second timer that fires
 * ctdb_reloadips_timeout_event().
 *
 * NOTE(review): handle initialisation, pipe-end closing, error-path
 * cleanup and return statements are not visible in this view.
 */
4517 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4519 struct ctdb_reloadips_handle *h;
/* remembered so the child can detect the death of its parent */
4520 pid_t parent = getpid();
/* only one reload at a time: drop a previous run that is still recorded */
4522 if (ctdb->reload_ips != NULL) {
4523 talloc_free(ctdb->reload_ips);
4524 ctdb->reload_ips = NULL;
4527 h = talloc(ctdb, struct ctdb_reloadips_handle);
4528 CTDB_NO_MEMORY(ctdb, h);
/* NOTE(review): the message below names ctdb_freeze_lock although this
 * pipe belongs to reloadips - looks like a copy-paste leftover. */
4533 if (pipe(h->fd) == -1) {
4534 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4539 h->child = ctdb_fork(ctdb);
4540 if (h->child == (pid_t)-1) {
4541 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
/* a fork result of 0 means this is the child process */
4549 if (h->child == 0) {
4550 signed char res = 0;
4553 debug_extra = talloc_asprintf(NULL, "reloadips:");
4555 ctdb_set_process_name("ctdb_reloadips");
4556 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4557 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4560 res = ctdb_reloadips_child(ctdb);
4562 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
/* report the one-byte result to the parent through the pipe */
4566 sys_write(h->fd[1], &res, 1);
4567 /* make sure we die when our parent dies */
4568 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* parent: the request now lives under the handle, which answers it */
4574 h->c = talloc_steal(h, c);
4577 set_close_on_exec(h->fd[0]);
4579 talloc_set_destructor(h, ctdb_reloadips_destructor);
/* NOTE(review): no NULL check of h->fde is visible before it is used */
4582 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4583 ctdb_reloadips_child_handler, (void *)h);
4584 tevent_fd_set_auto_close(h->fde);
/* the timer is a talloc child of h, so it goes away together with h */
4586 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4587 ctdb_reloadips_timeout_event, h);
4589 /* we reply later */
4590 *async_reply = true;