4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT 3
36 /* Flags used in IP allocation algorithms. */
40 enum ctdb_runstate runstate;
44 struct ctdb_iface *prev, *next;
50 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
53 return vnn->iface->name;
59 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
63 /* Verify that we don't have an entry for this interface yet */
64 for (i=ctdb->ifaces;i;i=i->next) {
65 if (strcmp(i->name, iface) == 0) {
70 /* create a new structure for this interface */
71 i = talloc_zero(ctdb, struct ctdb_iface);
72 CTDB_NO_MEMORY_FATAL(ctdb, i);
73 i->name = talloc_strdup(i, iface);
74 CTDB_NO_MEMORY(ctdb, i->name);
76 * If link_up defaults to true then IPs can be allocated to a
77 * node during the first recovery. However, then an interface
78 * could have its link marked down during the startup event,
79 * causing the IP to move almost immediately. If link_up
80 * defaults to false then, during normal operation, IPs added
81 * to a new interface can't be assigned until a monitor cycle
82 * has occurred and marked the new interfaces up. This makes
83 * IP allocation unpredictable. The following is a neat
84 * compromise: early in startup link_up defaults to false, so
85 * IPs can't be assigned, and after startup IPs can be
86 * assigned immediately.
88 i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
90 DLIST_ADD(ctdb->ifaces, i);
95 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
100 for (n = 0; vnn->ifaces[n] != NULL; n++) {
101 if (strcmp(name, vnn->ifaces[n]) == 0) {
109 /* If any interfaces now have no possible IPs then delete them. This
110 * implementation is naive (i.e. simple) rather than clever
111 * (i.e. complex). Given that this is run on delip and that operation
112 * is rare, this doesn't need to be efficient - it needs to be
113 * foolproof. One alternative is reference counting, where the logic
114 * is distributed and can, therefore, be broken in multiple places.
115 * Another alternative is to build a red-black tree of interfaces that
116 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
117 * once) and then walking ctdb->ifaces once and deleting those not in
118 * the tree. Let's go to one of those if the naive implementation
119 * causes problems... :-)
121 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
122 struct ctdb_vnn *vnn)
124 struct ctdb_iface *i, *next;
126 /* For each interface, check if there's an IP using it. */
127 for (i = ctdb->ifaces; i != NULL; i = next) {
132 /* Only consider interfaces named in the given VNN. */
133 if (!vnn_has_interface_with_name(vnn, i->name)) {
137 /* Is the "single IP" on this interface? */
138 if ((ctdb->single_ip_vnn != NULL) &&
139 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
140 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
141 /* Found, next interface please... */
144 /* Search for a vnn with this interface. */
146 for (tv=ctdb->vnn; tv; tv=tv->next) {
147 if (vnn_has_interface_with_name(tv, i->name)) {
154 /* None of the VNNs are using this interface. */
155 DLIST_REMOVE(ctdb->ifaces, i);
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
165 struct ctdb_iface *i;
167 for (i=ctdb->ifaces;i;i=i->next) {
168 if (strcmp(i->name, iface) == 0) {
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177 struct ctdb_vnn *vnn)
180 struct ctdb_iface *cur = NULL;
181 struct ctdb_iface *best = NULL;
183 for (i=0; vnn->ifaces[i]; i++) {
185 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
199 if (cur->references < best->references) {
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209 struct ctdb_vnn *vnn)
211 struct ctdb_iface *best = NULL;
214 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215 "still assigned to iface '%s'\n",
216 ctdb_addr_to_str(&vnn->public_address),
217 ctdb_vnn_iface_string(vnn)));
221 best = ctdb_vnn_best_iface(ctdb, vnn);
223 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224 "cannot assign to iface any iface\n",
225 ctdb_addr_to_str(&vnn->public_address)));
231 vnn->pnn = ctdb->pnn;
233 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234 "now assigned to iface '%s' refs[%d]\n",
235 ctdb_addr_to_str(&vnn->public_address),
236 ctdb_vnn_iface_string(vnn),
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242 struct ctdb_vnn *vnn)
244 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245 "now unassigned (old iface '%s' refs[%d])\n",
246 ctdb_addr_to_str(&vnn->public_address),
247 ctdb_vnn_iface_string(vnn),
248 vnn->iface?vnn->iface->references:0));
250 vnn->iface->references--;
253 if (vnn->pnn == ctdb->pnn) {
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259 struct ctdb_vnn *vnn)
263 if (vnn->delete_pending) {
267 if (vnn->iface && vnn->iface->link_up) {
271 for (i=0; vnn->ifaces[i]; i++) {
272 struct ctdb_iface *cur;
274 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
287 struct ctdb_takeover_arp {
288 struct ctdb_context *ctdb;
291 struct ctdb_tcp_array *tcparray;
292 struct ctdb_vnn *vnn;
297 lists of tcp endpoints
299 struct ctdb_tcp_list {
300 struct ctdb_tcp_list *prev, *next;
301 struct ctdb_tcp_connection connection;
305 list of clients to kill on IP release
307 struct ctdb_client_ip {
308 struct ctdb_client_ip *prev, *next;
309 struct ctdb_context *ctdb;
316 send a gratuitous arp
318 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
319 struct timeval t, void *private_data)
321 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
322 struct ctdb_takeover_arp);
324 struct ctdb_tcp_array *tcparray;
325 const char *iface = ctdb_vnn_iface_string(arp->vnn);
327 ret = ctdb_sys_send_arp(&arp->addr, iface);
329 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
330 iface, strerror(errno)));
333 tcparray = arp->tcparray;
335 for (i=0;i<tcparray->num;i++) {
336 struct ctdb_tcp_connection *tcon;
338 tcon = &tcparray->connections[i];
339 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
340 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
341 ctdb_addr_to_str(&tcon->src_addr),
342 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
343 ret = ctdb_sys_send_tcp(
348 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
349 ctdb_addr_to_str(&tcon->src_addr)));
356 if (arp->count == CTDB_ARP_REPEAT) {
361 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
362 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
363 ctdb_control_send_arp, arp);
366 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
367 struct ctdb_vnn *vnn)
369 struct ctdb_takeover_arp *arp;
370 struct ctdb_tcp_array *tcparray;
372 if (!vnn->takeover_ctx) {
373 vnn->takeover_ctx = talloc_new(vnn);
374 if (!vnn->takeover_ctx) {
379 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
385 arp->addr = vnn->public_address;
388 tcparray = vnn->tcp_array;
390 /* add all of the known tcp connections for this IP to the
391 list of tcp connections to send tickle acks for */
392 arp->tcparray = talloc_steal(arp, tcparray);
394 vnn->tcp_array = NULL;
395 vnn->tcp_update_needed = true;
398 event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
399 timeval_zero(), ctdb_control_send_arp, arp);
404 struct takeover_callback_state {
405 struct ctdb_req_control *c;
406 ctdb_sock_addr *addr;
407 struct ctdb_vnn *vnn;
410 struct ctdb_do_takeip_state {
411 struct ctdb_req_control *c;
412 struct ctdb_vnn *vnn;
416 called when takeip event finishes
418 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
421 struct ctdb_do_takeip_state *state =
422 talloc_get_type(private_data, struct ctdb_do_takeip_state);
427 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
429 if (status == -ETIME) {
432 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
433 ctdb_addr_to_str(&state->vnn->public_address),
434 ctdb_vnn_iface_string(state->vnn)));
435 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
437 node->flags |= NODE_FLAGS_UNHEALTHY;
442 if (ctdb->do_checkpublicip) {
444 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
446 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
453 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
454 data.dsize = strlen((char *)data.dptr) + 1;
455 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
457 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
460 /* the control succeeded */
461 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
466 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
468 state->vnn->update_in_flight = false;
473 take over an ip address
475 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
476 struct ctdb_req_control *c,
477 struct ctdb_vnn *vnn)
480 struct ctdb_do_takeip_state *state;
482 if (vnn->update_in_flight) {
483 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
484 "update for this IP already in flight\n",
485 ctdb_addr_to_str(&vnn->public_address),
486 vnn->public_netmask_bits));
490 ret = ctdb_vnn_assign_iface(ctdb, vnn);
492 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
493 "assign a usable interface\n",
494 ctdb_addr_to_str(&vnn->public_address),
495 vnn->public_netmask_bits));
499 state = talloc(vnn, struct ctdb_do_takeip_state);
500 CTDB_NO_MEMORY(ctdb, state);
502 state->c = talloc_steal(ctdb, c);
505 vnn->update_in_flight = true;
506 talloc_set_destructor(state, ctdb_takeip_destructor);
508 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits,
511 ctdb_vnn_iface_string(vnn)));
513 ret = ctdb_event_script_callback(ctdb,
515 ctdb_do_takeip_callback,
519 ctdb_vnn_iface_string(vnn),
520 ctdb_addr_to_str(&vnn->public_address),
521 vnn->public_netmask_bits);
524 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
525 ctdb_addr_to_str(&vnn->public_address),
526 ctdb_vnn_iface_string(vnn)));
534 struct ctdb_do_updateip_state {
535 struct ctdb_req_control *c;
536 struct ctdb_iface *old;
537 struct ctdb_vnn *vnn;
541 called when updateip event finishes
543 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
546 struct ctdb_do_updateip_state *state =
547 talloc_get_type(private_data, struct ctdb_do_updateip_state);
551 if (status == -ETIME) {
554 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
555 ctdb_addr_to_str(&state->vnn->public_address),
557 ctdb_vnn_iface_string(state->vnn)));
560 * All we can do is reset the old interface
561 * and let the next run fix it
563 ctdb_vnn_unassign_iface(ctdb, state->vnn);
564 state->vnn->iface = state->old;
565 state->vnn->iface->references++;
567 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
572 if (ctdb->do_checkpublicip) {
574 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
576 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
583 /* the control succeeded */
584 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
589 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
591 state->vnn->update_in_flight = false;
596 update (move) an ip address
598 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
599 struct ctdb_req_control *c,
600 struct ctdb_vnn *vnn)
603 struct ctdb_do_updateip_state *state;
604 struct ctdb_iface *old = vnn->iface;
605 const char *new_name;
607 if (vnn->update_in_flight) {
608 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
609 "update for this IP already in flight\n",
610 ctdb_addr_to_str(&vnn->public_address),
611 vnn->public_netmask_bits));
615 ctdb_vnn_unassign_iface(ctdb, vnn);
616 ret = ctdb_vnn_assign_iface(ctdb, vnn);
618 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
619 "assin a usable interface (old iface '%s')\n",
620 ctdb_addr_to_str(&vnn->public_address),
621 vnn->public_netmask_bits,
626 new_name = ctdb_vnn_iface_string(vnn);
627 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
628 /* A benign update from one interface onto itself.
629 * no need to run the eventscripts in this case, just return
632 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
636 state = talloc(vnn, struct ctdb_do_updateip_state);
637 CTDB_NO_MEMORY(ctdb, state);
639 state->c = talloc_steal(ctdb, c);
643 vnn->update_in_flight = true;
644 talloc_set_destructor(state, ctdb_updateip_destructor);
646 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
647 "interface %s to %s\n",
648 ctdb_addr_to_str(&vnn->public_address),
649 vnn->public_netmask_bits,
653 ret = ctdb_event_script_callback(ctdb,
655 ctdb_do_updateip_callback,
657 CTDB_EVENT_UPDATE_IP,
661 ctdb_addr_to_str(&vnn->public_address),
662 vnn->public_netmask_bits);
664 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
665 ctdb_addr_to_str(&vnn->public_address),
666 old->name, new_name));
675 Find the vnn that holds the given public ip address
676 returns NULL if the address is not known as a public address
678 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
680 struct ctdb_vnn *vnn;
682 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
683 if (ctdb_same_ip(&vnn->public_address, addr)) {
692 take over an ip address
694 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
695 struct ctdb_req_control *c,
700 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
701 struct ctdb_vnn *vnn;
702 bool have_ip = false;
703 bool do_updateip = false;
704 bool do_takeip = false;
705 struct ctdb_iface *best_iface = NULL;
707 if (pip->pnn != ctdb->pnn) {
708 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
709 "with pnn %d, but we're node %d\n",
710 ctdb_addr_to_str(&pip->addr),
711 pip->pnn, ctdb->pnn));
715 /* update our vnn list */
716 vnn = find_public_ip_vnn(ctdb, &pip->addr);
718 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
719 ctdb_addr_to_str(&pip->addr)));
723 if (ctdb->do_checkpublicip) {
724 have_ip = ctdb_sys_have_ip(&pip->addr);
726 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
727 if (best_iface == NULL) {
728 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
729 "a usable interface (old %s, have_ip %d)\n",
730 ctdb_addr_to_str(&vnn->public_address),
731 vnn->public_netmask_bits,
732 ctdb_vnn_iface_string(vnn),
737 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
738 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
743 if (vnn->iface == NULL && have_ip) {
744 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
746 ctdb_addr_to_str(&vnn->public_address)));
750 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
751 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752 "and we have it on iface[%s], but it was assigned to node %d"
753 "and we are node %d, banning ourself\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
760 if (vnn->pnn == -1 && have_ip) {
761 vnn->pnn = ctdb->pnn;
762 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
763 "and we already have it on iface[%s], update local daemon\n",
764 ctdb_addr_to_str(&vnn->public_address),
765 ctdb_vnn_iface_string(vnn)));
770 if (vnn->iface != best_iface) {
771 if (!vnn->iface->link_up) {
773 } else if (vnn->iface->references > (best_iface->references + 1)) {
774 /* only move when the rebalance gains something */
782 ctdb_vnn_unassign_iface(ctdb, vnn);
789 ret = ctdb_do_takeip(ctdb, c, vnn);
793 } else if (do_updateip) {
794 ret = ctdb_do_updateip(ctdb, c, vnn);
800 * The interface is up and the kernel knows the ip
803 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
804 ctdb_addr_to_str(&pip->addr),
805 vnn->public_netmask_bits,
806 ctdb_vnn_iface_string(vnn)));
810 /* tell ctdb_control.c that we will be replying asynchronously */
817 kill any clients that are registered with an IP that is being released
819 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
821 struct ctdb_client_ip *ip;
823 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
824 ctdb_addr_to_str(addr)));
826 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
827 ctdb_sock_addr tmp_addr;
830 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
832 ctdb_addr_to_str(&ip->addr)));
834 if (ctdb_same_ip(&tmp_addr, addr)) {
835 struct ctdb_client *client = ctdb_reqid_find(ctdb,
838 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
840 ctdb_addr_to_str(&ip->addr),
843 if (client->pid != 0) {
844 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
845 (unsigned)client->pid,
846 ctdb_addr_to_str(addr),
848 kill(client->pid, SIGKILL);
854 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
856 DLIST_REMOVE(ctdb->vnn, vnn);
857 ctdb_vnn_unassign_iface(ctdb, vnn);
858 ctdb_remove_orphaned_ifaces(ctdb, vnn);
863 called when releaseip event finishes
865 static void release_ip_callback(struct ctdb_context *ctdb, int status,
868 struct takeover_callback_state *state =
869 talloc_get_type(private_data, struct takeover_callback_state);
872 if (status == -ETIME) {
876 if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
877 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
878 ctdb_addr_to_str(state->addr)));
879 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
884 /* send a message to all clients of this node telling them
885 that the cluster has been reconfigured and they should
886 release any sockets on this IP */
887 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
888 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
889 data.dsize = strlen((char *)data.dptr)+1;
891 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
893 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
895 /* kill clients that have registered with this IP */
896 release_kill_clients(ctdb, state->addr);
898 ctdb_vnn_unassign_iface(ctdb, state->vnn);
900 /* Process the IP if it has been marked for deletion */
901 if (state->vnn->delete_pending) {
902 do_delete_ip(ctdb, state->vnn);
906 /* the control succeeded */
907 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
911 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
913 if (state->vnn != NULL) {
914 state->vnn->update_in_flight = false;
920 release an ip address
922 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
923 struct ctdb_req_control *c,
928 struct takeover_callback_state *state;
929 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
930 struct ctdb_vnn *vnn;
933 /* update our vnn list */
934 vnn = find_public_ip_vnn(ctdb, &pip->addr);
936 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
937 ctdb_addr_to_str(&pip->addr)));
942 /* stop any previous arps */
943 talloc_free(vnn->takeover_ctx);
944 vnn->takeover_ctx = NULL;
946 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
947 * lazy multicast to drop an IP from any node that isn't the
948 * intended new node. The following makes ctdbd ignore
949 * a release for any address it doesn't host.
951 if (ctdb->do_checkpublicip) {
952 if (!ctdb_sys_have_ip(&pip->addr)) {
953 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
954 ctdb_addr_to_str(&pip->addr),
955 vnn->public_netmask_bits,
956 ctdb_vnn_iface_string(vnn)));
957 ctdb_vnn_unassign_iface(ctdb, vnn);
961 if (vnn->iface == NULL) {
962 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
963 ctdb_addr_to_str(&pip->addr),
964 vnn->public_netmask_bits));
969 /* There is a potential race between take_ip and us because we
970 * update the VNN via a callback that runs when the
971 * eventscripts have been run. Avoid the race by allowing only one
972 * update to be in flight at a time.
974 if (vnn->update_in_flight) {
975 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
976 "update for this IP already in flight\n",
977 ctdb_addr_to_str(&vnn->public_address),
978 vnn->public_netmask_bits));
982 iface = strdup(ctdb_vnn_iface_string(vnn));
984 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
985 ctdb_addr_to_str(&pip->addr),
986 vnn->public_netmask_bits,
990 state = talloc(ctdb, struct takeover_callback_state);
992 ctdb_set_error(ctdb, "Out of memory at %s:%d",
998 state->c = talloc_steal(state, c);
999 state->addr = talloc(state, ctdb_sock_addr);
1000 if (state->addr == NULL) {
1001 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1002 __FILE__, __LINE__);
1007 *state->addr = pip->addr;
1010 vnn->update_in_flight = true;
1011 talloc_set_destructor(state, ctdb_releaseip_destructor);
1013 ret = ctdb_event_script_callback(ctdb,
1014 state, release_ip_callback, state,
1015 CTDB_EVENT_RELEASE_IP,
1018 ctdb_addr_to_str(&pip->addr),
1019 vnn->public_netmask_bits);
1022 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1023 ctdb_addr_to_str(&pip->addr),
1024 ctdb_vnn_iface_string(vnn)));
1029 /* tell the control that we will be replying asynchronously */
1030 *async_reply = true;
1034 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1035 ctdb_sock_addr *addr,
1036 unsigned mask, const char *ifaces,
1039 struct ctdb_vnn *vnn;
1046 tmp = strdup(ifaces);
1047 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1048 if (!ctdb_sys_check_iface_exists(iface)) {
1049 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1056 /* Verify that we don't have an entry for this ip yet */
1057 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1058 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1059 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1060 ctdb_addr_to_str(addr)));
1065 /* create a new vnn structure for this ip address */
1066 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1067 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1068 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1069 tmp = talloc_strdup(vnn, ifaces);
1070 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1071 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1072 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1073 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1074 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1075 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1079 vnn->ifaces[num] = NULL;
1080 vnn->public_address = *addr;
1081 vnn->public_netmask_bits = mask;
1083 if (check_address) {
1084 if (ctdb_sys_have_ip(addr)) {
1085 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1086 vnn->pnn = ctdb->pnn;
1090 for (i=0; vnn->ifaces[i]; i++) {
1091 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1093 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1094 "for public_address[%s]\n",
1095 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1101 DLIST_ADD(ctdb->vnn, vnn);
1107 setup the public address lists from a file
1109 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1115 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1116 if (lines == NULL) {
1117 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1120 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1124 for (i=0;i<nlines;i++) {
1126 ctdb_sock_addr addr;
1127 const char *addrstr;
1132 while ((*line == ' ') || (*line == '\t')) {
1138 if (strcmp(line, "") == 0) {
1141 tok = strtok(line, " \t");
1143 tok = strtok(NULL, " \t");
1145 if (NULL == ctdb->default_public_interface) {
1146 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1151 ifaces = ctdb->default_public_interface;
1156 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1157 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1161 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1162 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1173 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1177 struct ctdb_vnn *svnn;
1178 struct ctdb_iface *cur = NULL;
1182 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1183 CTDB_NO_MEMORY(ctdb, svnn);
1185 svnn->ifaces = talloc_array(svnn, const char *, 2);
1186 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1187 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1188 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1189 svnn->ifaces[1] = NULL;
1191 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1197 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1199 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1200 "for single_ip[%s]\n",
1202 ctdb_addr_to_str(&svnn->public_address)));
1207 /* assume the single public ip interface is initially "good" */
1208 cur = ctdb_find_iface(ctdb, iface);
1210 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1213 cur->link_up = true;
1215 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1221 ctdb->single_ip_vnn = svnn;
1225 struct ctdb_public_ip_list {
1226 struct ctdb_public_ip_list *next;
1228 ctdb_sock_addr addr;
1231 /* Given a physical node, return the number of
1232 public addresses that are currently assigned to this node.
1234 static int node_ip_coverage(struct ctdb_context *ctdb,
1236 struct ctdb_public_ip_list *ips)
1240 for (;ips;ips=ips->next) {
1241 if (ips->pnn == pnn) {
1249 /* Can the given node host the given IP: is the public IP known to the
1250 * node and is NOIPHOST unset?
1252 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1253 struct ctdb_ipflags ipflags,
1254 struct ctdb_public_ip_list *ip)
1256 struct ctdb_all_public_ips *public_ips;
1259 if (ipflags.noiphost) {
1263 public_ips = ctdb->nodes[pnn]->available_public_ips;
1265 if (public_ips == NULL) {
1269 for (i=0; i<public_ips->num; i++) {
1270 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1271 /* yes, this node can serve this public ip */
1279 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1280 struct ctdb_ipflags ipflags,
1281 struct ctdb_public_ip_list *ip)
1283 if (ipflags.noiptakeover) {
1287 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1290 /* search the node list for a node to take over this ip.
1291 pick the node that is currently serving the fewest ips
1292 so that the ips get spread out evenly.
1294 static int find_takeover_node(struct ctdb_context *ctdb,
1295 struct ctdb_ipflags *ipflags,
1296 struct ctdb_public_ip_list *ip,
1297 struct ctdb_public_ip_list *all_ips)
1299 int pnn, min=0, num;
1302 numnodes = talloc_array_length(ipflags);
1304 for (i=0; i<numnodes; i++) {
1305 /* verify that this node can serve this ip */
1306 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1307 /* no it couldn't so skip to the next node */
1311 num = node_ip_coverage(ctdb, i, all_ips);
1312 /* was this the first node we checked ? */
1324 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1325 ctdb_addr_to_str(&ip->addr)));
1335 static uint32_t *ip_key(ctdb_sock_addr *ip)
1337 static uint32_t key[IP_KEYLEN];
1339 bzero(key, sizeof(key));
1341 switch (ip->sa.sa_family) {
1343 key[3] = htonl(ip->ip.sin_addr.s_addr);
1346 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1347 key[0] = htonl(s6_a32[0]);
1348 key[1] = htonl(s6_a32[1]);
1349 key[2] = htonl(s6_a32[2]);
1350 key[3] = htonl(s6_a32[3]);
1354 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1361 static void *add_ip_callback(void *parm, void *data)
1363 struct ctdb_public_ip_list *this_ip = parm;
1364 struct ctdb_public_ip_list *prev_ip = data;
1366 if (prev_ip == NULL) {
1369 if (this_ip->pnn == -1) {
1370 this_ip->pnn = prev_ip->pnn;
1376 static int getips_count_callback(void *param, void *data)
1378 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1379 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1381 new_ip->next = *ip_list;
1386 static struct ctdb_public_ip_list *
1387 create_merged_ip_list(struct ctdb_context *ctdb)
1390 struct ctdb_public_ip_list *ip_list;
1391 struct ctdb_all_public_ips *public_ips;
1393 if (ctdb->ip_tree != NULL) {
1394 talloc_free(ctdb->ip_tree);
1395 ctdb->ip_tree = NULL;
1397 ctdb->ip_tree = trbt_create(ctdb, 0);
1399 for (i=0;i<ctdb->num_nodes;i++) {
1400 public_ips = ctdb->nodes[i]->known_public_ips;
1402 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1406 /* there were no public ips for this node */
1407 if (public_ips == NULL) {
1411 for (j=0;j<public_ips->num;j++) {
1412 struct ctdb_public_ip_list *tmp_ip;
1414 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1415 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1416 /* Do not use information about IP addresses hosted
1417 * on other nodes, it may not be accurate */
1418 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1419 tmp_ip->pnn = public_ips->ips[j].pnn;
1423 tmp_ip->addr = public_ips->ips[j].addr;
1424 tmp_ip->next = NULL;
1426 trbt_insertarray32_callback(ctdb->ip_tree,
1427 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1434 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1440 * This is the length of the longest common prefix between the IPs.
1441 * It is calculated by XOR-ing the 2 IPs together and counting the
1442 * number of leading zeroes. The implementation means that all
1443 * addresses end up being 128 bits long.
1445 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1446 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1447 * lots of nodes and IP addresses?
1449 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1451 uint32_t ip1_k[IP_KEYLEN];
1456 uint32_t distance = 0;
1458 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1460 for (i=0; i<IP_KEYLEN; i++) {
1461 x = ip1_k[i] ^ t[i];
1465 /* Count number of leading zeroes.
1466 * FIXME? This could be optimised...
1468 while ((x & (1 << 31)) == 0) {
1478 /* Calculate the IP distance for the given IP relative to IPs on the
1479 given node. The ips argument is generally the all_ips variable
1480 used in the main part of the algorithm.
/*
 * Accumulate the squared ip_distance() between ip and addresses taken
 * from the list ips.
 *
 * ip  - address being measured
 * ips - list that is walked through its next pointers
 * pnn - node number; every list entry is tested with t->pnn != pnn
 *
 * An entry whose addr member is the very object ip points to is
 * recognised by pointer comparison (see the "Optimisation" comment in
 * the loop).  For the remaining entries d = ip_distance(ip, entry) is
 * computed and d * d is added to sum.
 */
1482 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1483 struct ctdb_public_ip_list *ips,
1486 struct ctdb_public_ip_list *t;
1491 for (t=ips; t != NULL; t=t->next) {
1492 if (t->pnn != pnn) {
1496 /* Optimisation: We never calculate the distance
1497 * between an address and itself. This allows us to
1498 * calculate the effect of removing an address from a
1499 * node by simply calculating the distance between
1500 * that address and all of the exitsing addresses.
1501 * Moreover, we assume that we're only ever dealing
1502 * with addresses from all_ips so we can identify an
1503 * address via a pointer rather than doing a more
1504 * expensive address comparison. */
1505 if (&(t->addr) == ip) {
1509 d = ip_distance(ip, &(t->addr));
1510 sum += d * d; /* Cheaper than pulling in math.h :-) */
1516 /* Return the LCP2 imbalance metric for addresses currently assigned
/*
 * Compute the LCP2 imbalance figure for node pnn.
 *
 * Walks all_ips, testing each entry with t->pnn != pnn.  For an entry
 * that is processed, ip_distance_2_sum() is called with that entry's
 * address and only the entries that follow it (t->next), so a given
 * pair of addresses is measured once rather than twice.  The results
 * are added up in imbalance, which starts at 0.
 */
1519 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1521 struct ctdb_public_ip_list *t;
1523 uint32_t imbalance = 0;
1525 for (t=all_ips; t!=NULL; t=t->next) {
1526 if (t->pnn != pnn) {
1529 /* Pass the rest of the IPs rather than the whole
1532 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1538 /* Allocate any unassigned IPs just by looping through the IPs and
1539 * finding the best node for each.
/*
 * Give every unassigned address a node.
 *
 * An address is unassigned when its pnn is -1.  For each such entry of
 * all_ips, find_takeover_node() is asked to choose a node; a non-zero
 * return is reported with a DEBUG_WARNING message naming the address.
 *
 * ctdb    - daemon context, passed through to find_takeover_node()
 * ipflags - per-node flags, passed through to find_takeover_node()
 * all_ips - list of public addresses to examine
 */
1541 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1542 struct ctdb_ipflags *ipflags,
1543 struct ctdb_public_ip_list *all_ips)
1545 struct ctdb_public_ip_list *tmp_ip;
1547 /* loop over all ip's and find a physical node to cover for
1550 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1551 if (tmp_ip->pnn == -1) {
1552 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1553 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1554 ctdb_addr_to_str(&tmp_ip->addr)));
1560 /* Basic non-deterministic rebalancing algorithm.
/*
 * Even out the number of addresses hosted per node.
 *
 * The node count is the talloc array length of ipflags.  For each
 * address in all_ips, the nodes accepted by can_node_takeover_ip() are
 * scanned and node_ip_coverage() is used to find the node hosting the
 * most addresses (maxnode/maxnum) and the one hosting the fewest
 * (minnode/minnum).  If no node qualifies, a warning is logged.
 *
 * When maxnum > minnum+1 and retries is still below num_ips + 5, the
 * list is searched for an address currently on maxnode and
 * find_takeover_node() is called to reassign it; its return value is
 * deliberately ignored.
 */
1562 static void basic_failback(struct ctdb_context *ctdb,
1563 struct ctdb_ipflags *ipflags,
1564 struct ctdb_public_ip_list *all_ips,
1568 int maxnode, maxnum, minnode, minnum, num, retries;
1569 struct ctdb_public_ip_list *tmp_ip;
1571 numnodes = talloc_array_length(ipflags);
1578 /* for each ip address, loop over all nodes that can serve
1579 this ip and make sure that the difference between the node
1580 serving the most and the node serving the least ip's are
1583 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1584 if (tmp_ip->pnn == -1) {
1588 /* Get the highest and lowest number of ips's served by any
1589 valid node which can serve this ip.
1593 for (i=0; i<numnodes; i++) {
1594 /* only check nodes that can actually serve this ip */
1595 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1596 /* no it couldnt so skip to the next node */
1600 num = node_ip_coverage(ctdb, i, all_ips);
1601 if (maxnode == -1) {
1610 if (minnode == -1) {
1620 if (maxnode == -1) {
1621 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1622 ctdb_addr_to_str(&tmp_ip->addr)));
1627 /* if the spread between the smallest and largest coverage by
1628 a node is >=2 we steal one of the ips from the node with
1629 most coverage to even things out a bit.
1630 try to do this a limited number of times since we dont
1631 want to spend too much time balancing the ip coverage.
1633 if ( (maxnum > minnum+1)
1634 && (retries < (num_ips + 5)) ){
1635 struct ctdb_public_ip_list *tmp;
1637 /* Reassign one of maxnode's VNNs */
1638 for (tmp=all_ips;tmp;tmp=tmp->next) {
1639 if (tmp->pnn == maxnode) {
1640 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
/*
 * Prepare the working arrays used by the LCP2 allocation algorithm.
 *
 * tmp_ctx               - talloc parent of both output arrays
 * ipflags               - per-node flags; its talloc array length is
 *                         used as the number of nodes
 * all_ips               - list of public addresses
 * force_rebalance_nodes - talloc array of node numbers, or NULL
 * lcp2_imbalances       - out: one uint32_t per node, filled from
 *                         lcp2_imbalance()
 * rebalance_candidates  - out: one bool per node
 *
 * Candidates are decided in three steps: every node starts as a
 * candidate; any node that currently has an address assigned to it is
 * cleared; nodes listed in force_rebalance_nodes are set again.  A
 * listed node number >= the node count is reported as unknown.
 *
 * NOTE(review): the "unknown node" message is concatenated directly
 * onto __location__ without a separating space.
 */
1649 static void lcp2_init(struct ctdb_context *tmp_ctx,
1650 struct ctdb_ipflags *ipflags,
1651 struct ctdb_public_ip_list *all_ips,
1652 uint32_t *force_rebalance_nodes,
1653 uint32_t **lcp2_imbalances,
1654 bool **rebalance_candidates)
1657 struct ctdb_public_ip_list *tmp_ip;
1659 numnodes = talloc_array_length(ipflags);
1661 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1662 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1663 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1664 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1666 for (i=0; i<numnodes; i++) {
1667 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1668 /* First step: assume all nodes are candidates */
1669 (*rebalance_candidates)[i] = true;
1672 /* 2nd step: if a node has IPs assigned then it must have been
1673 * healthy before, so we remove it from consideration. This
1674 * is overkill but is all we have because we don't maintain
1675 * state between takeover runs. An alternative would be to
1676 * keep state and invalidate it every time the recovery master
1679 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1680 if (tmp_ip->pnn != -1) {
1681 (*rebalance_candidates)[tmp_ip->pnn] = false;
1685 /* 3rd step: if a node is forced to re-balance then
1686 we allow failback onto the node */
1687 if (force_rebalance_nodes == NULL) {
1690 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1691 uint32_t pnn = force_rebalance_nodes[i];
1692 if (pnn >= numnodes) {
1694 (__location__ "unknown node %u\n", pnn));
1699 ("Forcing rebalancing of IPs to node %u\n", pnn));
1700 (*rebalance_candidates)[pnn] = true;
1704 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1705 * the IP/node combination that will cost the least.
/*
 * Assign unassigned addresses one at a time, cheapest first.
 *
 * Each pass of the outer while loop looks at every address whose pnn
 * is -1 and every node accepted by can_node_takeover_ip().  For each
 * pair it computes dstdsum = ip_distance_2_sum(address, all_ips, node)
 * and dstimbl = lcp2_imbalances[node] + dstdsum, remembering the pair
 * with the smallest dstdsum.  If a pair was found (minnode != -1) the
 * address is given to that node and lcp2_imbalances[minnode] is
 * replaced with the new imbalance.
 *
 * After each pass have_unassigned is recomputed by scanning the list.
 * The loop runs while have_unassigned and should_loop are both true.
 * When it ends with addresses still unassigned, each one is logged at
 * DEBUG_WARNING.
 */
1707 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1708 struct ctdb_ipflags *ipflags,
1709 struct ctdb_public_ip_list *all_ips,
1710 uint32_t *lcp2_imbalances)
1712 struct ctdb_public_ip_list *tmp_ip;
1713 int dstnode, numnodes;
1716 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1717 struct ctdb_public_ip_list *minip;
1719 bool should_loop = true;
1720 bool have_unassigned = true;
1722 numnodes = talloc_array_length(ipflags);
1724 while (have_unassigned && should_loop) {
1725 should_loop = false;
1727 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1728 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1734 /* loop over each unassigned ip. */
1735 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1736 if (tmp_ip->pnn != -1) {
1740 for (dstnode=0; dstnode<numnodes; dstnode++) {
1741 /* only check nodes that can actually takeover this ip */
1742 if (!can_node_takeover_ip(ctdb, dstnode,
1745 /* no it couldnt so skip to the next node */
1749 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1750 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1751 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1752 ctdb_addr_to_str(&(tmp_ip->addr)),
1754 dstimbl - lcp2_imbalances[dstnode]));
1757 if ((minnode == -1) || (dstdsum < mindsum)) {
1767 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1769 /* If we found one then assign it to the given node. */
1770 if (minnode != -1) {
1771 minip->pnn = minnode;
1772 lcp2_imbalances[minnode] = minimbl;
1773 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1774 ctdb_addr_to_str(&(minip->addr)),
1779 /* There might be a better way but at least this is clear. */
1780 have_unassigned = false;
1781 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1782 if (tmp_ip->pnn == -1) {
1783 have_unassigned = true;
1788 /* We know if we have an unassigned addresses so we might as
1791 if (have_unassigned) {
1792 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1793 if (tmp_ip->pnn == -1) {
1794 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1795 ctdb_addr_to_str(&tmp_ip->addr)));
1801 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1802 * to move IPs from, determines the best IP/destination node
1803 * combination to move from the source node.
/*
 * Try to move one address away from srcnode to improve LCP2 balance.
 *
 * For every address currently on srcnode:
 *   srcdsum = what the address costs srcnode (ip_distance_2_sum)
 *   srcimbl = srcnode's imbalance without that address
 * and for every node that is flagged in rebalance_candidates[] and
 * accepted by can_node_takeover_ip():
 *   dstdsum = what the address would cost that node
 *   dstimbl = that node's imbalance with the address added
 *
 * A move is acceptable only if dstimbl is below srcnode's current
 * imbalance and dstdsum < srcdsum.  Among acceptable moves the one
 * with the smallest srcimbl + dstimbl is kept.
 *
 * If a move was found (mindstnode != -1) it is logged, both entries of
 * lcp2_imbalances[] are updated and the address's pnn is set to the
 * destination node.  The function returns bool; presumably true when
 * a move was made - confirm against the return statements.
 */
1805 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1806 struct ctdb_ipflags *ipflags,
1807 struct ctdb_public_ip_list *all_ips,
1809 uint32_t *lcp2_imbalances,
1810 bool *rebalance_candidates)
1812 int dstnode, mindstnode, numnodes;
1813 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1814 uint32_t minsrcimbl, mindstimbl;
1815 struct ctdb_public_ip_list *minip;
1816 struct ctdb_public_ip_list *tmp_ip;
1818 /* Find an IP and destination node that best reduces imbalance. */
1825 numnodes = talloc_array_length(ipflags);
1827 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1828 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1829 srcnode, lcp2_imbalances[srcnode]));
1831 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1832 /* Only consider addresses on srcnode. */
1833 if (tmp_ip->pnn != srcnode) {
1837 /* What is this IP address costing the source node? */
1838 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1839 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1841 /* Consider this IP address would cost each potential
1842 * destination node. Destination nodes are limited to
1843 * those that are newly healthy, since we don't want
1844 * to do gratuitous failover of IPs just to make minor
1845 * balance improvements.
1847 for (dstnode=0; dstnode<numnodes; dstnode++) {
1848 if (!rebalance_candidates[dstnode]) {
1852 /* only check nodes that can actually takeover this ip */
1853 if (!can_node_takeover_ip(ctdb, dstnode,
1854 ipflags[dstnode], tmp_ip)) {
1855 /* no it couldnt so skip to the next node */
1859 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1860 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1861 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1863 ctdb_addr_to_str(&(tmp_ip->addr)),
1866 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1867 (dstdsum < srcdsum) && \
1868 ((mindstnode == -1) || \
1869 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1872 minsrcimbl = srcimbl;
1873 mindstnode = dstnode;
1874 mindstimbl = dstimbl;
1878 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1880 if (mindstnode != -1) {
1881 /* We found a move that makes things better... */
1882 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1883 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1884 ctdb_addr_to_str(&(minip->addr)),
1885 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1888 lcp2_imbalances[srcnode] = minsrcimbl;
1889 lcp2_imbalances[mindstnode] = mindstimbl;
1890 minip->pnn = mindstnode;
/*
 * Sort record used by lcp2_failback(): carries a node's LCP2 imbalance
 * (the imbalance member, compared by lcp2_cmp_imbalance_pnn()) together
 * with, judging by the name, the node number it belongs to.
 */
1899 struct lcp2_imbalance_pnn {
/*
 * qsort() comparison function for arrays of struct lcp2_imbalance_pnn.
 * Both arguments are cast to that type and ordered by their imbalance
 * member, with separate cases for "greater" and "equal".
 *
 * NOTE(review): lcp2_failback() expects the most imbalanced node first,
 * which suggests a descending sort - confirm the returned signs.
 */
1904 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1906 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1907 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1909 if (lipa->imbalance > lipb->imbalance) {
1911 } else if (lipa->imbalance == lipb->imbalance) {
1918 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1919 * node with the highest LCP2 imbalance, and then determines the best
1920 * IP/destination node combination to move from the source node.
/*
 * Rebalance addresses using LCP2.
 *
 * Copies lcp2_imbalances[] into a freshly allocated array of
 * struct lcp2_imbalance_pnn (one per node, node count taken from the
 * talloc array length of ipflags), sorts it with qsort() and
 * lcp2_cmp_imbalance_pnn(), then walks the sorted nodes calling
 * lcp2_failback_candidate() for each.  The walk has a special case for
 * an entry whose imbalance is 0, which per the in-line comment means
 * no remaining node can be imbalanced.
 *
 * NOTE(review): the array is allocated on ctdb rather than a temporary
 * context and the talloc_array() result is not checked for NULL.
 */
1922 static void lcp2_failback(struct ctdb_context *ctdb,
1923 struct ctdb_ipflags *ipflags,
1924 struct ctdb_public_ip_list *all_ips,
1925 uint32_t *lcp2_imbalances,
1926 bool *rebalance_candidates)
1929 struct lcp2_imbalance_pnn * lips;
1932 numnodes = talloc_array_length(ipflags);
1935 /* Put the imbalances and nodes into an array, sort them and
1936 * iterate through candidates. Usually the 1st one will be
1937 * used, so this doesn't cost much...
1939 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1940 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1941 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1942 for (i=0; i<numnodes; i++) {
1943 lips[i].imbalance = lcp2_imbalances[i];
1945 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1947 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1948 lcp2_cmp_imbalance_pnn);
1951 for (i=0; i<numnodes; i++) {
1952 /* This means that all nodes had 0 or 1 addresses, so
1953 * can't be imbalanced.
1955 if (lips[i].imbalance == 0) {
1959 if (lcp2_failback_candidate(ctdb,
1964 rebalance_candidates)) {
/*
 * Drop assignments that the assigned node cannot honour.
 *
 * For each entry of all_ips that has a node (pnn != -1),
 * can_node_host_ip() is called with that node and its ipflags entry.
 * When it reports that the node cannot host the address, the
 * unassignment is logged at DEBUG_DEBUG.
 *
 * NOTE(review): the test is written "!can_node_host_ip(...) != 0",
 * which evaluates the same as "!can_node_host_ip(...)"; the trailing
 * "!= 0" is redundant and easy to misread.
 */
1976 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1977 struct ctdb_ipflags *ipflags,
1978 struct ctdb_public_ip_list *all_ips)
1980 struct ctdb_public_ip_list *tmp_ip;
1982 /* verify that the assigned nodes can serve that public ip
1983 and set it to -1 if not
1985 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1986 if (tmp_ip->pnn == -1) {
1989 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1990 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1991 /* this node can not serve this ip. */
1992 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1993 ctdb_addr_to_str(&(tmp_ip->addr)),
/*
 * "Deterministic IPs" allocation.
 *
 * Every address is first assigned by position: the i-th entry of
 * all_ips goes to node i % numnodes, where numnodes is the talloc
 * array length of ipflags.  If the NoIPFailback tunable is set to 1 a
 * warning is logged, because it has no effect in this mode.
 * Assignments that are not allowed are then cleared by
 * unassign_unsuitable_ips(), and basic_allocate_unassigned() finds
 * nodes for whatever is left unassigned.  No failback step is run.
 */
2000 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2001 struct ctdb_ipflags *ipflags,
2002 struct ctdb_public_ip_list *all_ips)
2004 struct ctdb_public_ip_list *tmp_ip;
2007 numnodes = talloc_array_length(ipflags);
2009 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2010 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2011 * always be allocated the same way for a specific set of
2012 * available/unavailable nodes.
2015 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2016 tmp_ip->pnn = i % numnodes;
2019 /* IP failback doesn't make sense with deterministic
2020 * IPs, since the modulo step above implicitly fails
2021 * back IPs to their "home" node.
2023 if (1 == ctdb->tunable.no_ip_failback) {
2024 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2027 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2029 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2031 /* No failback here! */
/*
 * Default (non-deterministic, non-LCP2) allocation.
 *
 * Walks all_ips once up front (presumably to count the addresses into
 * num_ips, which is later handed to basic_failback() - confirm), then
 * clears unsuitable assignments with unassign_unsuitable_ips() and
 * fills the gaps with basic_allocate_unassigned().  The NoIPFailback
 * tunable is tested before the final basic_failback() rebalancing
 * call; per the in-line comment, rebalancing is not wanted when it
 * is 1.
 */
2034 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2035 struct ctdb_ipflags *ipflags,
2036 struct ctdb_public_ip_list *all_ips)
2038 /* This should be pushed down into basic_failback. */
2039 struct ctdb_public_ip_list *tmp_ip;
2041 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2045 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2047 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2049 /* If we don't want IPs to fail back then don't rebalance IPs. */
2050 if (1 == ctdb->tunable.no_ip_failback) {
2054 /* Now, try to make sure the ip adresses are evenly distributed
2057 basic_failback(ctdb, ipflags, all_ips, num_ips);
/*
 * LCP2 allocation driver.
 *
 * Creates a temporary talloc context on ctdb and then:
 *   1. clears unsuitable assignments (unassign_unsuitable_ips),
 *   2. builds the imbalance and candidate arrays (lcp2_init),
 *   3. places unassigned addresses (lcp2_allocate_unassigned),
 *   4. tests the NoIPFailback tunable and counts the nodes flagged in
 *      rebalance_candidates[]; per the in-line comments, failback is
 *      only worthwhile when it is wanted and a target node exists,
 *   5. rebalances with lcp2_failback().
 * The temporary context, which owns both arrays, is freed at the end.
 *
 * force_rebalance_nodes is passed through to lcp2_init().
 */
2060 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2061 struct ctdb_ipflags *ipflags,
2062 struct ctdb_public_ip_list *all_ips,
2063 uint32_t *force_rebalance_nodes)
2065 uint32_t *lcp2_imbalances;
2066 bool *rebalance_candidates;
2067 int numnodes, num_rebalance_candidates, i;
2069 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2071 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2073 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2074 &lcp2_imbalances, &rebalance_candidates);
2076 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2078 /* If we don't want IPs to fail back then don't rebalance IPs. */
2079 if (1 == ctdb->tunable.no_ip_failback) {
2083 /* It is only worth continuing if we have suitable target
2084 * nodes to transfer IPs to. This check is much cheaper than
2087 numnodes = talloc_array_length(ipflags);
2088 num_rebalance_candidates = 0;
2089 for (i=0; i<numnodes; i++) {
2090 if (rebalance_candidates[i]) {
2091 num_rebalance_candidates++;
2094 if (num_rebalance_candidates == 0) {
2098 /* Now, try to make sure the ip adresses are evenly distributed
2101 lcp2_failback(ctdb, ipflags, all_ips,
2102 lcp2_imbalances, rebalance_candidates);
2105 talloc_free(tmp_ctx);
/*
 * Report whether the node map contains no completely healthy node.
 *
 * A node counts as healthy when neither NODE_FLAGS_INACTIVE nor
 * NODE_FLAGS_DISABLED is set in its flags; the scan over
 * nodemap->nodes[0 .. num-1] looks for such a node.
 */
2108 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2112 for (i=0;i<nodemap->num;i++) {
2113 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2114 /* Found one completely healthy node */
2122 /* The calculation part of the IP allocation algorithm. */
/*
 * Compute the new address-to-node assignment.
 *
 * Builds the cluster-wide address list with create_merged_ip_list()
 * and stores it in *all_ips_p, then runs exactly one allocation
 * strategy chosen from the tunables:
 *   lcp2_public_ip_assignment == 1  -> ip_alloc_lcp2()
 *   deterministic_public_ips == 1   -> ip_alloc_deterministic_ips()
 *   otherwise                       -> ip_alloc_nondeterministic_ips()
 * Only the LCP2 strategy receives force_rebalance_nodes.
 *
 * On return each list entry's pnn is the node that should own the
 * address, or -1 if no node can.
 */
2123 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2124 struct ctdb_ipflags *ipflags,
2125 struct ctdb_public_ip_list **all_ips_p,
2126 uint32_t *force_rebalance_nodes)
2128 /* since nodes only know about those public addresses that
2129 can be served by that particular node, no single node has
2130 a full list of all public addresses that exist in the cluster.
2131 Walk over all node structures and create a merged list of
2132 all public addresses that exist in the cluster.
2134 keep the tree of ips around as ctdb->ip_tree
2136 *all_ips_p = create_merged_ip_list(ctdb);
2138 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2139 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2140 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2141 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2143 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2146 /* at this point ->pnn is the node which will own each IP
2147 or -1 if there is no node that can cover this ip
/*
 * State shared between get_tunable_from_nodes() and its two async
 * callbacks.  tunable is the name of the tunable being queried and is
 * used in log messages; get_tunable_from_nodes() also fills in an out
 * array for the per-node results and a fatal flag.
 */
2153 struct get_tunable_callback_data {
2154 const char *tunable;
/*
 * Success callback for CTDB_CONTROL_GET_TUNABLE.
 *
 * The opaque callback argument is a struct get_tunable_callback_data.
 * The reply is validated before use: its size must be exactly
 * sizeof(uint32_t), and an error naming the node is logged when the
 * reply cannot be matched against the talloc array length of cd->out.
 * A valid reply is stored as cd->out[pnn].
 *
 * Failed controls are dealt with by the fail callback, as the comment
 * at the top of the body notes.
 */
2159 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2160 int32_t res, TDB_DATA outdata,
2163 struct get_tunable_callback_data *cd =
2164 (struct get_tunable_callback_data *)callback;
2168 /* Already handled in fail callback */
2172 if (outdata.dsize != sizeof(uint32_t)) {
2173 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2174 cd->tunable, pnn, (int)sizeof(uint32_t),
2175 (int)outdata.dsize));
2180 size = talloc_array_length(cd->out);
2182 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2183 cd->tunable, pnn, size));
2188 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/*
 * Failure callback for CTDB_CONTROL_GET_TUNABLE.
 *
 * The opaque callback argument is a struct get_tunable_callback_data.
 * Three outcomes are distinguished and logged: the control timed out,
 * the tunable is not implemented on the node (only a warning), or an
 * unexpected error occurred.
 */
2191 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2192 int32_t res, TDB_DATA outdata,
2195 struct get_tunable_callback_data *cd =
2196 (struct get_tunable_callback_data *)callback;
2201 ("Timed out getting tunable \"%s\" from node %d\n",
2207 DEBUG(DEBUG_WARNING,
2208 ("Tunable \"%s\" not implemented on node %d\n",
2213 ("Unexpected error getting tunable \"%s\" from node %d\n",
2219 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2220 TALLOC_CTX *tmp_ctx,
2221 struct ctdb_node_map *nodemap,
2222 const char *tunable,
2223 uint32_t default_value)
2226 struct ctdb_control_get_tunable *t;
2229 struct get_tunable_callback_data callback_data;
2232 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2233 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2234 for (i=0; i<nodemap->num; i++) {
2235 tvals[i] = default_value;
2238 callback_data.out = tvals;
2239 callback_data.tunable = tunable;
2240 callback_data.fatal = false;
2242 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2243 data.dptr = talloc_size(tmp_ctx, data.dsize);
2244 t = (struct ctdb_control_get_tunable *)data.dptr;
2245 t->length = strlen(tunable)+1;
2246 memcpy(t->name, tunable, t->length);
2247 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2248 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2249 nodes, 0, TAKEOVER_TIMEOUT(),
2251 get_tunable_callback,
2252 get_tunable_fail_callback,
2253 &callback_data) != 0) {
2254 if (callback_data.fatal) {
2260 talloc_free(data.dptr);
/*
 * State shared between get_runstate_from_nodes() and its two async
 * callbacks.  out is the per-node result array, indexed by node
 * number; get_runstate_from_nodes() also initialises a fatal flag.
 */
2265 struct get_runstate_callback_data {
2266 enum ctdb_runstate *out;
/*
 * Success callback for CTDB_CONTROL_GET_RUNSTATE.
 *
 * callback_data is a struct get_runstate_callback_data.  The reply
 * must be exactly sizeof(uint32_t) bytes, and an error is logged when
 * it cannot be matched against the talloc array length of cd->out.  A
 * valid reply is converted to enum ctdb_runstate and stored as
 * cd->out[pnn].
 *
 * Failed controls are dealt with by the fail callback, as the comment
 * at the top of the body notes.
 */
2270 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2271 int32_t res, TDB_DATA outdata,
2272 void *callback_data)
2274 struct get_runstate_callback_data *cd =
2275 (struct get_runstate_callback_data *)callback_data;
2279 /* Already handled in fail callback */
2283 if (outdata.dsize != sizeof(uint32_t)) {
2284 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2285 pnn, (int)sizeof(uint32_t),
2286 (int)outdata.dsize));
2291 size = talloc_array_length(cd->out);
2293 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2298 cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
/*
 * Failure callback for CTDB_CONTROL_GET_RUNSTATE.
 *
 * The opaque callback argument is a struct get_runstate_callback_data.
 * A timeout is logged as such; any other error is logged as a warning
 * and treated as "this node does not support runstates".
 */
2301 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2302 int32_t res, TDB_DATA outdata,
2305 struct get_runstate_callback_data *cd =
2306 (struct get_runstate_callback_data *)callback;
2311 ("Timed out getting runstate from node %d\n", pnn));
2315 DEBUG(DEBUG_WARNING,
2316 ("Error getting runstate from node %d - assuming runstates not supported\n",
/*
 * Fetch the runstate of every connected node.
 *
 * Allocates an array of nodemap->num entries on tmp_ctx, pre-loads
 * every slot with default_value, and sends CTDB_CONTROL_GET_RUNSTATE
 * to the nodes returned by list_of_connected_nodes().  Replies are
 * stored by get_runstate_callback(); failures go to
 * get_runstate_fail_callback().  When the async control reports
 * failure, the callback data's fatal flag is consulted.
 *
 * Returns NULL if the array cannot be allocated
 * (CTDB_NO_MEMORY_NULL).
 */
2321 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2322 TALLOC_CTX *tmp_ctx,
2323 struct ctdb_node_map *nodemap,
2324 enum ctdb_runstate default_value)
2327 enum ctdb_runstate *rs;
2328 struct get_runstate_callback_data callback_data;
2331 rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2332 CTDB_NO_MEMORY_NULL(ctdb, rs);
2333 for (i=0; i<nodemap->num; i++) {
2334 rs[i] = default_value;
2337 callback_data.out = rs;
2338 callback_data.fatal = false;
2340 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2341 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2342 nodes, 0, TAKEOVER_TIMEOUT(),
2344 get_runstate_callback,
2345 get_runstate_fail_callback,
2346 &callback_data) != 0) {
2347 if (callback_data.fatal) {
2357 /* Set internal flags for IP allocation:
2359 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2360 * Set NOIPHOST ip flag for each INACTIVE node
2361 * if all nodes are disabled:
2362 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2364 * Set NOIPHOST ip flags for disabled nodes
/*
 * Build the per-node IP allocation flags from already-fetched data.
 *
 * tmp_ctx                    - talloc parent of the returned array
 * nodemap                    - node map; one flags entry per node
 * tval_noiptakeover          - per-node NoIPTakeover tunable values
 * tval_noiphostonalldisabled - per-node NoIPHostOnAllDisabled values
 * runstate                   - per-node runstates
 *
 * The array is zero-initialised, so all flags start cleared.  For
 * every node:
 *   - noiptakeover is set when its NoIPTakeover value is non-zero;
 *   - noiphost is set when its runstate is not CTDB_RUNSTATE_RUNNING
 *     or NODE_FLAGS_INACTIVE is set;
 *   - the runstate itself is recorded in the flags entry.
 * Then, if all_nodes_are_disabled(), noiphost is set on nodes whose
 * NoIPHostOnAllDisabled value is non-zero; the other loop sets
 * noiphost on every node that has NODE_FLAGS_DISABLED.
 *
 * Returns NULL if the array cannot be allocated.
 */
2366 static struct ctdb_ipflags *
2367 set_ipflags_internal(struct ctdb_context *ctdb,
2368 TALLOC_CTX *tmp_ctx,
2369 struct ctdb_node_map *nodemap,
2370 uint32_t *tval_noiptakeover,
2371 uint32_t *tval_noiphostonalldisabled,
2372 enum ctdb_runstate *runstate)
2375 struct ctdb_ipflags *ipflags;
2377 /* Clear IP flags - implicit due to talloc_zero */
2378 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2379 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2381 for (i=0;i<nodemap->num;i++) {
2382 /* Can not take IPs on node with NoIPTakeover set */
2383 if (tval_noiptakeover[i] != 0) {
2384 ipflags[i].noiptakeover = true;
2387 /* Can not host IPs on node not in RUNNING state */
2388 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2389 ipflags[i].noiphost = true;
2392 /* Can not host IPs on INACTIVE node */
2393 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2394 ipflags[i].noiphost = true;
2396 /* Remember the runstate */
2397 ipflags[i].runstate = runstate[i];
2400 if (all_nodes_are_disabled(nodemap)) {
2401 /* If all nodes are disabled, can not host IPs on node
2402 * with NoIPHostOnAllDisabled set
2404 for (i=0;i<nodemap->num;i++) {
2405 if (tval_noiphostonalldisabled[i] != 0) {
2406 ipflags[i].noiphost = true;
2410 /* If some nodes are not disabled, then can not host
2411 * IPs on DISABLED node
2413 for (i=0;i<nodemap->num;i++) {
2414 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2415 ipflags[i].noiphost = true;
/*
 * Gather per-node data from the cluster and turn it into IP flags.
 *
 * Queries every connected node for the NoIPTakeover tunable, the
 * NoIPHostOnAllDisabled tunable (default 0) and its runstate (default
 * CTDB_RUNSTATE_RUNNING, so nodes without runstate support behave
 * sensibly in a mixed-version cluster).  Each of the three results is
 * checked for NULL.  The results are handed to set_ipflags_internal()
 * and the three intermediate arrays are freed afterwards.
 *
 * Everything is allocated on tmp_ctx, which the caller is expected to
 * free - including on the failure paths.
 */
2423 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2424 TALLOC_CTX *tmp_ctx,
2425 struct ctdb_node_map *nodemap)
2427 uint32_t *tval_noiptakeover;
2428 uint32_t *tval_noiphostonalldisabled;
2429 struct ctdb_ipflags *ipflags;
2430 enum ctdb_runstate *runstate;
2433 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2435 if (tval_noiptakeover == NULL) {
2439 tval_noiphostonalldisabled =
2440 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2441 "NoIPHostOnAllDisabled", 0);
2442 if (tval_noiphostonalldisabled == NULL) {
2443 /* Caller frees tmp_ctx */
2447 /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2448 * will default to CTDB_RUNSTATE_RUNNING. This ensures
2449 * reasonable behaviour on a mixed cluster during upgrade.
2451 runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2452 CTDB_RUNSTATE_RUNNING);
2453 if (runstate == NULL) {
2454 /* Caller frees tmp_ctx */
2458 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2460 tval_noiphostonalldisabled,
2463 talloc_free(tval_noiptakeover);
2464 talloc_free(tval_noiphostonalldisabled);
2465 talloc_free(runstate);
/*
 * State handed to iprealloc_fail_callback().
 *
 * fail_callback / fail_callback_data - the caller's own failure
 *     callback and its argument, invoked for timeouts
 * nodemap - node map used to look up the failing node's flags
 *
 * ctdb_takeover_run() also initialises retry_nodes (a per-node bool
 * array) and retry_count in this structure.
 */
2470 struct iprealloc_callback_data {
2473 client_async_callback fail_callback;
2474 void *fail_callback_data;
2475 struct ctdb_node_map *nodemap;
2478 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2479 int32_t res, TDB_DATA outdata,
2483 struct iprealloc_callback_data *cd =
2484 (struct iprealloc_callback_data *)callback;
2486 numnodes = talloc_array_length(cd->retry_nodes);
2487 if (pnn > numnodes) {
2489 ("ipreallocated failure from node %d, "
2490 "but only %d nodes in nodemap\n",
2495 /* Can't run the "ipreallocated" event on a INACTIVE node */
2496 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2497 DEBUG(DEBUG_WARNING,
2498 ("ipreallocated failed on inactive node %d, ignoring\n",
2505 /* If the control timed out then that's a real error,
2506 * so call the real fail callback
2508 if (cd->fail_callback) {
2509 cd->fail_callback(ctdb, pnn, res, outdata,
2510 cd->fail_callback_data);
2512 DEBUG(DEBUG_WARNING,
2513 ("iprealloc timed out but no callback registered\n"));
2517 /* If not a timeout then either the ipreallocated
2518 * eventscript (or some setup) failed. This might
2519 * have failed because the IPREALLOCATED control isn't
2520 * implemented - right now there is no way of knowing
2521 * because the error codes are all folded down to -1.
2522 * Consider retrying using EVENTSCRIPT control...
2524 DEBUG(DEBUG_WARNING,
2525 ("ipreallocated failure from node %d, flagging retry\n",
2527 cd->retry_nodes[pnn] = true;
/*
 * State handed to takeover_run_fail_callback().
 *
 * fail_callback / fail_callback_data - the caller's own failure
 *     callback and its argument
 * nodemap - node map used to translate a PNN into an array index
 *
 * ctdb_takeover_run() also sets node_failed, a zero-initialised
 * per-node bool array, in this structure.
 */
2532 struct takeover_callback_data {
2534 client_async_callback fail_callback;
2535 void *fail_callback_data;
2536 struct ctdb_node_map *nodemap;
/*
 * Failure callback for the RELEASE_IP phase of a takeover run.
 *
 * callback_data must be a struct takeover_callback_data (checked with
 * talloc_get_type_abort).  The failing node_pnn is looked up in
 * cd->nodemap to find its index; an unknown PNN is logged as an error.
 * The caller's fail callback is invoked only the first time a given
 * node fails: node_failed[i] records that it has already been
 * reported, so several failed controls to the same node produce a
 * single report.
 */
2539 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2540 uint32_t node_pnn, int32_t res,
2541 TDB_DATA outdata, void *callback_data)
2543 struct takeover_callback_data *cd =
2544 talloc_get_type_abort(callback_data,
2545 struct takeover_callback_data);
2548 for (i = 0; i < cd->nodemap->num; i++) {
2549 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2554 if (i == cd->nodemap->num) {
2555 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2559 if (!cd->node_failed[i]) {
2560 cd->node_failed[i] = true;
2561 cd->fail_callback(ctdb, node_pnn, res, outdata,
2562 cd->fail_callback_data);
2567 make any IP alias changes for public addresses that are necessary
/*
 * Recalculate the public address layout and push it to the cluster.
 *
 * ctdb                  - daemon context
 * nodemap               - current node map
 * force_rebalance_nodes - nodes to force failback onto (handed to
 *                         ctdb_takeover_run_core())
 * fail_callback         - invoked when a control to a node fails
 * callback_data         - opaque argument for fail_callback
 *
 * All temporary state lives on a talloc context created on ctdb.
 * Phases, in order:
 *   1. The DisableIPFailover tunable is tested first; per the in-line
 *      comment, only the ipreallocated event is sent when failover is
 *      disabled.
 *   2. set_ipflags() collects per-node flags; failure aborts the run.
 *   3. The nodes are scanned for one in CTDB_RUNSTATE_RUNNING; when
 *      there is none a warning is logged and, per the in-line
 *      comment, allocation is short-circuited.
 *   4. ctdb_takeover_run_core() computes the new assignment.
 *   5. CTDB_CONTROL_RELEASE_IP is sent to every node that is not
 *      DISCONNECTED, once per address; the loop has a separate case
 *      for addresses assigned to that node.  Failures are routed
 *      through takeover_run_fail_callback(), then all replies are
 *      awaited.
 *   6. CTDB_CONTROL_TAKEOVER_IP is sent to the owning node of each
 *      address; the loop has a separate case for addresses whose pnn
 *      is -1.  Failures go straight to fail_callback; replies are
 *      awaited.
 *   7. CTDB_CONTROL_IPREALLOCATED is sent to all connected nodes with
 *      iprealloc_fail_callback(); nodes it flags are retried through
 *      CTDB_CONTROL_RUN_EVENTSCRIPTS with the "ipreallocated" event.
 *
 * Returns an int status; the error paths free the temporary context
 * before leaving.
 */
2569 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2570 uint32_t *force_rebalance_nodes,
2571 client_async_callback fail_callback, void *callback_data)
2574 struct ctdb_public_ip ip;
2576 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2578 struct timeval timeout;
2579 struct client_async_data *async_data;
2580 struct ctdb_client_control_state *state;
2581 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2582 struct ctdb_ipflags *ipflags;
2583 struct takeover_callback_data *takeover_data;
2584 struct iprealloc_callback_data iprealloc_data;
2589 * ip failover is completely disabled, just send out the
2590 * ipreallocated event.
2592 if (ctdb->tunable.disable_ip_failover != 0) {
2596 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2597 if (ipflags == NULL) {
2598 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2599 talloc_free(tmp_ctx);
2603 /* Short-circuit IP allocation if no nodes are in the RUNNING
2604 * runstate yet, since no nodes will be able to host IPs */
2605 can_host_ips = false;
2606 for (i=0; i<nodemap->num; i++) {
2607 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2608 can_host_ips = true;
2611 if (!can_host_ips) {
2612 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2616 /* Do the IP reassignment calculations */
2617 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2619 /* Now tell all nodes to release any public IPs should not
2620 * host. This will be a NOOP on nodes that don't currently
2621 * hold the given IP.
2623 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2624 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2626 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2627 bool, nodemap->num);
2628 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2629 takeover_data->fail_callback = fail_callback;
2630 takeover_data->fail_callback_data = callback_data;
2631 takeover_data->nodemap = nodemap;
2633 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2634 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2636 async_data->fail_callback = takeover_run_fail_callback;
2637 async_data->callback_data = takeover_data;
2639 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2641 /* Send a RELEASE_IP to all nodes that should not be hosting
2642 * each IP. For each IP, all but one of these will be
2643 * redundant. However, the redundant ones are used to tell
2644 * nodes which node should be hosting the IP so that commands
2645 * like "ctdb ip" can display a particular nodes idea of who
2646 * is hosting what. */
2647 for (i=0;i<nodemap->num;i++) {
2648 /* don't talk to unconnected nodes, but do talk to banned nodes */
2649 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2653 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2654 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2655 /* This node should be serving this
2656 vnn so dont tell it to release the ip
2660 ip.pnn = tmp_ip->pnn;
2661 ip.addr = tmp_ip->addr;
2663 timeout = TAKEOVER_TIMEOUT();
2664 data.dsize = sizeof(ip);
2665 data.dptr = (uint8_t *)&ip;
2666 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2667 0, CTDB_CONTROL_RELEASE_IP, 0,
2670 if (state == NULL) {
2671 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2672 talloc_free(tmp_ctx);
2676 ctdb_client_async_add(async_data, state);
2679 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2680 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2681 talloc_free(tmp_ctx);
2684 talloc_free(async_data);
2687 /* For each IP, send a TAKOVER_IP to the node that should be
2688 * hosting it. Many of these will often be redundant (since
2689 * the allocation won't have changed) but they can be useful
2690 * to recover from inconsistencies. */
2691 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2692 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2694 async_data->fail_callback = fail_callback;
2695 async_data->callback_data = callback_data;
2697 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2698 if (tmp_ip->pnn == -1) {
2699 /* this IP won't be taken over */
2703 ip.pnn = tmp_ip->pnn;
2704 ip.addr = tmp_ip->addr;
2706 timeout = TAKEOVER_TIMEOUT();
2707 data.dsize = sizeof(ip);
2708 data.dptr = (uint8_t *)&ip;
2709 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2710 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2711 data, async_data, &timeout, NULL);
2712 if (state == NULL) {
2713 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2714 talloc_free(tmp_ctx);
2718 ctdb_client_async_add(async_data, state);
2720 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2721 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2722 talloc_free(tmp_ctx);
2728 * Tell all nodes to run eventscripts to process the
2729 * "ipreallocated" event. This can do a lot of things,
2730 * including restarting services to reconfigure them if public
2731 * IPs have moved. Once upon a time this event only used to
2734 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2735 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2736 iprealloc_data.retry_nodes = retry_data;
2737 iprealloc_data.retry_count = 0;
2738 iprealloc_data.fail_callback = fail_callback;
2739 iprealloc_data.fail_callback_data = callback_data;
2740 iprealloc_data.nodemap = nodemap;
2742 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2743 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2744 nodes, 0, TAKEOVER_TIMEOUT(),
2746 NULL, iprealloc_fail_callback,
2749 /* If the control failed then we should retry to any
2750 * nodes flagged by iprealloc_fail_callback using the
2751 * EVENTSCRIPT control. This is a best-effort at
2752 * backward compatiblity when running a mixed cluster
2753 * where some nodes have not yet been upgraded to
2754 * support the IPREALLOCATED control.
2756 DEBUG(DEBUG_WARNING,
2757 ("Retry ipreallocated to some nodes using eventscript control\n"));
2759 nodes = talloc_array(tmp_ctx, uint32_t,
2760 iprealloc_data.retry_count);
2761 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2764 for (i=0; i<nodemap->num; i++) {
2765 if (iprealloc_data.retry_nodes[i]) {
2771 data.dptr = discard_const("ipreallocated");
2772 data.dsize = strlen((char *)data.dptr) + 1;
2773 ret = ctdb_client_async_control(ctdb,
2774 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2775 nodes, 0, TAKEOVER_TIMEOUT(),
2777 NULL, fail_callback,
2780 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2784 talloc_free(tmp_ctx);
2790 destroy a ctdb_client_ip structure
2792 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2794 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2795 ctdb_addr_to_str(&ip->addr),
2796 ntohs(ip->addr.ip.sin_port),
2799 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2804 called by a client to inform us of a TCP connection that it is managing
2805 that should tickled with an ACK when IP takeover is done
2807 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2810 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2811 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2812 struct ctdb_tcp_list *tcp;
2813 struct ctdb_tcp_connection t;
2816 struct ctdb_client_ip *ip;
2817 struct ctdb_vnn *vnn;
2818 ctdb_sock_addr addr;
2820 /* If we don't have public IPs, tickles are useless */
2821 if (ctdb->vnn == NULL) {
2825 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2827 addr = tcp_sock->src;
2828 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2829 addr = tcp_sock->dest;
2830 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2833 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2834 vnn = find_public_ip_vnn(ctdb, &addr);
2836 switch (addr.sa.sa_family) {
2838 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2839 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2840 ctdb_addr_to_str(&addr)));
2844 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2845 ctdb_addr_to_str(&addr)));
2848 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2854 if (vnn->pnn != ctdb->pnn) {
2855 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2856 ctdb_addr_to_str(&addr),
2857 client_id, client->pid));
2858 /* failing this call will tell smbd to die */
2862 ip = talloc(client, struct ctdb_client_ip);
2863 CTDB_NO_MEMORY(ctdb, ip);
2867 ip->client_id = client_id;
2868 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2869 DLIST_ADD(ctdb->client_ip_list, ip);
2871 tcp = talloc(client, struct ctdb_tcp_list);
2872 CTDB_NO_MEMORY(ctdb, tcp);
2874 tcp->connection.src_addr = tcp_sock->src;
2875 tcp->connection.dst_addr = tcp_sock->dest;
2877 DLIST_ADD(client->tcp_list, tcp);
2879 t.src_addr = tcp_sock->src;
2880 t.dst_addr = tcp_sock->dest;
2882 data.dptr = (uint8_t *)&t;
2883 data.dsize = sizeof(t);
2885 switch (addr.sa.sa_family) {
2887 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2888 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
2889 ctdb_addr_to_str(&tcp_sock->src),
2890 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2893 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2894 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
2895 ctdb_addr_to_str(&tcp_sock->src),
2896 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2899 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2903 /* tell all nodes about this tcp connection */
2904 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2905 CTDB_CONTROL_TCP_ADD,
2906 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2908 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2916 find a tcp address on a list
2918 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2919 struct ctdb_tcp_connection *tcp)
2923 if (array == NULL) {
2927 for (i=0;i<array->num;i++) {
2928 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2929 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2930 return &array->connections[i];
2939 called by a daemon to inform us of a TCP connection that one of its
2940 clients managing that should tickled with an ACK when IP takeover is
2943 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2945 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2946 struct ctdb_tcp_array *tcparray;
2947 struct ctdb_tcp_connection tcp;
2948 struct ctdb_vnn *vnn;
2950 /* If we don't have public IPs, tickles are useless */
2951 if (ctdb->vnn == NULL) {
2955 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2957 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2958 ctdb_addr_to_str(&p->dst_addr)));
2964 tcparray = vnn->tcp_array;
2966 /* If this is the first tickle */
2967 if (tcparray == NULL) {
2968 tcparray = talloc(vnn, struct ctdb_tcp_array);
2969 CTDB_NO_MEMORY(ctdb, tcparray);
2970 vnn->tcp_array = tcparray;
2973 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2974 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2976 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2977 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2980 if (tcp_update_needed) {
2981 vnn->tcp_update_needed = true;
2987 /* Do we already have this tickle ?*/
2988 tcp.src_addr = p->src_addr;
2989 tcp.dst_addr = p->dst_addr;
2990 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2991 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2992 ctdb_addr_to_str(&tcp.dst_addr),
2993 ntohs(tcp.dst_addr.ip.sin_port),
2998 /* A new tickle, we must add it to the array */
2999 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3000 struct ctdb_tcp_connection,
3002 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3004 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3005 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3008 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3009 ctdb_addr_to_str(&tcp.dst_addr),
3010 ntohs(tcp.dst_addr.ip.sin_port),
3013 if (tcp_update_needed) {
3014 vnn->tcp_update_needed = true;
3022 called by a daemon to inform us of a TCP connection that one of its
3023 clients managing that should tickled with an ACK when IP takeover is
3026 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3028 struct ctdb_tcp_connection *tcpp;
3029 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3032 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3033 ctdb_addr_to_str(&conn->dst_addr)));
3037 /* if the array is empty we cant remove it
3038 and we dont need to do anything
3040 if (vnn->tcp_array == NULL) {
3041 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3042 ctdb_addr_to_str(&conn->dst_addr),
3043 ntohs(conn->dst_addr.ip.sin_port)));
3048 /* See if we know this connection
3049 if we dont know this connection then we dont need to do anything
3051 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3053 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3054 ctdb_addr_to_str(&conn->dst_addr),
3055 ntohs(conn->dst_addr.ip.sin_port)));
3060 /* We need to remove this entry from the array.
3061 Instead of allocating a new array and copying data to it
3062 we cheat and just copy the last entry in the existing array
3063 to the entry that is to be removed and just shring the
3066 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3067 vnn->tcp_array->num--;
3069 /* If we deleted the last entry we also need to remove the entire array
3071 if (vnn->tcp_array->num == 0) {
3072 talloc_free(vnn->tcp_array);
3073 vnn->tcp_array = NULL;
3076 vnn->tcp_update_needed = true;
3078 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3079 ctdb_addr_to_str(&conn->src_addr),
3080 ntohs(conn->src_addr.ip.sin_port)));
3085 called by a daemon to inform us of a TCP connection that one of its
3086 clients used are no longer needed in the tickle database
3088 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3090 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3092 /* If we don't have public IPs, tickles are useless */
3093 if (ctdb->vnn == NULL) {
3097 ctdb_remove_tcp_connection(ctdb, conn);
3104 Called when another daemon starts - causes all tickles for all
3105 public addresses we are serving to be sent to the new node on the
3106 next check. This actually causes the next scheduled call to
3107 tdb_update_tcp_tickles() to update all nodes. This is simple and
3108 doesn't require careful error handling.
3110 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3112 struct ctdb_vnn *vnn;
3114 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3115 (unsigned long) pnn));
3117 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3118 vnn->tcp_update_needed = true;
3126 called when a client structure goes away - hook to remove
3127 elements from the tcp_list in all daemons
3129 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3131 while (client->tcp_list) {
3132 struct ctdb_tcp_list *tcp = client->tcp_list;
3133 DLIST_REMOVE(client->tcp_list, tcp);
3134 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3140 release all IPs on shutdown
3142 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3144 struct ctdb_vnn *vnn;
3147 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3148 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3149 ctdb_vnn_unassign_iface(ctdb, vnn);
3156 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3157 ctdb_addr_to_str(&vnn->public_address),
3158 vnn->public_netmask_bits,
3159 ctdb_vnn_iface_string(vnn)));
3161 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3162 ctdb_vnn_iface_string(vnn),
3163 ctdb_addr_to_str(&vnn->public_address),
3164 vnn->public_netmask_bits);
3165 release_kill_clients(ctdb, &vnn->public_address);
3166 ctdb_vnn_unassign_iface(ctdb, vnn);
3170 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3175 get list of public IPs
3177 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3178 struct ctdb_req_control *c, TDB_DATA *outdata)
3181 struct ctdb_all_public_ips *ips;
3182 struct ctdb_vnn *vnn;
3183 bool only_available = false;
3185 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3186 only_available = true;
3189 /* count how many public ip structures we have */
3191 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3195 len = offsetof(struct ctdb_all_public_ips, ips) +
3196 num*sizeof(struct ctdb_public_ip);
3197 ips = talloc_zero_size(outdata, len);
3198 CTDB_NO_MEMORY(ctdb, ips);
3201 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3202 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3205 ips->ips[i].pnn = vnn->pnn;
3206 ips->ips[i].addr = vnn->public_address;
3210 len = offsetof(struct ctdb_all_public_ips, ips) +
3211 i*sizeof(struct ctdb_public_ip);
3213 outdata->dsize = len;
3214 outdata->dptr = (uint8_t *)ips;
3220 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3221 struct ctdb_req_control *c,
3226 ctdb_sock_addr *addr;
3227 struct ctdb_control_public_ip_info *info;
3228 struct ctdb_vnn *vnn;
3230 addr = (ctdb_sock_addr *)indata.dptr;
3232 vnn = find_public_ip_vnn(ctdb, addr);
3234 /* if it is not a public ip it could be our 'single ip' */
3235 if (ctdb->single_ip_vnn) {
3236 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3237 vnn = ctdb->single_ip_vnn;
3242 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3243 "'%s'not a public address\n",
3244 ctdb_addr_to_str(addr)));
3248 /* count how many public ip structures we have */
3250 for (;vnn->ifaces[num];) {
3254 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3255 num*sizeof(struct ctdb_control_iface_info);
3256 info = talloc_zero_size(outdata, len);
3257 CTDB_NO_MEMORY(ctdb, info);
3259 info->ip.addr = vnn->public_address;
3260 info->ip.pnn = vnn->pnn;
3261 info->active_idx = 0xFFFFFFFF;
3263 for (i=0; vnn->ifaces[i]; i++) {
3264 struct ctdb_iface *cur;
3266 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3268 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3272 if (vnn->iface == cur) {
3273 info->active_idx = i;
3275 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3276 info->ifaces[i].link_state = cur->link_up;
3277 info->ifaces[i].references = cur->references;
3280 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3281 i*sizeof(struct ctdb_control_iface_info);
3283 outdata->dsize = len;
3284 outdata->dptr = (uint8_t *)info;
3289 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3290 struct ctdb_req_control *c,
3294 struct ctdb_control_get_ifaces *ifaces;
3295 struct ctdb_iface *cur;
3297 /* count how many public ip structures we have */
3299 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3303 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3304 num*sizeof(struct ctdb_control_iface_info);
3305 ifaces = talloc_zero_size(outdata, len);
3306 CTDB_NO_MEMORY(ctdb, ifaces);
3309 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3310 strcpy(ifaces->ifaces[i].name, cur->name);
3311 ifaces->ifaces[i].link_state = cur->link_up;
3312 ifaces->ifaces[i].references = cur->references;
3316 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3317 i*sizeof(struct ctdb_control_iface_info);
3319 outdata->dsize = len;
3320 outdata->dptr = (uint8_t *)ifaces;
3325 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3326 struct ctdb_req_control *c,
3329 struct ctdb_control_iface_info *info;
3330 struct ctdb_iface *iface;
3331 bool link_up = false;
3333 info = (struct ctdb_control_iface_info *)indata.dptr;
3335 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3336 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3337 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3338 len, len, info->name));
3342 switch (info->link_state) {
3350 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3351 (unsigned int)info->link_state));
3355 if (info->references != 0) {
3356 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3357 (unsigned int)info->references));
3361 iface = ctdb_find_iface(ctdb, info->name);
3362 if (iface == NULL) {
3366 if (link_up == iface->link_up) {
3370 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3371 ("iface[%s] has changed it's link status %s => %s\n",
3373 iface->link_up?"up":"down",
3374 link_up?"up":"down"));
3376 iface->link_up = link_up;
3382 structure containing the listening socket and the list of tcp connections
3383 that the ctdb daemon is to kill
3385 struct ctdb_kill_tcp {
3386 struct ctdb_vnn *vnn;
3387 struct ctdb_context *ctdb;
3389 struct fd_event *fde;
3390 trbt_tree_t *connections;
3395 a tcp connection that is to be killed
3397 struct ctdb_killtcp_con {
3398 ctdb_sock_addr src_addr;
3399 ctdb_sock_addr dst_addr;
3401 struct ctdb_kill_tcp *killtcp;
3404 /* this function is used to create a key to represent this socketpair
3405 in the killtcp tree.
3406 this key is used to insert and lookup matching socketpairs that are
3407 to be tickled and RST
3409 #define KILLTCP_KEYLEN 10
3410 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3412 static uint32_t key[KILLTCP_KEYLEN];
3414 bzero(key, sizeof(key));
3416 if (src->sa.sa_family != dst->sa.sa_family) {
3417 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3421 switch (src->sa.sa_family) {
3423 key[0] = dst->ip.sin_addr.s_addr;
3424 key[1] = src->ip.sin_addr.s_addr;
3425 key[2] = dst->ip.sin_port;
3426 key[3] = src->ip.sin_port;
3429 uint32_t *dst6_addr32 =
3430 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3431 uint32_t *src6_addr32 =
3432 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3433 key[0] = dst6_addr32[3];
3434 key[1] = src6_addr32[3];
3435 key[2] = dst6_addr32[2];
3436 key[3] = src6_addr32[2];
3437 key[4] = dst6_addr32[1];
3438 key[5] = src6_addr32[1];
3439 key[6] = dst6_addr32[0];
3440 key[7] = src6_addr32[0];
3441 key[8] = dst->ip6.sin6_port;
3442 key[9] = src->ip6.sin6_port;
3446 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3454 called when we get a read event on the raw socket
3456 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
3457 uint16_t flags, void *private_data)
3459 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3460 struct ctdb_killtcp_con *con;
3461 ctdb_sock_addr src, dst;
3462 uint32_t ack_seq, seq;
3464 if (!(flags & EVENT_FD_READ)) {
3468 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3469 killtcp->private_data,
3471 &ack_seq, &seq) != 0) {
3472 /* probably a non-tcp ACK packet */
3476 /* check if we have this guy in our list of connections
3479 con = trbt_lookuparray32(killtcp->connections,
3480 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3482 /* no this was some other packet we can just ignore */
3486 /* This one has been tickled !
3487 now reset him and remove him from the list.
3489 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3490 ntohs(con->dst_addr.ip.sin_port),
3491 ctdb_addr_to_str(&con->src_addr),
3492 ntohs(con->src_addr.ip.sin_port)));
3494 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3499 /* when traversing the list of all tcp connections to send tickle acks to
3500 (so that we can capture the ack coming back and kill the connection
3502 this callback is called for each connection we are currently trying to kill
3504 static int tickle_connection_traverse(void *param, void *data)
3506 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3508 /* have tried too many times, just give up */
3509 if (con->count >= 5) {
3510 /* can't delete in traverse: reparent to delete_cons */
3511 talloc_steal(param, con);
3515 /* othervise, try tickling it again */
3518 (ctdb_sock_addr *)&con->dst_addr,
3519 (ctdb_sock_addr *)&con->src_addr,
3526 called every second until all sentenced connections have been reset
3528 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
3529 struct timeval t, void *private_data)
3531 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3532 void *delete_cons = talloc_new(NULL);
3534 /* loop over all connections sending tickle ACKs */
3535 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3537 /* now we've finished traverse, it's safe to do deletion. */
3538 talloc_free(delete_cons);
3540 /* If there are no more connections to kill we can remove the
3541 entire killtcp structure
3543 if ( (killtcp->connections == NULL) ||
3544 (killtcp->connections->root == NULL) ) {
3545 talloc_free(killtcp);
3549 /* try tickling them again in a seconds time
3551 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3552 ctdb_tickle_sentenced_connections, killtcp);
3556 destroy the killtcp structure
3558 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3560 struct ctdb_vnn *tmpvnn;
3562 /* verify that this vnn is still active */
3563 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3564 if (tmpvnn == killtcp->vnn) {
3569 if (tmpvnn == NULL) {
3573 if (killtcp->vnn->killtcp != killtcp) {
3577 killtcp->vnn->killtcp = NULL;
/* nothing fancy here, just unconditionally replace any existing
   connection structure with the new one.

   dont even free the old one if it did exist, that one is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new data
   is talloc_free()d

   parm is the new entry, data the existing one (ignored).
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3596 add a tcp socket to the list of connections we want to RST
3598 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3602 ctdb_sock_addr src, dst;
3603 struct ctdb_kill_tcp *killtcp;
3604 struct ctdb_killtcp_con *con;
3605 struct ctdb_vnn *vnn;
3607 ctdb_canonicalize_ip(s, &src);
3608 ctdb_canonicalize_ip(d, &dst);
3610 vnn = find_public_ip_vnn(ctdb, &dst);
3612 vnn = find_public_ip_vnn(ctdb, &src);
3615 /* if it is not a public ip it could be our 'single ip' */
3616 if (ctdb->single_ip_vnn) {
3617 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3618 vnn = ctdb->single_ip_vnn;
3623 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3627 killtcp = vnn->killtcp;
3629 /* If this is the first connection to kill we must allocate
3632 if (killtcp == NULL) {
3633 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3634 CTDB_NO_MEMORY(ctdb, killtcp);
3637 killtcp->ctdb = ctdb;
3638 killtcp->capture_fd = -1;
3639 killtcp->connections = trbt_create(killtcp, 0);
3641 vnn->killtcp = killtcp;
3642 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3647 /* create a structure that describes this connection we want to
3648 RST and store it in killtcp->connections
3650 con = talloc(killtcp, struct ctdb_killtcp_con);
3651 CTDB_NO_MEMORY(ctdb, con);
3652 con->src_addr = src;
3653 con->dst_addr = dst;
3655 con->killtcp = killtcp;
3658 trbt_insertarray32_callback(killtcp->connections,
3659 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3660 add_killtcp_callback, con);
3663 If we dont have a socket to listen on yet we must create it
3665 if (killtcp->capture_fd == -1) {
3666 const char *iface = ctdb_vnn_iface_string(vnn);
3667 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3668 if (killtcp->capture_fd == -1) {
3669 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3670 "socket on iface '%s' for killtcp (%s)\n",
3671 iface, strerror(errno)));
3677 if (killtcp->fde == NULL) {
3678 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
3680 capture_tcp_handler, killtcp);
3681 tevent_fd_set_auto_close(killtcp->fde);
3683 /* We also need to set up some events to tickle all these connections
3684 until they are all reset
3686 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3687 ctdb_tickle_sentenced_connections, killtcp);
3690 /* tickle him once now */
3699 talloc_free(vnn->killtcp);
3700 vnn->killtcp = NULL;
3705 kill a TCP connection.
3707 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3709 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3711 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3715 called by a daemon to inform us of the entire list of TCP tickles for
3716 a particular public address.
3717 this control should only be sent by the node that is currently serving
3718 that public address.
3720 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3722 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3723 struct ctdb_tcp_array *tcparray;
3724 struct ctdb_vnn *vnn;
3726 /* We must at least have tickles.num or else we cant verify the size
3727 of the received data blob
3729 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3730 tickles.connections)) {
3731 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3735 /* verify that the size of data matches what we expect */
3736 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3737 tickles.connections)
3738 + sizeof(struct ctdb_tcp_connection)
3739 * list->tickles.num) {
3740 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3744 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3745 ctdb_addr_to_str(&list->addr)));
3747 vnn = find_public_ip_vnn(ctdb, &list->addr);
3749 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3750 ctdb_addr_to_str(&list->addr)));
3755 /* remove any old ticklelist we might have */
3756 talloc_free(vnn->tcp_array);
3757 vnn->tcp_array = NULL;
3759 tcparray = talloc(vnn, struct ctdb_tcp_array);
3760 CTDB_NO_MEMORY(ctdb, tcparray);
3762 tcparray->num = list->tickles.num;
3764 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3765 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3767 memcpy(tcparray->connections, &list->tickles.connections[0],
3768 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3770 /* We now have a new fresh tickle list array for this vnn */
3771 vnn->tcp_array = tcparray;
3777 called to return the full list of tickles for the puclic address associated
3778 with the provided vnn
3780 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3782 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3783 struct ctdb_control_tcp_tickle_list *list;
3784 struct ctdb_tcp_array *tcparray;
3786 struct ctdb_vnn *vnn;
3788 vnn = find_public_ip_vnn(ctdb, addr);
3790 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3791 ctdb_addr_to_str(addr)));
3796 tcparray = vnn->tcp_array;
3798 num = tcparray->num;
3803 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3804 tickles.connections)
3805 + sizeof(struct ctdb_tcp_connection) * num;
3807 outdata->dptr = talloc_size(outdata, outdata->dsize);
3808 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3809 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3812 list->tickles.num = num;
3814 memcpy(&list->tickles.connections[0], tcparray->connections,
3815 sizeof(struct ctdb_tcp_connection) * num);
3823 set the list of all tcp tickles for a public address
3825 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3826 ctdb_sock_addr *addr,
3827 struct ctdb_tcp_array *tcparray)
3831 struct ctdb_control_tcp_tickle_list *list;
3834 num = tcparray->num;
3839 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3840 tickles.connections) +
3841 sizeof(struct ctdb_tcp_connection) * num;
3842 data.dptr = talloc_size(ctdb, data.dsize);
3843 CTDB_NO_MEMORY(ctdb, data.dptr);
3845 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3847 list->tickles.num = num;
3849 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3852 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3853 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3854 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3856 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3860 talloc_free(data.dptr);
3867 perform tickle updates if required
3869 static void ctdb_update_tcp_tickles(struct event_context *ev,
3870 struct timed_event *te,
3871 struct timeval t, void *private_data)
3873 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3875 struct ctdb_vnn *vnn;
3877 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3878 /* we only send out updates for public addresses that
3881 if (ctdb->pnn != vnn->pnn) {
3884 /* We only send out the updates if we need to */
3885 if (!vnn->tcp_update_needed) {
3888 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3889 &vnn->public_address,
3892 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3893 ctdb_addr_to_str(&vnn->public_address)));
3896 ("Sent tickle update for public address %s\n",
3897 ctdb_addr_to_str(&vnn->public_address)));
3898 vnn->tcp_update_needed = false;
3902 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3903 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3904 ctdb_update_tcp_tickles, ctdb);
3909 start periodic update of tcp tickles
3911 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3913 ctdb->tickle_update_context = talloc_new(ctdb);
3915 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3916 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3917 ctdb_update_tcp_tickles, ctdb);
3923 struct control_gratious_arp {
3924 struct ctdb_context *ctdb;
3925 ctdb_sock_addr addr;
3931 send a control_gratuitous arp
3933 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
3934 struct timeval t, void *private_data)
3937 struct control_gratious_arp *arp = talloc_get_type(private_data,
3938 struct control_gratious_arp);
3940 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3942 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3943 arp->iface, strerror(errno)));
3948 if (arp->count == CTDB_ARP_REPEAT) {
3953 event_add_timed(arp->ctdb->ev, arp,
3954 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3955 send_gratious_arp, arp);
3962 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3964 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3965 struct control_gratious_arp *arp;
3967 /* verify the size of indata */
3968 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3969 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3970 (unsigned)indata.dsize,
3971 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3975 ( offsetof(struct ctdb_control_gratious_arp, iface)
3976 + gratious_arp->len ) ){
3978 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3979 "but should be %u bytes\n",
3980 (unsigned)indata.dsize,
3981 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3986 arp = talloc(ctdb, struct control_gratious_arp);
3987 CTDB_NO_MEMORY(ctdb, arp);
3990 arp->addr = gratious_arp->addr;
3991 arp->iface = talloc_strdup(arp, gratious_arp->iface);
3992 CTDB_NO_MEMORY(ctdb, arp->iface);
3995 event_add_timed(arp->ctdb->ev, arp,
3996 timeval_zero(), send_gratious_arp, arp);
4001 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4003 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4006 /* verify the size of indata */
4007 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4008 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4012 ( offsetof(struct ctdb_control_ip_iface, iface)
4015 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4016 "but should be %u bytes\n",
4017 (unsigned)indata.dsize,
4018 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4022 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4024 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4027 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
  state kept while the "releaseip" for a deleted public address runs
 */
struct delete_ip_callback_state {
	/* the pending del_public_address request to reply to */
	struct ctdb_req_control *c;
};
4039 called when releaseip event finishes for del_public_address
4041 static void delete_ip_callback(struct ctdb_context *ctdb,
4042 int32_t status, TDB_DATA data,
4043 const char *errormsg,
4046 struct delete_ip_callback_state *state =
4047 talloc_get_type(private_data, struct delete_ip_callback_state);
4049 /* If release failed then fail. */
4050 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4051 talloc_free(private_data);
/*
 * Control handler for deleting a public address.
 *
 * indata.dptr is interpreted as a struct ctdb_control_ip_iface.  The list
 * ctdb->vnn is searched for an entry whose public_address matches
 * pub->addr:
 *  - if that entry's pnn equals ctdb->pnn, the entry is marked
 *    delete_pending, a CTDB_CONTROL_RELEASE_IP control is sent and
 *    *async_reply is set to true;
 *  - otherwise do_delete_ip() is called for the entry.
 * An address that is not found is logged as unknown.
 */
4054 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4055 struct ctdb_req_control *c,
4056 TDB_DATA indata, bool *async_reply)
4058 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4059 struct ctdb_vnn *vnn;
4061 /* verify the size of indata: it must at least cover the part of the
   structure that precedes the iface member */
4062 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4063 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
/* second size check: according to the message below, the expected size is
   the offset of iface plus pub->len */
4067 ( offsetof(struct ctdb_control_ip_iface, iface)
4070 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4071 "but should be %u bytes\n",
4072 (unsigned)indata.dsize,
4073 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4077 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4079 /* walk over all public addresses until we find a match */
4080 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4081 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
/* the address is currently assigned to this node */
4082 if (vnn->pnn == ctdb->pnn) {
4083 struct delete_ip_callback_state *state;
4084 struct ctdb_public_ip *ip;
/* mark the entry as having a deletion in progress */
4088 vnn->delete_pending = true;
4090 state = talloc(ctdb,
4091 struct delete_ip_callback_state);
4092 CTDB_NO_MEMORY(ctdb, state);
/* build the ctdb_public_ip payload for the release control as a
   talloc child of state */
4095 ip = talloc(state, struct ctdb_public_ip);
4098 (__location__ " Out of memory\n"));
4103 ip->addr = pub->addr;
4105 data.dsize = sizeof(struct ctdb_public_ip);
4106 data.dptr = (unsigned char *)ip;
4108 ret = ctdb_daemon_send_control(ctdb,
4111 CTDB_CONTROL_RELEASE_IP,
4118 (__location__ "Unable to send "
4119 "CTDB_CONTROL_RELEASE_IP\n"));
/* reparent the request onto state and report that the reply to this
   control will be sent later */
4124 state->c = talloc_steal(state, c);
4125 *async_reply = true;
4127 /* This IP is not hosted on the
4128 * current node so just delete it
4130 do_delete_ip(ctdb, vnn);
4137 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4138 ctdb_addr_to_str(&pub->addr)));
/*
 * State passed to ctdb_ipreallocated_callback(): carries the control
 * request that is answered when the callback runs.
 */
4143 struct ipreallocated_callback_state {
/* request that ctdb_control_ipreallocated() reparents onto this state */
4144 struct ctdb_req_control *c;
/*
 * Callback given to ctdb_event_script_callback() for the "ipreallocated"
 * event.
 *
 * p is the ipreallocated_callback_state.  A failure message containing
 * the status is logged, ctdb_ban_self() is called when status is -ETIME,
 * and the stored control request is answered with the status.
 */
4147 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4148 int status, void *p)
4150 struct ipreallocated_callback_state *state =
4151 talloc_get_type(p, struct ipreallocated_callback_state);
4155 (" \"ipreallocated\" event script failed (status %d)\n",
/* NOTE(review): -ETIME presumably means the event script timed out;
   confirm against the event script code */
4157 if (status == -ETIME) {
4158 ctdb_ban_self(ctdb);
/* pass the status back to whoever sent the control */
4162 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4166 /* A control to run the ipreallocated event.
   The event is started through ctdb_event_script_callback() with
   ctdb_ipreallocated_callback() as its callback; that callback sends the
   reply, so *async_reply is set to true here. */
4167 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4168 struct ctdb_req_control *c,
4172 struct ipreallocated_callback_state *state;
4174 state = talloc(ctdb, struct ipreallocated_callback_state);
4175 CTDB_NO_MEMORY(ctdb, state);
4177 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
/* state is passed twice: as the second argument and as the argument that
   follows the callback */
4179 ret = ctdb_event_script_callback(ctdb, state,
4180 ctdb_ipreallocated_callback, state,
4181 CTDB_EVENT_IPREALLOCATED,
4185 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4190 /* tell the control that we will reply asynchronously; the request is
   reparented onto state for use by the callback */
4191 state->c = talloc_steal(state, c);
4192 *async_reply = true;
4198 /* This function is called from the recovery daemon to verify that a remote
4199 node has the expected ip allocation.
4200 This is verified against ctdb->ip_tree
/*
 * Compare the public IPs listed in ips with the assignments recorded in
 * ctdb->ip_tree.
 *
 * If ctdb->ip_tree is NULL the expected allocation is not known yet.
 * Otherwise every ips->ips[i].addr is looked up in the tree.  An address
 * with no tree entry is logged as new or unknown.  Entries where either
 * side has pnn -1 are special-cased before the comparison.  A remaining
 * pnn mismatch is logged as an inconsistent IP allocation.
 */
4202 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4203 struct ctdb_all_public_ips *ips,
4206 struct ctdb_public_ip_list *tmp_ip;
4209 if (ctdb->ip_tree == NULL) {
4210 /* dont know the expected allocation yet, assume remote node
4219 for (i=0; i<ips->num; i++) {
4220 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4221 if (tmp_ip == NULL) {
4222 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4226 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4230 if (tmp_ip->pnn != ips->ips[i].pnn) {
4232 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4234 ctdb_addr_to_str(&ips->ips[i].addr),
4235 ips->ips[i].pnn, tmp_ip->pnn));
/*
 * Record in ctdb->ip_tree that address ip->addr is assigned to node
 * ip->pnn.
 *
 * An error is logged when the tree does not exist yet or when it holds no
 * record for the address.
 */
4243 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4245 struct ctdb_public_ip_list *tmp_ip;
4247 if (ctdb->ip_tree == NULL) {
4248 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
/* find the existing record for this address */
4252 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4253 if (tmp_ip == NULL) {
4254 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
/* log the old and new owner, then overwrite the recorded owner */
4258 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4259 tmp_ip->pnn = ip->pnn;
/*
 * Bookkeeping for one reload of the public addresses; allocated by
 * ctdb_control_reload_public_ips().  Only some of the members are shown
 * in this listing.
 */
4265 struct ctdb_reloadips_handle {
4266 struct ctdb_context *ctdb;
/* control request that ctdb_reloadips_destructor() replies to */
4267 struct ctdb_req_control *c;
/* fd event created with event_add_fd() on fd[0] for EVENT_FD_READ */
4271 struct fd_event *fde;
/*
 * talloc destructor for a reload handle; installed with
 * talloc_set_destructor() in ctdb_control_reload_public_ips().
 *
 * Clears ctdb->reload_ips when it points at this handle, replies to the
 * stored control request with h->status, and sends SIGKILL to the child.
 * NOTE(review): lines between the visible statements are not shown here,
 * so any further conditions on the reply or the kill are undocumented.
 */
4274 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
/* only reset the context's pointer if it still refers to this handle */
4276 if (h == h->ctdb->reload_ips) {
4277 h->ctdb->reload_ips = NULL;
4280 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4283 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
 * Timed-event handler registered by ctdb_control_reload_public_ips() with
 * timeval_current_ofs(120, 0); private_data is the reload handle.
 * NOTE(review): what is done with the handle is on lines not shown here.
 */
4287 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4288 struct timed_event *te,
4289 struct timeval t, void *private_data)
4291 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
 * fd-event handler for the read end of the pipe shared with the reloadips
 * child; registered by ctdb_control_reload_public_ips().
 *
 * Reads one byte from h->fd[0].  A read of less than one byte or a
 * non-zero byte is logged as an error returned by the child.
 */
4296 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
4297 uint16_t flags, void *private_data)
4299 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/* the child writes its one-byte result to the other end of the pipe */
4304 ret = sys_read(h->fd[0], &res, 1);
4305 if (ret < 1 || res != 0) {
4306 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
 * Work done by the reloadips child process; called from the forked child
 * in ctdb_control_reload_public_ips().
 *
 * Steps:
 *  1. fetch the public IPs currently known to the local node;
 *  2. re-read the public addresses with ctdb_set_public_addresses();
 *  3. for every IP the node has that is no longer in ctdb->vnn, queue a
 *     CTDB_CONTROL_DEL_PUBLIC_IP control;
 *  4. for every entry in ctdb->vnn that the node does not have, queue a
 *     CTDB_CONTROL_ADD_PUBLIC_IP control;
 *  5. wait for all queued controls with ctdb_client_async_wait().
 * All temporary allocations except the joined interface string hang off
 * mem_ctx.
 */
4314 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4316 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4317 struct ctdb_all_public_ips *ips;
4318 struct ctdb_vnn *vnn;
4319 struct client_async_data *async_data;
4320 struct timeval timeout;
4322 struct ctdb_client_control_state *state;
/* mem_ctx was created by talloc_new(NULL) in its declaration above */
4326 CTDB_NO_MEMORY(ctdb, mem_ctx);
4328 /* Read IPs from local node */
4329 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4330 CTDB_CURRENT_NODE, mem_ctx, &ips);
4333 ("Unable to fetch public IPs from local node\n"));
4334 talloc_free(mem_ctx);
4338 /* Read IPs file - this is safe since this is a child process */
4340 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4341 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4342 talloc_free(mem_ctx);
/* collects the controls sent below so they can be waited for together */
4346 async_data = talloc_zero(mem_ctx, struct client_async_data);
4347 CTDB_NO_MEMORY(ctdb, async_data);
4349 /* Compare IPs between node and file for IPs to be deleted */
4350 for (i = 0; i < ips->num; i++) {
4352 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4353 if (ctdb_same_ip(&vnn->public_address,
4354 &ips->ips[i].addr)) {
4355 /* IP is still in file */
4361 /* Delete IP ips->ips[i] */
4362 struct ctdb_control_ip_iface *pub;
4365 ("IP %s no longer configured, deleting it\n",
4366 ctdb_addr_to_str(&ips->ips[i].addr)));
/* zero-initialised request; only the address is filled in by the lines
   visible here */
4368 pub = talloc_zero(mem_ctx,
4369 struct ctdb_control_ip_iface);
4370 CTDB_NO_MEMORY(ctdb, pub);
4372 pub->addr = ips->ips[i].addr;
4376 timeout = TAKEOVER_TIMEOUT();
4378 data.dsize = offsetof(struct ctdb_control_ip_iface,
4380 data.dptr = (uint8_t *)pub;
4382 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4383 CTDB_CONTROL_DEL_PUBLIC_IP,
4384 0, data, async_data,
4386 if (state == NULL) {
4389 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
/* register the pending control so the final wait covers it */
4393 ctdb_client_async_add(async_data, state);
4397 /* Compare IPs between node and file for IPs to be added */
4399 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4400 for (i = 0; i < ips->num; i++) {
4401 if (ctdb_same_ip(&vnn->public_address,
4402 &ips->ips[i].addr)) {
4403 /* IP already on node */
/* the inner loop ran to its end, so no IP on the node matched this vnn */
4407 if (i == ips->num) {
4408 /* Add IP vnn->public_address */
4409 struct ctdb_control_ip_iface *pub;
4410 const char *ifaces = NULL;
4415 ("New IP %s configured, adding it\n",
4416 ctdb_addr_to_str(&vnn->public_address)));
/* send this node's pnn with CTDB_SRVID_REBALANCE_NODE to the connected
   nodes; a failure is only logged as a warning */
4418 uint32_t pnn = ctdb_get_pnn(ctdb);
4420 data.dsize = sizeof(pnn);
4421 data.dptr = (uint8_t *)&pnn;
4423 ret = ctdb_client_send_message(
4425 CTDB_BROADCAST_CONNECTED,
4426 CTDB_SRVID_REBALANCE_NODE,
4429 DEBUG(DEBUG_WARNING,
4430 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/* join the vnn's interface names into one comma-separated string; note
   that each intermediate string is allocated on vnn, not on mem_ctx, and
   the talloc_asprintf() result is not checked in the lines visible here */
4436 ifaces = vnn->ifaces[0];
4438 while (vnn->ifaces[iface] != NULL) {
4439 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4440 vnn->ifaces[iface]);
/* len includes the terminating NUL; the request is sized to hold the
   part of the structure before iface plus the string */
4444 len = strlen(ifaces) + 1;
4445 pub = talloc_zero_size(mem_ctx,
4446 offsetof(struct ctdb_control_ip_iface, iface) + len);
4447 CTDB_NO_MEMORY(ctdb, pub);
4449 pub->addr = vnn->public_address;
4450 pub->mask = vnn->public_netmask_bits;
/* NOTE(review): pub->len is assigned on a line not shown here; presumably
   it is set to len */
4452 memcpy(&pub->iface[0], ifaces, pub->len);
4454 timeout = TAKEOVER_TIMEOUT();
4456 data.dsize = offsetof(struct ctdb_control_ip_iface,
4458 data.dptr = (uint8_t *)pub;
4460 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4461 CTDB_CONTROL_ADD_PUBLIC_IP,
4462 0, data, async_data,
4464 if (state == NULL) {
4467 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
/* register the pending control so the final wait covers it */
4471 ctdb_client_async_add(async_data, state);
/* wait for every queued add/delete control to complete */
4475 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4476 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
/* mem_ctx is released after the failure message above and again, on what
   is presumably the normal path, below */
4480 talloc_free(mem_ctx);
4484 talloc_free(mem_ctx);
4488 /* This control is sent to force the node to re-read the public addresses file
4489 and drop any addresses we should no longer host, and add new addresses
4490 that we are now able to host
4492 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4494 struct ctdb_reloadips_handle *h;
4495 pid_t parent = getpid();
4497 if (ctdb->reload_ips != NULL) {
4498 talloc_free(ctdb->reload_ips);
4499 ctdb->reload_ips = NULL;
4502 h = talloc(ctdb, struct ctdb_reloadips_handle);
4503 CTDB_NO_MEMORY(ctdb, h);
4508 if (pipe(h->fd) == -1) {
4509 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4514 h->child = ctdb_fork(ctdb);
4515 if (h->child == (pid_t)-1) {
4516 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4524 if (h->child == 0) {
4525 signed char res = 0;
4528 debug_extra = talloc_asprintf(NULL, "reloadips:");
4530 ctdb_set_process_name("ctdb_reloadips");
4531 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4532 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4535 res = ctdb_reloadips_child(ctdb);
4537 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4541 sys_write(h->fd[1], &res, 1);
4542 /* make sure we die when our parent dies */
4543 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4549 h->c = talloc_steal(h, c);
4552 set_close_on_exec(h->fd[0]);
4554 talloc_set_destructor(h, ctdb_reloadips_destructor);
4557 h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4558 EVENT_FD_READ, ctdb_reloadips_child_handler,
4560 tevent_fd_set_auto_close(h->fde);
4562 event_add_timed(ctdb->ev, h,
4563 timeval_current_ofs(120, 0),
4564 ctdb_reloadips_timeout_event, h);
4566 /* we reply later */
4567 *async_reply = true;