4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT 3
36 /* Flags used in IP allocation algorithms. */
43 struct ctdb_iface *prev, *next;
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
52 return vnn->iface->name;
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
62 /* Verify that we don't have an entry for this interface yet */
63 for (i=ctdb->ifaces;i;i=i->next) {
64 if (strcmp(i->name, iface) == 0) {
69 /* create a new structure for this interface */
70 i = talloc_zero(ctdb, struct ctdb_iface);
71 CTDB_NO_MEMORY_FATAL(ctdb, i);
72 i->name = talloc_strdup(i, iface);
73 CTDB_NO_MEMORY(ctdb, i->name);
75 * If link_up defaults to true then IPs can be allocated to a
76 * node during the first recovery. However, then an interface
77 * could have its link marked down during the startup event,
78 * causing the IP to move almost immediately. If link_up
79 * defaults to false then, during normal operation, IPs added
80 * to a new interface can't be assigned until a monitor cycle
81 * has occurred and marked the new interfaces up. This makes
82 * IP allocation unpredictable. The following is a neat
83 * compromise: early in startup link_up defaults to false, so
84 * IPs can't be assigned, and after startup IPs can be
85 * assigned immediately.
87 i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
89 DLIST_ADD(ctdb->ifaces, i);
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
99 for (n = 0; vnn->ifaces[n] != NULL; n++) {
100 if (strcmp(name, vnn->ifaces[n]) == 0) {
108 /* If any interfaces now have no possible IPs then delete them. This
109 * implementation is naive (i.e. simple) rather than clever
110 * (i.e. complex). Given that this is run on delip and that operation
111 * is rare, this doesn't need to be efficient - it needs to be
112 * foolproof. One alternative is reference counting, where the logic
113 * is distributed and can, therefore, be broken in multiple places.
114 * Another alternative is to build a red-black tree of interfaces that
115 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116 * once) and then walking ctdb->ifaces once and deleting those not in
117 * the tree. Let's go to one of those if the naive implementation
118 * causes problems... :-)
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121 struct ctdb_vnn *vnn,
124 struct ctdb_iface *i;
126 /* For each interface, check if there's an IP using it. */
127 for(i=ctdb->ifaces; i; i=i->next) {
131 /* Only consider interfaces named in the given VNN. */
132 if (!vnn_has_interface_with_name(vnn, i->name)) {
136 /* Is the "single IP" on this interface? */
137 if ((ctdb->single_ip_vnn != NULL) &&
138 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140 /* Found, next interface please... */
143 /* Search for a vnn with this interface. */
145 for (tv=ctdb->vnn; tv; tv=tv->next) {
146 if (vnn_has_interface_with_name(tv, i->name)) {
153 /* None of the VNNs are using this interface. */
154 DLIST_REMOVE(ctdb->ifaces, i);
155 /* Caller will free mem_ctx when convenient. */
156 talloc_steal(mem_ctx, i);
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
165 struct ctdb_iface *i;
167 for (i=ctdb->ifaces;i;i=i->next) {
168 if (strcmp(i->name, iface) == 0) {
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177 struct ctdb_vnn *vnn)
180 struct ctdb_iface *cur = NULL;
181 struct ctdb_iface *best = NULL;
183 for (i=0; vnn->ifaces[i]; i++) {
185 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
199 if (cur->references < best->references) {
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209 struct ctdb_vnn *vnn)
211 struct ctdb_iface *best = NULL;
214 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215 "still assigned to iface '%s'\n",
216 ctdb_addr_to_str(&vnn->public_address),
217 ctdb_vnn_iface_string(vnn)));
221 best = ctdb_vnn_best_iface(ctdb, vnn);
223 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224 "cannot assign to iface any iface\n",
225 ctdb_addr_to_str(&vnn->public_address)));
231 vnn->pnn = ctdb->pnn;
233 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234 "now assigned to iface '%s' refs[%d]\n",
235 ctdb_addr_to_str(&vnn->public_address),
236 ctdb_vnn_iface_string(vnn),
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242 struct ctdb_vnn *vnn)
244 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245 "now unassigned (old iface '%s' refs[%d])\n",
246 ctdb_addr_to_str(&vnn->public_address),
247 ctdb_vnn_iface_string(vnn),
248 vnn->iface?vnn->iface->references:0));
250 vnn->iface->references--;
253 if (vnn->pnn == ctdb->pnn) {
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259 struct ctdb_vnn *vnn)
263 if (vnn->delete_pending) {
267 if (vnn->iface && vnn->iface->link_up) {
271 for (i=0; vnn->ifaces[i]; i++) {
272 struct ctdb_iface *cur;
274 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
287 struct ctdb_takeover_arp {
288 struct ctdb_context *ctdb;
291 struct ctdb_tcp_array *tcparray;
292 struct ctdb_vnn *vnn;
297 lists of tcp endpoints
299 struct ctdb_tcp_list {
300 struct ctdb_tcp_list *prev, *next;
301 struct ctdb_tcp_connection connection;
305 list of clients to kill on IP release
307 struct ctdb_client_ip {
308 struct ctdb_client_ip *prev, *next;
309 struct ctdb_context *ctdb;
316 send a gratuitous arp
318 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
319 struct timeval t, void *private_data)
321 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
322 struct ctdb_takeover_arp);
324 struct ctdb_tcp_array *tcparray;
325 const char *iface = ctdb_vnn_iface_string(arp->vnn);
327 ret = ctdb_sys_send_arp(&arp->addr, iface);
329 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
330 iface, strerror(errno)));
333 tcparray = arp->tcparray;
335 for (i=0;i<tcparray->num;i++) {
336 struct ctdb_tcp_connection *tcon;
338 tcon = &tcparray->connections[i];
339 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
340 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
341 ctdb_addr_to_str(&tcon->src_addr),
342 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
343 ret = ctdb_sys_send_tcp(
348 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
349 ctdb_addr_to_str(&tcon->src_addr)));
356 if (arp->count == CTDB_ARP_REPEAT) {
361 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
362 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
363 ctdb_control_send_arp, arp);
366 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
367 struct ctdb_vnn *vnn)
369 struct ctdb_takeover_arp *arp;
370 struct ctdb_tcp_array *tcparray;
372 if (!vnn->takeover_ctx) {
373 vnn->takeover_ctx = talloc_new(vnn);
374 if (!vnn->takeover_ctx) {
379 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
385 arp->addr = vnn->public_address;
388 tcparray = vnn->tcp_array;
390 /* add all of the known tcp connections for this IP to the
391 list of tcp connections to send tickle acks for */
392 arp->tcparray = talloc_steal(arp, tcparray);
394 vnn->tcp_array = NULL;
395 vnn->tcp_update_needed = true;
398 event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
399 timeval_zero(), ctdb_control_send_arp, arp);
404 struct takeover_callback_state {
405 struct ctdb_req_control *c;
406 ctdb_sock_addr *addr;
407 struct ctdb_vnn *vnn;
410 struct ctdb_do_takeip_state {
411 struct ctdb_req_control *c;
412 struct ctdb_vnn *vnn;
416 called when takeip event finishes
418 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
421 struct ctdb_do_takeip_state *state =
422 talloc_get_type(private_data, struct ctdb_do_takeip_state);
427 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
429 if (status == -ETIME) {
432 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
433 ctdb_addr_to_str(&state->vnn->public_address),
434 ctdb_vnn_iface_string(state->vnn)));
435 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
437 node->flags |= NODE_FLAGS_UNHEALTHY;
442 if (ctdb->do_checkpublicip) {
444 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
446 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
453 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
454 data.dsize = strlen((char *)data.dptr) + 1;
455 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
457 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
460 /* the control succeeded */
461 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
466 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
468 state->vnn->update_in_flight = false;
473 take over an ip address
475 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
476 struct ctdb_req_control *c,
477 struct ctdb_vnn *vnn)
480 struct ctdb_do_takeip_state *state;
482 if (vnn->update_in_flight) {
483 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
484 "update for this IP already in flight\n",
485 ctdb_addr_to_str(&vnn->public_address),
486 vnn->public_netmask_bits));
490 ret = ctdb_vnn_assign_iface(ctdb, vnn);
492 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
493 "assign a usable interface\n",
494 ctdb_addr_to_str(&vnn->public_address),
495 vnn->public_netmask_bits));
499 state = talloc(vnn, struct ctdb_do_takeip_state);
500 CTDB_NO_MEMORY(ctdb, state);
502 state->c = talloc_steal(ctdb, c);
505 vnn->update_in_flight = true;
506 talloc_set_destructor(state, ctdb_takeip_destructor);
508 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits,
511 ctdb_vnn_iface_string(vnn)));
513 ret = ctdb_event_script_callback(ctdb,
515 ctdb_do_takeip_callback,
519 ctdb_vnn_iface_string(vnn),
520 ctdb_addr_to_str(&vnn->public_address),
521 vnn->public_netmask_bits);
524 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
525 ctdb_addr_to_str(&vnn->public_address),
526 ctdb_vnn_iface_string(vnn)));
534 struct ctdb_do_updateip_state {
535 struct ctdb_req_control *c;
536 struct ctdb_iface *old;
537 struct ctdb_vnn *vnn;
541 called when updateip event finishes
543 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
546 struct ctdb_do_updateip_state *state =
547 talloc_get_type(private_data, struct ctdb_do_updateip_state);
551 if (status == -ETIME) {
554 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
555 ctdb_addr_to_str(&state->vnn->public_address),
557 ctdb_vnn_iface_string(state->vnn)));
560 * All we can do is reset the old interface
561 * and let the next run fix it
563 ctdb_vnn_unassign_iface(ctdb, state->vnn);
564 state->vnn->iface = state->old;
565 state->vnn->iface->references++;
567 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
572 if (ctdb->do_checkpublicip) {
574 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
576 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
583 /* the control succeeded */
584 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
589 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
591 state->vnn->update_in_flight = false;
596 update (move) an ip address
598 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
599 struct ctdb_req_control *c,
600 struct ctdb_vnn *vnn)
603 struct ctdb_do_updateip_state *state;
604 struct ctdb_iface *old = vnn->iface;
605 const char *new_name;
607 if (vnn->update_in_flight) {
608 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
609 "update for this IP already in flight\n",
610 ctdb_addr_to_str(&vnn->public_address),
611 vnn->public_netmask_bits));
615 ctdb_vnn_unassign_iface(ctdb, vnn);
616 ret = ctdb_vnn_assign_iface(ctdb, vnn);
618 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
619 "assin a usable interface (old iface '%s')\n",
620 ctdb_addr_to_str(&vnn->public_address),
621 vnn->public_netmask_bits,
626 new_name = ctdb_vnn_iface_string(vnn);
627 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
628 /* A benign update from one interface onto itself.
629 * no need to run the eventscripts in this case, just return
632 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
636 state = talloc(vnn, struct ctdb_do_updateip_state);
637 CTDB_NO_MEMORY(ctdb, state);
639 state->c = talloc_steal(ctdb, c);
643 vnn->update_in_flight = true;
644 talloc_set_destructor(state, ctdb_updateip_destructor);
646 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
647 "interface %s to %s\n",
648 ctdb_addr_to_str(&vnn->public_address),
649 vnn->public_netmask_bits,
653 ret = ctdb_event_script_callback(ctdb,
655 ctdb_do_updateip_callback,
657 CTDB_EVENT_UPDATE_IP,
661 ctdb_addr_to_str(&vnn->public_address),
662 vnn->public_netmask_bits);
664 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
665 ctdb_addr_to_str(&vnn->public_address),
666 old->name, new_name));
675 Find the vnn of the node that has a public ip address
676 returns -1 if the address is not known as a public address
678 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
680 struct ctdb_vnn *vnn;
682 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
683 if (ctdb_same_ip(&vnn->public_address, addr)) {
692 take over an ip address
694 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
695 struct ctdb_req_control *c,
700 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
701 struct ctdb_vnn *vnn;
702 bool have_ip = false;
703 bool do_updateip = false;
704 bool do_takeip = false;
705 struct ctdb_iface *best_iface = NULL;
707 if (pip->pnn != ctdb->pnn) {
708 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
709 "with pnn %d, but we're node %d\n",
710 ctdb_addr_to_str(&pip->addr),
711 pip->pnn, ctdb->pnn));
715 /* update our vnn list */
716 vnn = find_public_ip_vnn(ctdb, &pip->addr);
718 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
719 ctdb_addr_to_str(&pip->addr)));
723 if (ctdb->do_checkpublicip) {
724 have_ip = ctdb_sys_have_ip(&pip->addr);
726 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
727 if (best_iface == NULL) {
728 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
729 "a usable interface (old %s, have_ip %d)\n",
730 ctdb_addr_to_str(&vnn->public_address),
731 vnn->public_netmask_bits,
732 ctdb_vnn_iface_string(vnn),
737 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
738 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
743 if (vnn->iface == NULL && have_ip) {
744 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
746 ctdb_addr_to_str(&vnn->public_address)));
750 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
751 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752 "and we have it on iface[%s], but it was assigned to node %d"
753 "and we are node %d, banning ourself\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
760 if (vnn->pnn == -1 && have_ip) {
761 vnn->pnn = ctdb->pnn;
762 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
763 "and we already have it on iface[%s], update local daemon\n",
764 ctdb_addr_to_str(&vnn->public_address),
765 ctdb_vnn_iface_string(vnn)));
770 if (vnn->iface != best_iface) {
771 if (!vnn->iface->link_up) {
773 } else if (vnn->iface->references > (best_iface->references + 1)) {
774 /* only move when the rebalance gains something */
782 ctdb_vnn_unassign_iface(ctdb, vnn);
789 ret = ctdb_do_takeip(ctdb, c, vnn);
793 } else if (do_updateip) {
794 ret = ctdb_do_updateip(ctdb, c, vnn);
800 * The interface is up and the kernel knows the ip
803 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
804 ctdb_addr_to_str(&pip->addr),
805 vnn->public_netmask_bits,
806 ctdb_vnn_iface_string(vnn)));
810 /* tell ctdb_control.c that we will be replying asynchronously */
817 takeover an ip address old v4 style
819 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb,
820 struct ctdb_req_control *c,
826 data.dsize = sizeof(struct ctdb_public_ip);
827 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
828 CTDB_NO_MEMORY(ctdb, data.dptr);
830 memcpy(data.dptr, indata.dptr, indata.dsize);
831 return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
835 kill any clients that are registered with an IP that is being released
837 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
839 struct ctdb_client_ip *ip;
841 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
842 ctdb_addr_to_str(addr)));
844 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
845 ctdb_sock_addr tmp_addr;
848 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
850 ctdb_addr_to_str(&ip->addr)));
852 if (ctdb_same_ip(&tmp_addr, addr)) {
853 struct ctdb_client *client = ctdb_reqid_find(ctdb,
856 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
858 ctdb_addr_to_str(&ip->addr),
861 if (client->pid != 0) {
862 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
863 (unsigned)client->pid,
864 ctdb_addr_to_str(addr),
866 kill(client->pid, SIGKILL);
872 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
874 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
876 DLIST_REMOVE(ctdb->vnn, vnn);
877 ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
878 ctdb_vnn_unassign_iface(ctdb, vnn);
880 talloc_free(mem_ctx);
884 called when releaseip event finishes
886 static void release_ip_callback(struct ctdb_context *ctdb, int status,
889 struct takeover_callback_state *state =
890 talloc_get_type(private_data, struct takeover_callback_state);
893 if (status == -ETIME) {
897 if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
898 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
899 ctdb_addr_to_str(state->addr)));
900 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
905 /* send a message to all clients of this node telling them
906 that the cluster has been reconfigured and they should
907 release any sockets on this IP */
908 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
909 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
910 data.dsize = strlen((char *)data.dptr)+1;
912 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
914 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
916 /* kill clients that have registered with this IP */
917 release_kill_clients(ctdb, state->addr);
919 ctdb_vnn_unassign_iface(ctdb, state->vnn);
921 /* Process the IP if it has been marked for deletion */
922 if (state->vnn->delete_pending) {
923 do_delete_ip(ctdb, state->vnn);
927 /* the control succeeded */
928 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
932 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
934 if (state->vnn != NULL) {
935 state->vnn->update_in_flight = false;
941 release an ip address
943 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
944 struct ctdb_req_control *c,
949 struct takeover_callback_state *state;
950 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
951 struct ctdb_vnn *vnn;
954 /* update our vnn list */
955 vnn = find_public_ip_vnn(ctdb, &pip->addr);
957 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
958 ctdb_addr_to_str(&pip->addr)));
963 /* stop any previous arps */
964 talloc_free(vnn->takeover_ctx);
965 vnn->takeover_ctx = NULL;
967 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
968 * lazy multicast to drop an IP from any node that isn't the
969 * intended new node. The following makes ctdbd ignore
970 * a release for any address it doesn't host.
972 if (ctdb->do_checkpublicip) {
973 if (!ctdb_sys_have_ip(&pip->addr)) {
974 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
975 ctdb_addr_to_str(&pip->addr),
976 vnn->public_netmask_bits,
977 ctdb_vnn_iface_string(vnn)));
978 ctdb_vnn_unassign_iface(ctdb, vnn);
982 if (vnn->iface == NULL) {
983 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
984 ctdb_addr_to_str(&pip->addr),
985 vnn->public_netmask_bits));
990 /* There is a potential race between take_ip and us because we
991 * update the VNN via a callback that runs when the
992 * eventscripts have been run. Avoid the race by allowing one
993 * update to be in flight at a time.
995 if (vnn->update_in_flight) {
996 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
997 "update for this IP already in flight\n",
998 ctdb_addr_to_str(&vnn->public_address),
999 vnn->public_netmask_bits));
1003 if (ctdb->do_checkpublicip) {
1004 iface = ctdb_sys_find_ifname(&pip->addr);
1005 if (iface == NULL) {
1006 DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
1009 if (vnn->iface == NULL) {
1010 DEBUG(DEBUG_WARNING,
1011 ("Public IP %s is hosted on interface %s but we have no VNN\n",
1012 ctdb_addr_to_str(&pip->addr),
1014 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
1015 DEBUG(DEBUG_WARNING,
1016 ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
1017 ctdb_addr_to_str(&pip->addr),
1019 ctdb_vnn_iface_string(vnn)));
1020 /* Should we fix vnn->iface? If we do, what
1021 * happens to reference counts?
1025 iface = strdup(ctdb_vnn_iface_string(vnn));
1028 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1029 ctdb_addr_to_str(&pip->addr),
1030 vnn->public_netmask_bits,
1034 state = talloc(ctdb, struct takeover_callback_state);
1035 CTDB_NO_MEMORY(ctdb, state);
1037 state->c = talloc_steal(state, c);
1038 state->addr = talloc(state, ctdb_sock_addr);
1039 CTDB_NO_MEMORY(ctdb, state->addr);
1040 *state->addr = pip->addr;
1043 vnn->update_in_flight = true;
1044 talloc_set_destructor(state, ctdb_releaseip_destructor);
1046 ret = ctdb_event_script_callback(ctdb,
1047 state, release_ip_callback, state,
1048 CTDB_EVENT_RELEASE_IP,
1051 ctdb_addr_to_str(&pip->addr),
1052 vnn->public_netmask_bits);
1055 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1056 ctdb_addr_to_str(&pip->addr),
1057 ctdb_vnn_iface_string(vnn)));
1062 /* tell the control that we will be replying asynchronously */
1063 *async_reply = true;
1068 release an ip address old v4 style
1070 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb,
1071 struct ctdb_req_control *c,
1077 data.dsize = sizeof(struct ctdb_public_ip);
1078 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1079 CTDB_NO_MEMORY(ctdb, data.dptr);
1081 memcpy(data.dptr, indata.dptr, indata.dsize);
1082 return ctdb_control_release_ip(ctdb, c, data, async_reply);
1086 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1087 ctdb_sock_addr *addr,
1088 unsigned mask, const char *ifaces,
1091 struct ctdb_vnn *vnn;
1098 tmp = strdup(ifaces);
1099 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100 if (!ctdb_sys_check_iface_exists(iface)) {
1101 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1108 /* Verify that we don't have an entry for this ip yet */
1109 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1110 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1111 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1112 ctdb_addr_to_str(addr)));
1117 /* create a new vnn structure for this ip address */
1118 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1119 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1120 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1121 tmp = talloc_strdup(vnn, ifaces);
1122 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1123 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1124 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1125 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1126 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1127 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1131 vnn->ifaces[num] = NULL;
1132 vnn->public_address = *addr;
1133 vnn->public_netmask_bits = mask;
1135 if (check_address) {
1136 if (ctdb_sys_have_ip(addr)) {
1137 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1138 vnn->pnn = ctdb->pnn;
1142 for (i=0; vnn->ifaces[i]; i++) {
1143 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1145 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1146 "for public_address[%s]\n",
1147 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1153 DLIST_ADD(ctdb->vnn, vnn);
1158 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te,
1159 struct timeval t, void *private_data)
1161 struct ctdb_context *ctdb = talloc_get_type(private_data,
1162 struct ctdb_context);
1163 struct ctdb_vnn *vnn;
1165 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1168 for (i=0; vnn->ifaces[i] != NULL; i++) {
1169 if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1170 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1172 ctdb_addr_to_str(&vnn->public_address)));
1177 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
1178 timeval_current_ofs(30, 0),
1179 ctdb_check_interfaces_event, ctdb);
1183 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1185 if (ctdb->check_public_ifaces_ctx != NULL) {
1186 talloc_free(ctdb->check_public_ifaces_ctx);
1187 ctdb->check_public_ifaces_ctx = NULL;
1190 ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1191 if (ctdb->check_public_ifaces_ctx == NULL) {
1192 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1195 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
1196 timeval_current_ofs(30, 0),
1197 ctdb_check_interfaces_event, ctdb);
1204 setup the public address lists from a file
1206 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1212 lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1213 if (lines == NULL) {
1214 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1217 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1221 for (i=0;i<nlines;i++) {
1223 ctdb_sock_addr addr;
1224 const char *addrstr;
1229 while ((*line == ' ') || (*line == '\t')) {
1235 if (strcmp(line, "") == 0) {
1238 tok = strtok(line, " \t");
1240 tok = strtok(NULL, " \t");
1242 if (NULL == ctdb->default_public_interface) {
1243 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1248 ifaces = ctdb->default_public_interface;
1253 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1254 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1258 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1259 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1270 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1274 struct ctdb_vnn *svnn;
1275 struct ctdb_iface *cur = NULL;
1279 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1280 CTDB_NO_MEMORY(ctdb, svnn);
1282 svnn->ifaces = talloc_array(svnn, const char *, 2);
1283 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1284 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1285 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1286 svnn->ifaces[1] = NULL;
1288 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1294 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1296 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1297 "for single_ip[%s]\n",
1299 ctdb_addr_to_str(&svnn->public_address)));
1304 /* assume the single public ip interface is initially "good" */
1305 cur = ctdb_find_iface(ctdb, iface);
1307 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1310 cur->link_up = true;
1312 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1318 ctdb->single_ip_vnn = svnn;
1322 struct ctdb_public_ip_list {
1323 struct ctdb_public_ip_list *next;
1325 ctdb_sock_addr addr;
1328 /* Given a physical node, return the number of
1329 public addresses that are currently assigned to this node.
1331 static int node_ip_coverage(struct ctdb_context *ctdb,
1333 struct ctdb_public_ip_list *ips)
1337 for (;ips;ips=ips->next) {
1338 if (ips->pnn == pnn) {
1346 /* Can the given node host the given IP: is the public IP known to the
1347 * node and is NOIPHOST unset?
1349 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1350 struct ctdb_ipflags ipflags,
1351 struct ctdb_public_ip_list *ip)
1353 struct ctdb_all_public_ips *public_ips;
1356 if (ipflags.noiphost) {
1360 public_ips = ctdb->nodes[pnn]->available_public_ips;
1362 if (public_ips == NULL) {
1366 for (i=0; i<public_ips->num; i++) {
1367 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1368 /* yes, this node can serve this public ip */
1376 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1377 struct ctdb_ipflags ipflags,
1378 struct ctdb_public_ip_list *ip)
1380 if (ipflags.noiptakeover) {
1384 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1387 /* search the node list for a node to take over this ip.
1388 pick the node that is currently serving the least number of ips
1389 so that the ips get spread out evenly.
1391 static int find_takeover_node(struct ctdb_context *ctdb,
1392 struct ctdb_ipflags *ipflags,
1393 struct ctdb_public_ip_list *ip,
1394 struct ctdb_public_ip_list *all_ips)
1396 int pnn, min=0, num;
1399 numnodes = talloc_array_length(ipflags);
1401 for (i=0; i<numnodes; i++) {
1402 /* verify that this node can serve this ip */
1403 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1404 /* no it couldn't so skip to the next node */
1408 num = node_ip_coverage(ctdb, i, all_ips);
1409 /* was this the first node we checked ? */
1421 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1422 ctdb_addr_to_str(&ip->addr)));
1432 static uint32_t *ip_key(ctdb_sock_addr *ip)
1434 static uint32_t key[IP_KEYLEN];
1436 bzero(key, sizeof(key));
1438 switch (ip->sa.sa_family) {
1440 key[3] = htonl(ip->ip.sin_addr.s_addr);
1443 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1444 key[0] = htonl(s6_a32[0]);
1445 key[1] = htonl(s6_a32[1]);
1446 key[2] = htonl(s6_a32[2]);
1447 key[3] = htonl(s6_a32[3]);
1451 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
/* Callback with the (parm, data) shape used for insertion into the IP
 * tree.
 *
 * parm is the entry being added and data is the entry previously
 * stored under the same key, which may be NULL. When the new entry
 * has no node assigned (pnn == -1) it inherits the pnn recorded on the
 * previous entry.
 *
 * NOTE(review): presumably the returned pointer is what the tree
 * stores - confirm against the rb_tree API.
 */
1458 static void *add_ip_callback(void *parm, void *data)
1460 struct ctdb_public_ip_list *this_ip = parm;
1461 struct ctdb_public_ip_list *prev_ip = data;
1463 if (prev_ip == NULL) {
1466 if (this_ip->pnn == -1) {
1467 this_ip->pnn = prev_ip->pnn;
/* Tree traversal callback used by create_merged_ip_list().
 *
 * param points at the head pointer of the list being built and data
 * is the tree entry being visited. The entry is chained in front of
 * the current head.
 *
 * NOTE(review): presumably the head pointer is then advanced to the
 * new entry - confirm.
 */
1473 static int getips_count_callback(void *param, void *data)
1475 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1476 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1478 new_ip->next = *ip_list;
/* Build the cluster-wide list of public addresses.
 *
 * Any existing ctdb->ip_tree is freed and a fresh tree is created.
 * For every node that is not flagged NODE_FLAGS_DELETED and has
 * known_public_ips, one ctdb_public_ip_list entry per address is
 * allocated as a talloc child of the tree and inserted under the key
 * produced by ip_key(). An entry keeps the reported pnn only when the
 * reporting node itself claims to host the address.
 *
 * The tree is finally traversed with getips_count_callback to chain
 * the entries into a linked list. The entries belong to ctdb->ip_tree,
 * so they are released when the tree is freed on the next call.
 */
1483 static struct ctdb_public_ip_list *
1484 create_merged_ip_list(struct ctdb_context *ctdb)
1487 struct ctdb_public_ip_list *ip_list;
1488 struct ctdb_all_public_ips *public_ips;
1490 if (ctdb->ip_tree != NULL) {
1491 talloc_free(ctdb->ip_tree);
1492 ctdb->ip_tree = NULL;
1494 ctdb->ip_tree = trbt_create(ctdb, 0);
1496 for (i=0;i<ctdb->num_nodes;i++) {
1497 public_ips = ctdb->nodes[i]->known_public_ips;
1499 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1503 /* there were no public ips for this node */
1504 if (public_ips == NULL) {
1508 for (j=0;j<public_ips->num;j++) {
1509 struct ctdb_public_ip_list *tmp_ip;
1511 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1512 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1513 /* Do not use information about IP addresses hosted
1514 * on other nodes, it may not be accurate */
1515 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1516 tmp_ip->pnn = public_ips->ips[j].pnn;
1520 tmp_ip->addr = public_ips->ips[j].addr;
1521 tmp_ip->next = NULL;
1523 trbt_insertarray32_callback(ctdb->ip_tree,
1524 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1531 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
/* Distance metric between two addresses used by the LCP2 code.
 *
 * The key for ip1 is copied out of ip_key() with memcpy because
 * ip_key() reuses one static buffer. The key words are then XORed
 * pairwise and the leading zero bits of each result are counted.
 *
 * The top-bit mask is written as 1U << 31. Shifting the signed
 * constant 1 left by 31 overflows a 32-bit int, which is undefined
 * behaviour; the unsigned constant gives the same mask portably.
 */
1537 * This is the length of the longest common prefix between the IPs.
1538 * It is calculated by XOR-ing the 2 IPs together and counting the
1539 * number of leading zeroes. The implementation means that all
1540 * addresses end up being 128 bits long.
1542 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1543 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1544 * lots of nodes and IP addresses?
1546 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1548 uint32_t ip1_k[IP_KEYLEN];
1553 uint32_t distance = 0;
1555 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1557 for (i=0; i<IP_KEYLEN; i++) {
1558 x = ip1_k[i] ^ t[i];
1562 /* Count number of leading zeroes.
1563 * FIXME? This could be optimised...
1565 while ((x & (1U << 31)) == 0) {
/* Sum of squared ip_distance() values between ip and the list entries
 * that are assigned to one node.
 *
 * Entries whose pnn differs from the requested node are skipped. An
 * entry whose addr member is the very object ip points at is skipped
 * too; the test is a pointer comparison, so ip has to point into the
 * same list for that shortcut to apply.
 */
1575 /* Calculate the IP distance for the given IP relative to IPs on the
1576 given node. The ips argument is generally the all_ips variable
1577 used in the main part of the algorithm.
1579 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1580 struct ctdb_public_ip_list *ips,
1583 struct ctdb_public_ip_list *t;
1588 for (t=ips; t != NULL; t=t->next) {
1589 if (t->pnn != pnn) {
1593 /* Optimisation: We never calculate the distance
1594 * between an address and itself. This allows us to
1595 * calculate the effect of removing an address from a
1596 * node by simply calculating the distance between
1597 * that address and all of the existing addresses.
1598 * Moreover, we assume that we're only ever dealing
1599 * with addresses from all_ips so we can identify an
1600 * address via a pointer rather than doing a more
1601 * expensive address comparison. */
1602 if (&(t->addr) == ip) {
1606 d = ip_distance(ip, &(t->addr));
1607 sum += d * d; /* Cheaper than pulling in math.h :-) */
/* LCP2 imbalance of node pnn.
 *
 * For each address in all_ips that is assigned to pnn, the
 * ip_distance_2_sum() against the entries that follow it in the list
 * is added to the total. Passing t->next rather than the list head
 * means each pair of addresses on the node contributes once.
 */
1613 /* Return the LCP2 imbalance metric for addresses currently assigned
1616 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1618 struct ctdb_public_ip_list *t;
1620 uint32_t imbalance = 0;
1622 for (t=all_ips; t!=NULL; t=t->next) {
1623 if (t->pnn != pnn) {
1626 /* Pass the rest of the IPs rather than the whole
1629 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
/* Walk all_ips and call find_takeover_node() for every entry whose pnn
 * is -1. A DEBUG_WARNING naming the address is logged each time that
 * call returns non-zero.
 */
1635 /* Allocate any unassigned IPs just by looping through the IPs and
1636 * finding the best node for each.
1638 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1639 struct ctdb_ipflags *ipflags,
1640 struct ctdb_public_ip_list *all_ips)
1642 struct ctdb_public_ip_list *tmp_ip;
1644 /* loop over all ip's and find a physical node to cover for
1647 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1648 if (tmp_ip->pnn == -1) {
1649 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1650 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1651 ctdb_addr_to_str(&tmp_ip->addr)));
/* Rebalancing pass for the basic (non-LCP2) allocators.
 *
 * For each assigned address, the nodes accepted by
 * can_node_takeover_ip() are scanned and node_ip_coverage() is used to
 * find the busiest and least busy of them. If no such node exists a
 * warning is logged. When the busiest node serves at least two more
 * addresses than the least busy one, an address currently on the
 * busiest node is handed to find_takeover_node() for reassignment.
 * The retries counter is compared against num_ips + 5 to bound the
 * amount of rebalancing work.
 */
1657 /* Basic non-deterministic rebalancing algorithm.
1659 static void basic_failback(struct ctdb_context *ctdb,
1660 struct ctdb_ipflags *ipflags,
1661 struct ctdb_public_ip_list *all_ips,
1665 int maxnode, maxnum, minnode, minnum, num, retries;
1666 struct ctdb_public_ip_list *tmp_ip;
1668 numnodes = talloc_array_length(ipflags);
1675 /* for each ip address, loop over all nodes that can serve
1676 this ip and make sure that the difference between the node
1677 serving the most and the node serving the least ip's are
1680 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1681 if (tmp_ip->pnn == -1) {
1685 /* Get the highest and lowest number of ips's served by any
1686 valid node which can serve this ip.
1690 for (i=0; i<numnodes; i++) {
1691 /* only check nodes that can actually serve this ip */
1692 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1693 /* no it couldnt so skip to the next node */
1697 num = node_ip_coverage(ctdb, i, all_ips);
1698 if (maxnode == -1) {
1707 if (minnode == -1) {
1717 if (maxnode == -1) {
1718 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1719 ctdb_addr_to_str(&tmp_ip->addr)));
1724 /* if the spread between the smallest and largest coverage by
1725 a node is >=2 we steal one of the ips from the node with
1726 most coverage to even things out a bit.
1727 try to do this a limited number of times since we dont
1728 want to spend too much time balancing the ip coverage.
1730 if ( (maxnum > minnum+1)
1731 && (retries < (num_ips + 5)) ){
1732 struct ctdb_public_ip_list *tmp;
1734 /* Reassign one of maxnode's VNNs */
1735 for (tmp=all_ips;tmp;tmp=tmp->next) {
1736 if (tmp->pnn == maxnode) {
1737 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
/* Prepare the per-node state used by the LCP2 allocator.
 *
 * Two arrays with one slot per entry of ipflags are allocated on
 * tmp_ctx and returned through the out parameters:
 *   *lcp2_imbalances      - lcp2_imbalance() of each node
 *   *rebalance_candidates - whether addresses may be failed back onto
 *                           the node
 * Every node starts as a candidate, nodes that already hold an address
 * are cleared, and nodes listed in force_rebalance_nodes (may be NULL)
 * are set again. A forced pnn that is not below the node count is
 * logged and not used as an index.
 *
 * NOTE(review): the first parameter is declared struct ctdb_context *
 * but ip_alloc_lcp2() passes a TALLOC_CTX * and the value is also
 * handed to CTDB_NO_MEMORY_FATAL. Confirm the intended type.
 */
1746 static void lcp2_init(struct ctdb_context *tmp_ctx,
1747 struct ctdb_ipflags *ipflags,
1748 struct ctdb_public_ip_list *all_ips,
1749 uint32_t *force_rebalance_nodes,
1750 uint32_t **lcp2_imbalances,
1751 bool **rebalance_candidates)
1754 struct ctdb_public_ip_list *tmp_ip;
1756 numnodes = talloc_array_length(ipflags);
1758 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1759 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1760 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1761 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1763 for (i=0; i<numnodes; i++) {
1764 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1765 /* First step: assume all nodes are candidates */
1766 (*rebalance_candidates)[i] = true;
1769 /* 2nd step: if a node has IPs assigned then it must have been
1770 * healthy before, so we remove it from consideration. This
1771 * is overkill but is all we have because we don't maintain
1772 * state between takeover runs. An alternative would be to
1773 * keep state and invalidate it every time the recovery master
1776 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1777 if (tmp_ip->pnn != -1) {
1778 (*rebalance_candidates)[tmp_ip->pnn] = false;
1782 /* 3rd step: if a node is forced to re-balance then
1783 we allow failback onto the node */
1784 if (force_rebalance_nodes == NULL) {
1787 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1788 uint32_t pnn = force_rebalance_nodes[i];
1789 if (pnn >= numnodes) {
1791 (__location__ "unknown node %u\n", pnn));
1796 ("Forcing rebalancing of IPs to node %u\n", pnn));
1797 (*rebalance_candidates)[pnn] = true;
/* Assign unassigned addresses using the LCP2 cost metric.
 *
 * The outer loop runs while have_unassigned and should_loop are both
 * true; should_loop is cleared at the top of every round. In a round,
 * each unassigned address is costed against each node accepted by
 * can_node_takeover_ip(): the cost is ip_distance_2_sum() of the
 * address against that node, and the cheapest combination seen so far
 * is remembered. If a combination was found the address is given to
 * that node and lcp2_imbalances[] for the node is updated. The list is
 * then rescanned to refresh have_unassigned.
 *
 * Addresses that are still unassigned when the loop ends are logged at
 * DEBUG_WARNING.
 */
1801 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1802 * the IP/node combination that will cost the least.
1804 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1805 struct ctdb_ipflags *ipflags,
1806 struct ctdb_public_ip_list *all_ips,
1807 uint32_t *lcp2_imbalances)
1809 struct ctdb_public_ip_list *tmp_ip;
1810 int dstnode, numnodes;
1813 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1814 struct ctdb_public_ip_list *minip;
1816 bool should_loop = true;
1817 bool have_unassigned = true;
1819 numnodes = talloc_array_length(ipflags);
1821 while (have_unassigned && should_loop) {
1822 should_loop = false;
1824 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1825 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1831 /* loop over each unassigned ip. */
1832 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1833 if (tmp_ip->pnn != -1) {
1837 for (dstnode=0; dstnode<numnodes; dstnode++) {
1838 /* only check nodes that can actually takeover this ip */
1839 if (!can_node_takeover_ip(ctdb, dstnode,
1842 /* no it couldnt so skip to the next node */
1846 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1847 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1848 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1849 ctdb_addr_to_str(&(tmp_ip->addr)),
1851 dstimbl - lcp2_imbalances[dstnode]));
1854 if ((minnode == -1) || (dstdsum < mindsum)) {
1864 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1866 /* If we found one then assign it to the given node. */
1867 if (minnode != -1) {
1868 minip->pnn = minnode;
1869 lcp2_imbalances[minnode] = minimbl;
1870 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1871 ctdb_addr_to_str(&(minip->addr)),
1876 /* There might be a better way but at least this is clear. */
1877 have_unassigned = false;
1878 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1879 if (tmp_ip->pnn == -1) {
1880 have_unassigned = true;
1885 /* We know if we have an unassigned addresses so we might as
1888 if (have_unassigned) {
1889 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1890 if (tmp_ip->pnn == -1) {
1891 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1892 ctdb_addr_to_str(&tmp_ip->addr)));
/* Try to move one address away from srcnode.
 *
 * For each address on srcnode:
 *   srcdsum - what the address costs srcnode (ip_distance_2_sum)
 *   srcimbl - imbalance of srcnode without the address
 * and for each node flagged in rebalance_candidates that passes
 * can_node_takeover_ip():
 *   dstdsum - what the address would cost that node
 *   dstimbl - imbalance of that node with the address
 * A move qualifies only if dstimbl is below the current imbalance of
 * srcnode and dstdsum is below srcdsum. Among qualifying moves the one
 * with the smallest srcimbl + dstimbl is kept.
 *
 * If a move was kept it is logged, both lcp2_imbalances[] entries are
 * updated and the address is reassigned to the destination node.
 * lcp2_failback() tests the boolean result.
 *
 * NOTE(review): the trailing backslashes inside the selection
 * condition are line continuations outside any macro; they are
 * harmless but unnecessary.
 */
1898 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1899 * to move IPs from, determines the best IP/destination node
1900 * combination to move from the source node.
1902 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1903 struct ctdb_ipflags *ipflags,
1904 struct ctdb_public_ip_list *all_ips,
1906 uint32_t *lcp2_imbalances,
1907 bool *rebalance_candidates)
1909 int dstnode, mindstnode, numnodes;
1910 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1911 uint32_t minsrcimbl, mindstimbl;
1912 struct ctdb_public_ip_list *minip;
1913 struct ctdb_public_ip_list *tmp_ip;
1915 /* Find an IP and destination node that best reduces imbalance. */
1922 numnodes = talloc_array_length(ipflags);
1924 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1925 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1926 srcnode, lcp2_imbalances[srcnode]));
1928 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1929 /* Only consider addresses on srcnode. */
1930 if (tmp_ip->pnn != srcnode) {
1934 /* What is this IP address costing the source node? */
1935 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1936 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1938 /* Consider this IP address would cost each potential
1939 * destination node. Destination nodes are limited to
1940 * those that are newly healthy, since we don't want
1941 * to do gratuitous failover of IPs just to make minor
1942 * balance improvements.
1944 for (dstnode=0; dstnode<numnodes; dstnode++) {
1945 if (!rebalance_candidates[dstnode]) {
1949 /* only check nodes that can actually takeover this ip */
1950 if (!can_node_takeover_ip(ctdb, dstnode,
1951 ipflags[dstnode], tmp_ip)) {
1952 /* no it couldnt so skip to the next node */
1956 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1957 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1958 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1960 ctdb_addr_to_str(&(tmp_ip->addr)),
1963 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1964 (dstdsum < srcdsum) && \
1965 ((mindstnode == -1) || \
1966 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1969 minsrcimbl = srcimbl;
1970 mindstnode = dstnode;
1971 mindstimbl = dstimbl;
1975 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1977 if (mindstnode != -1) {
1978 /* We found a move that makes things better... */
1979 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1980 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1981 ctdb_addr_to_str(&(minip->addr)),
1982 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1985 lcp2_imbalances[srcnode] = minsrcimbl;
1986 lcp2_imbalances[mindstnode] = mindstimbl;
1987 minip->pnn = mindstnode;
/* Element type of the scratch array that lcp2_failback() sorts with
 * qsort(). Its imbalance member is filled from lcp2_imbalances[i].
 * NOTE(review): going by the name, it also records which node the
 * value belongs to - confirm.
 */
1996 struct lcp2_imbalance_pnn {
/* qsort() comparator over struct lcp2_imbalance_pnn that orders
 * elements by their imbalance member.
 *
 * NOTE(review): lcp2_failback() stops at the first element whose
 * imbalance is zero, so the order is presumably descending - confirm
 * the sign of the values returned by the branches.
 */
2001 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
2003 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
2004 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
2006 if (lipa->imbalance > lipb->imbalance) {
2008 } else if (lipa->imbalance == lipb->imbalance) {
/* Failback driver for the LCP2 allocator.
 *
 * The per-node imbalances are copied into a scratch array allocated
 * on ctdb, sorted with lcp2_cmp_imbalance_pnn and then offered to
 * lcp2_failback_candidate() in sorted order. The scan stops at the
 * first entry whose imbalance is zero.
 *
 * NOTE(review): the talloc_array() result is written to in the very
 * next loop without a NULL check.
 */
2015 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2016 * node with the highest LCP2 imbalance, and then determines the best
2017 * IP/destination node combination to move from the source node.
2019 static void lcp2_failback(struct ctdb_context *ctdb,
2020 struct ctdb_ipflags *ipflags,
2021 struct ctdb_public_ip_list *all_ips,
2022 uint32_t *lcp2_imbalances,
2023 bool *rebalance_candidates)
2026 struct lcp2_imbalance_pnn * lips;
2029 numnodes = talloc_array_length(ipflags);
2032 /* Put the imbalances and nodes into an array, sort them and
2033 * iterate through candidates. Usually the 1st one will be
2034 * used, so this doesn't cost much...
2036 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2037 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2038 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2039 for (i=0; i<numnodes; i++) {
2040 lips[i].imbalance = lcp2_imbalances[i];
2042 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2044 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2045 lcp2_cmp_imbalance_pnn);
2048 for (i=0; i<numnodes; i++) {
2049 /* This means that all nodes had 0 or 1 addresses, so
2050 * can't be imbalanced.
2052 if (lips[i].imbalance == 0) {
2056 if (lcp2_failback_candidate(ctdb,
2061 rebalance_candidates)) {
/* Drop assignments that the assigned node can no longer honour.
 *
 * Unassigned entries (pnn == -1) are ignored. For every other entry,
 * can_node_host_ip() is asked whether the current node may host the
 * address, using that node's slot in ipflags. A failing entry is
 * logged at DEBUG_DEBUG before being unassigned.
 *
 * NOTE(review): "!x != 0" parses as "(!x) != 0", which is just "!x";
 * the comparison is redundant.
 */
2073 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2074 struct ctdb_ipflags *ipflags,
2075 struct ctdb_public_ip_list *all_ips)
2077 struct ctdb_public_ip_list *tmp_ip;
2079 /* verify that the assigned nodes can serve that public ip
2080 and set it to -1 if not
2082 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2083 if (tmp_ip->pnn == -1) {
2086 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2087 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2088 /* this node can not serve this ip. */
2089 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2090 ctdb_addr_to_str(&(tmp_ip->addr)),
/* Deterministic IP allocation.
 *
 * Entry number i of all_ips is assigned to node i % numnodes, where
 * numnodes is the length of ipflags. unassign_unsuitable_ips() then
 * removes assignments a node cannot host and
 * basic_allocate_unassigned() places whatever is left. No failback
 * step is run; if the no_ip_failback tunable is 1 a warning says that
 * it is ignored.
 *
 * NOTE(review): the warning text opens a quote before
 * DeterministicIPs and never closes it.
 * NOTE(review): the modulo assumes numnodes is non-zero - confirm
 * that callers guarantee this.
 */
2097 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2098 struct ctdb_ipflags *ipflags,
2099 struct ctdb_public_ip_list *all_ips)
2101 struct ctdb_public_ip_list *tmp_ip;
2104 numnodes = talloc_array_length(ipflags);
2106 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2107 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2108 * always be allocated the same way for a specific set of
2109 * available/unavailable nodes.
2112 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2113 tmp_ip->pnn = i % numnodes;
2116 /* IP failback doesn't make sense with deterministic
2117 * IPs, since the modulo step above implicitly fails
2118 * back IPs to their "home" node.
2120 if (1 == ctdb->tunable.no_ip_failback) {
2121 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2124 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2126 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2128 /* No failback here! */
/* Non-deterministic IP allocation.
 *
 * Steps: walk all_ips once, unassign_unsuitable_ips(),
 * basic_allocate_unassigned(), and then basic_failback() unless the
 * no_ip_failback tunable is 1.
 *
 * NOTE(review): the initial walk presumably counts the addresses into
 * num_ips, which is what basic_failback() receives - confirm.
 */
2131 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2132 struct ctdb_ipflags *ipflags,
2133 struct ctdb_public_ip_list *all_ips)
2135 /* This should be pushed down into basic_failback. */
2136 struct ctdb_public_ip_list *tmp_ip;
2138 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2142 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2144 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2146 /* If we don't want IPs to fail back then don't rebalance IPs. */
2147 if (1 == ctdb->tunable.no_ip_failback) {
2151 /* Now, try to make sure the ip addresses are evenly distributed
2154 basic_failback(ctdb, ipflags, all_ips, num_ips);
/* LCP2 IP allocation.
 *
 * A temporary talloc context is created under ctdb and freed at the
 * end. Steps: unassign_unsuitable_ips(), lcp2_init() to obtain the
 * imbalance and rebalance-candidate arrays, then
 * lcp2_allocate_unassigned(). The function tests for the
 * no_ip_failback tunable being 1 and for the number of rebalance
 * candidates being zero before the final call to lcp2_failback().
 *
 * force_rebalance_nodes is passed straight through to lcp2_init().
 */
2157 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2158 struct ctdb_ipflags *ipflags,
2159 struct ctdb_public_ip_list *all_ips,
2160 uint32_t *force_rebalance_nodes)
2162 uint32_t *lcp2_imbalances;
2163 bool *rebalance_candidates;
2164 int numnodes, num_rebalance_candidates, i;
2166 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2168 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2170 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2171 &lcp2_imbalances, &rebalance_candidates);
2173 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2175 /* If we don't want IPs to fail back then don't rebalance IPs. */
2176 if (1 == ctdb->tunable.no_ip_failback) {
2180 /* It is only worth continuing if we have suitable target
2181 * nodes to transfer IPs to. This check is much cheaper than
2184 numnodes = talloc_array_length(ipflags);
2185 num_rebalance_candidates = 0;
2186 for (i=0; i<numnodes; i++) {
2187 if (rebalance_candidates[i]) {
2188 num_rebalance_candidates++;
2191 if (num_rebalance_candidates == 0) {
2195 /* Now, try to make sure the ip addresses are evenly distributed
2198 lcp2_failback(ctdb, ipflags, all_ips,
2199 lcp2_imbalances, rebalance_candidates);
2202 talloc_free(tmp_ctx);
/* Return true when the count of healthy nodes in nodemap is zero. The
 * loop singles out nodes whose flags contain neither
 * NODE_FLAGS_INACTIVE nor NODE_FLAGS_DISABLED.
 */
2205 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2209 /* Count how many completely healthy nodes we have */
2211 for (i=0;i<nodemap->num;i++) {
2212 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2217 return num_healthy == 0;
/* Compute the new IP layout without contacting any node.
 *
 * The merged address list from create_merged_ip_list() is stored in
 * *all_ips_p and then handed to one allocator:
 *   - ip_alloc_lcp2() if tunable lcp2_public_ip_assignment is 1
 *   - ip_alloc_deterministic_ips() if tunable deterministic_public_ips
 *     is 1
 *   - ip_alloc_nondeterministic_ips() otherwise
 * force_rebalance_nodes is only consumed by the LCP2 allocator.
 */
2220 /* The calculation part of the IP allocation algorithm. */
2221 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2222 struct ctdb_ipflags *ipflags,
2223 struct ctdb_public_ip_list **all_ips_p,
2224 uint32_t *force_rebalance_nodes)
2226 /* since nodes only know about those public addresses that
2227 can be served by that particular node, no single node has
2228 a full list of all public addresses that exist in the cluster.
2229 Walk over all node structures and create a merged list of
2230 all public addresses that exist in the cluster.
2232 keep the tree of ips around as ctdb->ip_tree
2234 *all_ips_p = create_merged_ip_list(ctdb);
2236 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2237 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2238 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2239 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2241 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2244 /* at this point ->pnn is the node which will own each IP
2245 or -1 if there is no node that can cover this ip
/* State for the callbacks used by get_tunable_from_nodes().
 * tunable is the tunable name; get_tunable_callback() prints it in
 * its log messages. get_tunable_callback() also stores results into
 * an out array indexed by pnn, and get_tunable_from_nodes()
 * initialises a fatal member to false.
 */
2251 struct get_tunable_callback_data {
2252 const char *tunable;
/* Reply callback for the GET_TUNABLE control.
 *
 * The callback data is a struct get_tunable_callback_data. A reply
 * whose payload is not exactly sizeof(uint32_t) bytes is reported at
 * DEBUG_ERR, as is a pnn that does not fit the out array (its length
 * comes from talloc_array_length()). Otherwise the payload is stored
 * in out[pnn].
 */
2257 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2258 int32_t res, TDB_DATA outdata,
2261 struct get_tunable_callback_data *cd =
2262 (struct get_tunable_callback_data *)callback;
2266 /* Already handled in fail callback */
2270 if (outdata.dsize != sizeof(uint32_t)) {
2271 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2272 cd->tunable, pnn, (int)sizeof(uint32_t),
2273 (int)outdata.dsize));
2278 size = talloc_array_length(cd->out);
2280 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2281 cd->tunable, pnn, size));
2286 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/* Failure callback for the GET_TUNABLE control.
 *
 * Three outcomes are distinguished by their log message: the request
 * timed out, the tunable is not implemented on the node (logged at
 * DEBUG_WARNING), or an unexpected error occurred.
 *
 * NOTE(review): get_tunable_from_nodes() inspects the fatal member
 * after the control completes, so some of these paths presumably set
 * it - confirm which.
 */
2289 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2290 int32_t res, TDB_DATA outdata,
2293 struct get_tunable_callback_data *cd =
2294 (struct get_tunable_callback_data *)callback;
2299 ("Timed out getting tunable \"%s\" from node %d\n",
2305 DEBUG(DEBUG_WARNING,
2306 ("Tunable \"%s\" not implemented on node %d\n",
2311 ("Unexpected error getting tunable \"%s\" from node %d\n",
/* Read one tunable from every connected node.
 *
 * A uint32_t array with nodemap->num slots is allocated on tmp_ctx and
 * pre-filled with default_value, so nodes that do not answer keep the
 * default. The request payload is a struct ctdb_control_get_tunable
 * carrying the tunable name including its terminating NUL. The
 * control is sent to the nodes from list_of_connected_nodes() with
 * TAKEOVER_TIMEOUT(); replies are handled by get_tunable_callback()
 * and failures by get_tunable_fail_callback(). If the control call
 * reports failure, the fatal flag in the callback data is tested. The
 * request buffer is freed before returning. set_ipflags() treats a
 * NULL result as failure.
 *
 * NOTE(review): the talloc_size() result is written through
 * (t->length) without a NULL check.
 */
2317 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2318 TALLOC_CTX *tmp_ctx,
2319 struct ctdb_node_map *nodemap,
2320 const char *tunable,
2321 uint32_t default_value)
2324 struct ctdb_control_get_tunable *t;
2327 struct get_tunable_callback_data callback_data;
2330 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2331 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2332 for (i=0; i<nodemap->num; i++) {
2333 tvals[i] = default_value;
2336 callback_data.out = tvals;
2337 callback_data.tunable = tunable;
2338 callback_data.fatal = false;
2340 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2341 data.dptr = talloc_size(tmp_ctx, data.dsize);
2342 t = (struct ctdb_control_get_tunable *)data.dptr;
2343 t->length = strlen(tunable)+1;
2344 memcpy(t->name, tunable, t->length);
2345 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2346 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2347 nodes, 0, TAKEOVER_TIMEOUT(),
2349 get_tunable_callback,
2350 get_tunable_fail_callback,
2351 &callback_data) != 0) {
2352 if (callback_data.fatal) {
2358 talloc_free(data.dptr);
/* State for the callbacks used by get_runstate_from_nodes().
 * out is the result array; get_runstate_callback() stores into
 * out[pnn]. get_runstate_from_nodes() also initialises a fatal member
 * to false.
 */
2363 struct get_runstate_callback_data {
2364 enum ctdb_runstate *out;
/* Reply callback for the GET_RUNSTATE control.
 *
 * The callback data is a struct get_runstate_callback_data. A reply
 * whose payload is not exactly sizeof(uint32_t) bytes is reported at
 * DEBUG_ERR, as is a pnn that does not fit the out array (its length
 * comes from talloc_array_length()). Otherwise the payload is read as
 * a uint32_t, converted to enum ctdb_runstate and stored in out[pnn].
 */
2368 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2369 int32_t res, TDB_DATA outdata,
2370 void *callback_data)
2372 struct get_runstate_callback_data *cd =
2373 (struct get_runstate_callback_data *)callback_data;
2377 /* Already handled in fail callback */
2381 if (outdata.dsize != sizeof(uint32_t)) {
2382 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2383 pnn, (int)sizeof(uint32_t),
2384 (int)outdata.dsize));
2389 size = talloc_array_length(cd->out);
2391 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2396 cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
/* Failure callback for the GET_RUNSTATE control.
 *
 * A timeout gets its own log message. Any other error is logged at
 * DEBUG_WARNING with the remark that runstates are assumed to be
 * unsupported on that node.
 */
2399 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2400 int32_t res, TDB_DATA outdata,
2403 struct get_runstate_callback_data *cd =
2404 (struct get_runstate_callback_data *)callback;
2409 ("Timed out getting runstate from node %d\n", pnn));
2413 DEBUG(DEBUG_WARNING,
2414 ("Error getting runstate from node %d - assuming runstates not supported\n",
/* Read the runstate of every connected node.
 *
 * An enum ctdb_runstate array with nodemap->num slots is allocated on
 * tmp_ctx and pre-filled with default_value, so nodes that do not
 * answer keep the default. The GET_RUNSTATE control is sent to the
 * nodes from list_of_connected_nodes() with TAKEOVER_TIMEOUT();
 * replies are handled by get_runstate_callback() and failures by
 * get_runstate_fail_callback(). If the control call reports failure,
 * the fatal flag in the callback data is tested. set_ipflags() treats
 * a NULL result as failure.
 */
2419 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2420 TALLOC_CTX *tmp_ctx,
2421 struct ctdb_node_map *nodemap,
2422 enum ctdb_runstate default_value)
2425 enum ctdb_runstate *rs;
2426 struct get_runstate_callback_data callback_data;
2429 rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2430 CTDB_NO_MEMORY_NULL(ctdb, rs);
2431 for (i=0; i<nodemap->num; i++) {
2432 rs[i] = default_value;
2435 callback_data.out = rs;
2436 callback_data.fatal = false;
2438 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2439 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2440 nodes, 0, TAKEOVER_TIMEOUT(),
2442 get_runstate_callback,
2443 get_runstate_fail_callback,
2444 &callback_data) != 0) {
2445 if (callback_data.fatal) {
2455 /* Set internal flags for IP allocation:
2457 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2458 * Set NOIPHOST ip flag for each INACTIVE node
2459 * if all nodes are disabled:
2460 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2462 * Set NOIPHOST ip flags for disabled nodes
2464 static struct ctdb_ipflags *
2465 set_ipflags_internal(struct ctdb_context *ctdb,
2466 TALLOC_CTX *tmp_ctx,
2467 struct ctdb_node_map *nodemap,
2468 uint32_t *tval_noiptakeover,
2469 uint32_t *tval_noiphostonalldisabled,
2470 enum ctdb_runstate *runstate)
2473 struct ctdb_ipflags *ipflags;
2475 /* Clear IP flags - implicit due to talloc_zero */
2476 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2477 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2479 for (i=0;i<nodemap->num;i++) {
2480 /* Can not take IPs on node with NoIPTakeover set */
2481 if (tval_noiptakeover[i] != 0) {
2482 ipflags[i].noiptakeover = true;
2485 /* Can not host IPs on node not in RUNNING state */
2486 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2487 ipflags[i].noiphost = true;
2490 /* Can not host IPs on INACTIVE node */
2491 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2492 ipflags[i].noiphost = true;
2496 if (all_nodes_are_disabled(nodemap)) {
2497 /* If all nodes are disabled, can not host IPs on node
2498 * with NoIPHostOnAllDisabled set
2500 for (i=0;i<nodemap->num;i++) {
2501 if (tval_noiphostonalldisabled[i] != 0) {
2502 ipflags[i].noiphost = true;
2506 /* If some nodes are not disabled, then can not host
2507 * IPs on DISABLED node
2509 for (i=0;i<nodemap->num;i++) {
2510 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2511 ipflags[i].noiphost = true;
2455 /* Set internal flags for IP allocation:
2457 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2458 * Set NOIPHOST ip flag for each INACTIVE node
2459 * if all nodes are disabled:
2460 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2462 * Set NOIPHOST ip flags for disabled nodes
2464 static struct ctdb_ipflags *
2465 set_ipflags_internal(struct ctdb_context *ctdb,
2466 TALLOC_CTX *tmp_ctx,
2467 struct ctdb_node_map *nodemap,
2468 uint32_t *tval_noiptakeover,
2469 uint32_t *tval_noiphostonalldisabled,
2470 enum ctdb_runstate *runstate)
2473 struct ctdb_ipflags *ipflags;
2475 /* Clear IP flags - implicit due to talloc_zero */
2476 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2477 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2479 for (i=0;i<nodemap->num;i++) {
2480 /* Can not take IPs on node with NoIPTakeover set */
2481 if (tval_noiptakeover[i] != 0) {
2482 ipflags[i].noiptakeover = true;
2485 /* Can not host IPs on node not in RUNNING state */
2486 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2487 ipflags[i].noiphost = true;
2490 /* Can not host IPs on INACTIVE node */
2491 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2492 ipflags[i].noiphost = true;
2496 if (all_nodes_are_disabled(nodemap)) {
2497 /* If all nodes are disabled, can not host IPs on node
2498 * with NoIPHostOnAllDisabled set
2500 for (i=0;i<nodemap->num;i++) {
2501 if (tval_noiphostonalldisabled[i] != 0) {
2502 ipflags[i].noiphost = true;
2506 /* If some nodes are not disabled, then can not host
2507 * IPs on DISABLED node
2509 for (i=0;i<nodemap->num;i++) {
2510 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2511 ipflags[i].noiphost = true;
/* Gather the per-node inputs and build the ipflags array.
 *
 * Two tunables are read with get_tunable_from_nodes() (the second is
 * NoIPHostOnAllDisabled with default 0) and the runstates are read
 * with get_runstate_from_nodes() using CTDB_RUNSTATE_RUNNING as the
 * default. Each of the three results is checked for NULL. The three
 * arrays are passed to set_ipflags_internal() and freed afterwards.
 * As the comments in the body say, the caller frees tmp_ctx.
 * ctdb_takeover_run() treats a NULL return as failure.
 */
2519 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2520 TALLOC_CTX *tmp_ctx,
2521 struct ctdb_node_map *nodemap)
2523 uint32_t *tval_noiptakeover;
2524 uint32_t *tval_noiphostonalldisabled;
2525 struct ctdb_ipflags *ipflags;
2526 enum ctdb_runstate *runstate;
2529 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2531 if (tval_noiptakeover == NULL) {
2535 tval_noiphostonalldisabled =
2536 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2537 "NoIPHostOnAllDisabled", 0);
2538 if (tval_noiphostonalldisabled == NULL) {
2539 /* Caller frees tmp_ctx */
2543 /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2544 * will default to CTDB_RUNSTATE_RUNNING. This ensures
2545 * reasonable behaviour on a mixed cluster during upgrade.
2547 runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2548 CTDB_RUNSTATE_RUNNING);
2549 if (runstate == NULL) {
2550 /* Caller frees tmp_ctx */
2554 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2556 tval_noiphostonalldisabled,
2559 talloc_free(tval_noiptakeover);
2560 talloc_free(tval_noiphostonalldisabled);
2561 talloc_free(runstate);
/* State handed to iprealloc_fail_callback().
 *   fail_callback      - fail callback supplied by the caller; may be
 *                        NULL
 *   fail_callback_data - opaque argument passed on to fail_callback
 *   nodemap            - node map used to look up the flags of the
 *                        failing node
 * ctdb_takeover_run() also fills in retry_nodes (a bool array with
 * nodemap->num slots) and retry_count.
 */
2566 struct iprealloc_callback_data {
2569 client_async_callback fail_callback;
2570 void *fail_callback_data;
2571 struct ctdb_node_map *nodemap;
/* Failure callback used for the "ipreallocated" step.
 *
 * The callback data is a struct iprealloc_callback_data.
 *   - A pnn outside retry_nodes[] is logged and not used as an index.
 *     The test is pnn >= numnodes because pnn indexes both
 *     nodemap->nodes[] and retry_nodes[] below; with the former
 *     "pnn > numnodes" a pnn equal to the array length slipped through
 *     and was used one element past the end.
 *   - A failure on a node flagged NODE_FLAGS_INACTIVE is logged as
 *     ignored.
 *   - For a timeout, the fail callback from the callback data is
 *     invoked if one is registered; otherwise a warning is logged.
 *   - Any other failure marks the node in retry_nodes[] for a retry.
 */
2574 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2575 int32_t res, TDB_DATA outdata,
2579 struct iprealloc_callback_data *cd =
2580 (struct iprealloc_callback_data *)callback;
2582 numnodes = talloc_array_length(cd->retry_nodes);
2583 if (pnn >= numnodes) {
2585 ("ipreallocated failure from node %d, "
2586 "but only %d nodes in nodemap\n",
2591 /* Can't run the "ipreallocated" event on a INACTIVE node */
2592 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2593 DEBUG(DEBUG_WARNING,
2594 ("ipreallocated failed on inactive node %d, ignoring\n",
2601 /* If the control timed out then that's a real error,
2602 * so call the real fail callback
2604 if (cd->fail_callback) {
2605 cd->fail_callback(ctdb, pnn, res, outdata,
2606 cd->fail_callback_data);
2608 DEBUG(DEBUG_WARNING,
2609 ("iprealloc timed out but no callback registered\n"));
2613 /* If not a timeout then either the ipreallocated
2614 * eventscript (or some setup) failed. This might
2615 * have failed because the IPREALLOCATED control isn't
2616 * implemented - right now there is no way of knowing
2617 * because the error codes are all folded down to -1.
2618 * Consider retrying using EVENTSCRIPT control...
2620 DEBUG(DEBUG_WARNING,
2621 ("ipreallocated failure from node %d, flagging retry\n",
2623 cd->retry_nodes[pnn] = true;
/* State handed to takeover_run_fail_callback().
 *   fail_callback      - fail callback supplied by the caller of
 *                        ctdb_takeover_run()
 *   fail_callback_data - opaque argument passed on to fail_callback
 *   nodemap            - node map used to translate a pnn into an
 *                        index
 * ctdb_takeover_run() also allocates node_failed, a zeroed bool array
 * with nodemap->num slots.
 */
2628 struct takeover_callback_data {
2630 client_async_callback fail_callback;
2631 void *fail_callback_data;
2632 struct ctdb_node_map *nodemap;
/* Fail callback that reports each failing node at most once.
 *
 * callback_data must be a talloc-typed struct takeover_callback_data
 * (talloc_get_type_abort is used). node_pnn is looked up in the node
 * map; an unknown pnn is logged at DEBUG_ERR. The first failure seen
 * for a node sets its node_failed[] slot and forwards the failure to
 * the stored fail_callback; later failures for the same node are
 * dropped.
 *
 * NOTE(review): fail_callback is invoked without a NULL check,
 * whereas iprealloc_fail_callback() tests its fail_callback before
 * calling it. Confirm that callers never pass NULL here.
 */
2635 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2636 uint32_t node_pnn, int32_t res,
2637 TDB_DATA outdata, void *callback_data)
2639 struct takeover_callback_data *cd =
2640 talloc_get_type_abort(callback_data,
2641 struct takeover_callback_data);
2644 for (i = 0; i < cd->nodemap->num; i++) {
2645 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2650 if (i == cd->nodemap->num) {
2651 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2655 if (!cd->node_failed[i]) {
2656 cd->node_failed[i] = true;
2657 cd->fail_callback(ctdb, node_pnn, res, outdata,
2658 cd->fail_callback_data);
2663 make any IP alias changes for public addresses that are necessary
2665 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2666 uint32_t *force_rebalance_nodes,
2667 client_async_callback fail_callback, void *callback_data)
2670 struct ctdb_public_ip ip;
2671 struct ctdb_public_ipv4 ipv4;
2673 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2675 struct timeval timeout;
2676 struct client_async_data *async_data;
2677 struct ctdb_client_control_state *state;
2678 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2679 struct ctdb_ipflags *ipflags;
2680 struct takeover_callback_data *takeover_data;
2681 struct iprealloc_callback_data iprealloc_data;
2685 * ip failover is completely disabled, just send out the
2686 * ipreallocated event.
2688 if (ctdb->tunable.disable_ip_failover != 0) {
2692 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2693 if (ipflags == NULL) {
2694 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2695 talloc_free(tmp_ctx);
2701 /* Do the IP reassignment calculations */
2702 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2704 /* Now tell all nodes to release any public IPs should not
2705 * host. This will be a NOOP on nodes that don't currently
2706 * hold the given IP.
2708 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2709 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2711 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2712 bool, nodemap->num);
2713 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2714 takeover_data->fail_callback = fail_callback;
2715 takeover_data->fail_callback_data = callback_data;
2716 takeover_data->nodemap = nodemap;
2718 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2719 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2721 async_data->fail_callback = takeover_run_fail_callback;
2722 async_data->callback_data = takeover_data;
2724 for (i=0;i<nodemap->num;i++) {
2725 /* don't talk to unconnected nodes, but do talk to banned nodes */
2726 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2730 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2731 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2732 /* This node should be serving this
2733 vnn so dont tell it to release the ip
2737 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2738 ipv4.pnn = tmp_ip->pnn;
2739 ipv4.sin = tmp_ip->addr.ip;
2741 timeout = TAKEOVER_TIMEOUT();
2742 data.dsize = sizeof(ipv4);
2743 data.dptr = (uint8_t *)&ipv4;
2744 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2745 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2749 ip.pnn = tmp_ip->pnn;
2750 ip.addr = tmp_ip->addr;
2752 timeout = TAKEOVER_TIMEOUT();
2753 data.dsize = sizeof(ip);
2754 data.dptr = (uint8_t *)&ip;
2755 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2756 0, CTDB_CONTROL_RELEASE_IP, 0,
2761 if (state == NULL) {
2762 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2763 talloc_free(tmp_ctx);
2767 ctdb_client_async_add(async_data, state);
2770 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2771 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2772 talloc_free(tmp_ctx);
2775 talloc_free(async_data);
2778 /* tell all nodes to get their own IPs */
2779 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2780 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2782 async_data->fail_callback = fail_callback;
2783 async_data->callback_data = callback_data;
2785 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2786 if (tmp_ip->pnn == -1) {
2787 /* this IP won't be taken over */
2791 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2792 ipv4.pnn = tmp_ip->pnn;
2793 ipv4.sin = tmp_ip->addr.ip;
2795 timeout = TAKEOVER_TIMEOUT();
2796 data.dsize = sizeof(ipv4);
2797 data.dptr = (uint8_t *)&ipv4;
2798 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2799 0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2803 ip.pnn = tmp_ip->pnn;
2804 ip.addr = tmp_ip->addr;
2806 timeout = TAKEOVER_TIMEOUT();
2807 data.dsize = sizeof(ip);
2808 data.dptr = (uint8_t *)&ip;
2809 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2810 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2814 if (state == NULL) {
2815 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2816 talloc_free(tmp_ctx);
2820 ctdb_client_async_add(async_data, state);
2822 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2823 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2824 talloc_free(tmp_ctx);
2830 * Tell all nodes to run eventscripts to process the
2831 * "ipreallocated" event. This can do a lot of things,
2832 * including restarting services to reconfigure them if public
2833 * IPs have moved. Once upon a time this event only used to
2836 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2837 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2838 iprealloc_data.retry_nodes = retry_data;
2839 iprealloc_data.retry_count = 0;
2840 iprealloc_data.fail_callback = fail_callback;
2841 iprealloc_data.fail_callback_data = callback_data;
2842 iprealloc_data.nodemap = nodemap;
2844 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2845 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2846 nodes, 0, TAKEOVER_TIMEOUT(),
2848 NULL, iprealloc_fail_callback,
2851 /* If the control failed then we should retry to any
2852 * nodes flagged by iprealloc_fail_callback using the
2853 * EVENTSCRIPT control. This is a best-effort at
2854 * backward compatiblity when running a mixed cluster
2855 * where some nodes have not yet been upgraded to
2856 * support the IPREALLOCATED control.
2858 DEBUG(DEBUG_WARNING,
2859 ("Retry ipreallocated to some nodes using eventscript control\n"));
2861 nodes = talloc_array(tmp_ctx, uint32_t,
2862 iprealloc_data.retry_count);
2863 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2866 for (i=0; i<nodemap->num; i++) {
2867 if (iprealloc_data.retry_nodes[i]) {
2873 data.dptr = discard_const("ipreallocated");
2874 data.dsize = strlen((char *)data.dptr) + 1;
2875 ret = ctdb_client_async_control(ctdb,
2876 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2877 nodes, 0, TAKEOVER_TIMEOUT(),
2879 NULL, fail_callback,
2882 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2886 talloc_free(tmp_ctx);
2892 destroy a ctdb_client_ip structure
2894 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2896 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2897 ctdb_addr_to_str(&ip->addr),
2898 ntohs(ip->addr.ip.sin_port),
2901 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2906 called by a client to inform us of a TCP connection that it is managing
2907 that should tickled with an ACK when IP takeover is done
2908 we handle both the old ipv4 style of packets as well as the new ipv4/6
2911 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2914 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2915 struct ctdb_control_tcp *old_addr = NULL;
2916 struct ctdb_control_tcp_addr new_addr;
2917 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2918 struct ctdb_tcp_list *tcp;
2919 struct ctdb_tcp_connection t;
2922 struct ctdb_client_ip *ip;
2923 struct ctdb_vnn *vnn;
2924 ctdb_sock_addr addr;
2926 /* If we don't have public IPs, tickles are useless */
2927 if (ctdb->vnn == NULL) {
2931 switch (indata.dsize) {
2932 case sizeof(struct ctdb_control_tcp):
2933 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2934 ZERO_STRUCT(new_addr);
2935 tcp_sock = &new_addr;
2936 tcp_sock->src.ip = old_addr->src;
2937 tcp_sock->dest.ip = old_addr->dest;
2939 case sizeof(struct ctdb_control_tcp_addr):
2940 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2943 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2944 "to ctdb_control_tcp_client. size was %d but "
2945 "only allowed sizes are %lu and %lu\n",
2947 (long unsigned)sizeof(struct ctdb_control_tcp),
2948 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2952 addr = tcp_sock->src;
2953 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2954 addr = tcp_sock->dest;
2955 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2958 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2959 vnn = find_public_ip_vnn(ctdb, &addr);
2961 switch (addr.sa.sa_family) {
2963 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2964 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2965 ctdb_addr_to_str(&addr)));
2969 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2970 ctdb_addr_to_str(&addr)));
2973 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2979 if (vnn->pnn != ctdb->pnn) {
2980 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2981 ctdb_addr_to_str(&addr),
2982 client_id, client->pid));
2983 /* failing this call will tell smbd to die */
2987 ip = talloc(client, struct ctdb_client_ip);
2988 CTDB_NO_MEMORY(ctdb, ip);
2992 ip->client_id = client_id;
2993 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2994 DLIST_ADD(ctdb->client_ip_list, ip);
2996 tcp = talloc(client, struct ctdb_tcp_list);
2997 CTDB_NO_MEMORY(ctdb, tcp);
2999 tcp->connection.src_addr = tcp_sock->src;
3000 tcp->connection.dst_addr = tcp_sock->dest;
3002 DLIST_ADD(client->tcp_list, tcp);
3004 t.src_addr = tcp_sock->src;
3005 t.dst_addr = tcp_sock->dest;
3007 data.dptr = (uint8_t *)&t;
3008 data.dsize = sizeof(t);
3010 switch (addr.sa.sa_family) {
3012 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3013 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
3014 ctdb_addr_to_str(&tcp_sock->src),
3015 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
3018 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3019 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
3020 ctdb_addr_to_str(&tcp_sock->src),
3021 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
3024 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3028 /* tell all nodes about this tcp connection */
3029 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
3030 CTDB_CONTROL_TCP_ADD,
3031 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3033 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3041 find a tcp address on a list
3043 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
3044 struct ctdb_tcp_connection *tcp)
3048 if (array == NULL) {
3052 for (i=0;i<array->num;i++) {
3053 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3054 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3055 return &array->connections[i];
3064 called by a daemon to inform us of a TCP connection that one of its
3065 clients managing that should tickled with an ACK when IP takeover is
3068 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3070 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3071 struct ctdb_tcp_array *tcparray;
3072 struct ctdb_tcp_connection tcp;
3073 struct ctdb_vnn *vnn;
3075 /* If we don't have public IPs, tickles are useless */
3076 if (ctdb->vnn == NULL) {
3080 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3082 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3083 ctdb_addr_to_str(&p->dst_addr)));
3089 tcparray = vnn->tcp_array;
3091 /* If this is the first tickle */
3092 if (tcparray == NULL) {
3093 tcparray = talloc(vnn, struct ctdb_tcp_array);
3094 CTDB_NO_MEMORY(ctdb, tcparray);
3095 vnn->tcp_array = tcparray;
3098 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3099 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3101 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3102 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3105 if (tcp_update_needed) {
3106 vnn->tcp_update_needed = true;
3112 /* Do we already have this tickle ?*/
3113 tcp.src_addr = p->src_addr;
3114 tcp.dst_addr = p->dst_addr;
3115 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3116 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3117 ctdb_addr_to_str(&tcp.dst_addr),
3118 ntohs(tcp.dst_addr.ip.sin_port),
3123 /* A new tickle, we must add it to the array */
3124 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3125 struct ctdb_tcp_connection,
3127 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3129 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3130 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3133 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3134 ctdb_addr_to_str(&tcp.dst_addr),
3135 ntohs(tcp.dst_addr.ip.sin_port),
3138 if (tcp_update_needed) {
3139 vnn->tcp_update_needed = true;
3147 called by a daemon to inform us of a TCP connection that one of its
3148 clients managing that should tickled with an ACK when IP takeover is
3151 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3153 struct ctdb_tcp_connection *tcpp;
3154 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3157 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3158 ctdb_addr_to_str(&conn->dst_addr)));
3162 /* if the array is empty we cant remove it
3163 and we dont need to do anything
3165 if (vnn->tcp_array == NULL) {
3166 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3167 ctdb_addr_to_str(&conn->dst_addr),
3168 ntohs(conn->dst_addr.ip.sin_port)));
3173 /* See if we know this connection
3174 if we dont know this connection then we dont need to do anything
3176 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3178 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3179 ctdb_addr_to_str(&conn->dst_addr),
3180 ntohs(conn->dst_addr.ip.sin_port)));
3185 /* We need to remove this entry from the array.
3186 Instead of allocating a new array and copying data to it
3187 we cheat and just copy the last entry in the existing array
3188 to the entry that is to be removed and just shring the
3191 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3192 vnn->tcp_array->num--;
3194 /* If we deleted the last entry we also need to remove the entire array
3196 if (vnn->tcp_array->num == 0) {
3197 talloc_free(vnn->tcp_array);
3198 vnn->tcp_array = NULL;
3201 vnn->tcp_update_needed = true;
3203 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3204 ctdb_addr_to_str(&conn->src_addr),
3205 ntohs(conn->src_addr.ip.sin_port)));
3210 called by a daemon to inform us of a TCP connection that one of its
3211 clients used are no longer needed in the tickle database
3213 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3215 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3217 /* If we don't have public IPs, tickles are useless */
3218 if (ctdb->vnn == NULL) {
3222 ctdb_remove_tcp_connection(ctdb, conn);
3229 Called when another daemon starts - caises all tickles for all
3230 public addresses we are serving to be sent to the new node on the
3231 next check. This actually causes the next scheduled call to
3232 tdb_update_tcp_tickles() to update all nodes. This is simple and
3233 doesn't require careful error handling.
3235 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3237 struct ctdb_vnn *vnn;
3239 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3240 vnn->tcp_update_needed = true;
3248 called when a client structure goes away - hook to remove
3249 elements from the tcp_list in all daemons
3251 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3253 while (client->tcp_list) {
3254 struct ctdb_tcp_list *tcp = client->tcp_list;
3255 DLIST_REMOVE(client->tcp_list, tcp);
3256 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3262 release all IPs on shutdown
3264 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3266 struct ctdb_vnn *vnn;
3269 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3270 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3271 ctdb_vnn_unassign_iface(ctdb, vnn);
3278 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3279 ctdb_addr_to_str(&vnn->public_address),
3280 vnn->public_netmask_bits,
3281 ctdb_vnn_iface_string(vnn)));
3283 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3284 ctdb_vnn_iface_string(vnn),
3285 ctdb_addr_to_str(&vnn->public_address),
3286 vnn->public_netmask_bits);
3287 release_kill_clients(ctdb, &vnn->public_address);
3288 ctdb_vnn_unassign_iface(ctdb, vnn);
3292 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3297 get list of public IPs
3299 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3300 struct ctdb_req_control *c, TDB_DATA *outdata)
3303 struct ctdb_all_public_ips *ips;
3304 struct ctdb_vnn *vnn;
3305 bool only_available = false;
3307 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3308 only_available = true;
3311 /* count how many public ip structures we have */
3313 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3317 len = offsetof(struct ctdb_all_public_ips, ips) +
3318 num*sizeof(struct ctdb_public_ip);
3319 ips = talloc_zero_size(outdata, len);
3320 CTDB_NO_MEMORY(ctdb, ips);
3323 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3324 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3327 ips->ips[i].pnn = vnn->pnn;
3328 ips->ips[i].addr = vnn->public_address;
3332 len = offsetof(struct ctdb_all_public_ips, ips) +
3333 i*sizeof(struct ctdb_public_ip);
3335 outdata->dsize = len;
3336 outdata->dptr = (uint8_t *)ips;
3343 get list of public IPs, old ipv4 style. only returns ipv4 addresses
3345 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb,
3346 struct ctdb_req_control *c, TDB_DATA *outdata)
3349 struct ctdb_all_public_ipsv4 *ips;
3350 struct ctdb_vnn *vnn;
3352 /* count how many public ip structures we have */
3354 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3355 if (vnn->public_address.sa.sa_family != AF_INET) {
3361 len = offsetof(struct ctdb_all_public_ipsv4, ips) +
3362 num*sizeof(struct ctdb_public_ipv4);
3363 ips = talloc_zero_size(outdata, len);
3364 CTDB_NO_MEMORY(ctdb, ips);
3366 outdata->dsize = len;
3367 outdata->dptr = (uint8_t *)ips;
3371 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3372 if (vnn->public_address.sa.sa_family != AF_INET) {
3375 ips->ips[i].pnn = vnn->pnn;
3376 ips->ips[i].sin = vnn->public_address.ip;
3383 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3384 struct ctdb_req_control *c,
3389 ctdb_sock_addr *addr;
3390 struct ctdb_control_public_ip_info *info;
3391 struct ctdb_vnn *vnn;
3393 addr = (ctdb_sock_addr *)indata.dptr;
3395 vnn = find_public_ip_vnn(ctdb, addr);
3397 /* if it is not a public ip it could be our 'single ip' */
3398 if (ctdb->single_ip_vnn) {
3399 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3400 vnn = ctdb->single_ip_vnn;
3405 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3406 "'%s'not a public address\n",
3407 ctdb_addr_to_str(addr)));
3411 /* count how many public ip structures we have */
3413 for (;vnn->ifaces[num];) {
3417 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3418 num*sizeof(struct ctdb_control_iface_info);
3419 info = talloc_zero_size(outdata, len);
3420 CTDB_NO_MEMORY(ctdb, info);
3422 info->ip.addr = vnn->public_address;
3423 info->ip.pnn = vnn->pnn;
3424 info->active_idx = 0xFFFFFFFF;
3426 for (i=0; vnn->ifaces[i]; i++) {
3427 struct ctdb_iface *cur;
3429 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3431 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3435 if (vnn->iface == cur) {
3436 info->active_idx = i;
3438 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3439 info->ifaces[i].link_state = cur->link_up;
3440 info->ifaces[i].references = cur->references;
3443 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3444 i*sizeof(struct ctdb_control_iface_info);
3446 outdata->dsize = len;
3447 outdata->dptr = (uint8_t *)info;
3452 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3453 struct ctdb_req_control *c,
3457 struct ctdb_control_get_ifaces *ifaces;
3458 struct ctdb_iface *cur;
3460 /* count how many public ip structures we have */
3462 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3466 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3467 num*sizeof(struct ctdb_control_iface_info);
3468 ifaces = talloc_zero_size(outdata, len);
3469 CTDB_NO_MEMORY(ctdb, ifaces);
3472 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3473 strcpy(ifaces->ifaces[i].name, cur->name);
3474 ifaces->ifaces[i].link_state = cur->link_up;
3475 ifaces->ifaces[i].references = cur->references;
3479 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3480 i*sizeof(struct ctdb_control_iface_info);
3482 outdata->dsize = len;
3483 outdata->dptr = (uint8_t *)ifaces;
3488 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3489 struct ctdb_req_control *c,
3492 struct ctdb_control_iface_info *info;
3493 struct ctdb_iface *iface;
3494 bool link_up = false;
3496 info = (struct ctdb_control_iface_info *)indata.dptr;
3498 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3499 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3500 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3501 len, len, info->name));
3505 switch (info->link_state) {
3513 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3514 (unsigned int)info->link_state));
3518 if (info->references != 0) {
3519 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3520 (unsigned int)info->references));
3524 iface = ctdb_find_iface(ctdb, info->name);
3525 if (iface == NULL) {
3529 if (link_up == iface->link_up) {
3533 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3534 ("iface[%s] has changed it's link status %s => %s\n",
3536 iface->link_up?"up":"down",
3537 link_up?"up":"down"));
3539 iface->link_up = link_up;
3545 structure containing the listening socket and the list of tcp connections
3546 that the ctdb daemon is to kill
3548 struct ctdb_kill_tcp {
3549 struct ctdb_vnn *vnn;
3550 struct ctdb_context *ctdb;
3552 struct fd_event *fde;
3553 trbt_tree_t *connections;
3558 a tcp connection that is to be killed
3560 struct ctdb_killtcp_con {
3561 ctdb_sock_addr src_addr;
3562 ctdb_sock_addr dst_addr;
3564 struct ctdb_kill_tcp *killtcp;
3567 /* this function is used to create a key to represent this socketpair
3568 in the killtcp tree.
3569 this key is used to insert and lookup matching socketpairs that are
3570 to be tickled and RST
3572 #define KILLTCP_KEYLEN 10
3573 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3575 static uint32_t key[KILLTCP_KEYLEN];
3577 bzero(key, sizeof(key));
3579 if (src->sa.sa_family != dst->sa.sa_family) {
3580 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3584 switch (src->sa.sa_family) {
3586 key[0] = dst->ip.sin_addr.s_addr;
3587 key[1] = src->ip.sin_addr.s_addr;
3588 key[2] = dst->ip.sin_port;
3589 key[3] = src->ip.sin_port;
3592 uint32_t *dst6_addr32 =
3593 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3594 uint32_t *src6_addr32 =
3595 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3596 key[0] = dst6_addr32[3];
3597 key[1] = src6_addr32[3];
3598 key[2] = dst6_addr32[2];
3599 key[3] = src6_addr32[2];
3600 key[4] = dst6_addr32[1];
3601 key[5] = src6_addr32[1];
3602 key[6] = dst6_addr32[0];
3603 key[7] = src6_addr32[0];
3604 key[8] = dst->ip6.sin6_port;
3605 key[9] = src->ip6.sin6_port;
3609 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3617 called when we get a read event on the raw socket
3619 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
3620 uint16_t flags, void *private_data)
3622 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3623 struct ctdb_killtcp_con *con;
3624 ctdb_sock_addr src, dst;
3625 uint32_t ack_seq, seq;
3627 if (!(flags & EVENT_FD_READ)) {
3631 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3632 killtcp->private_data,
3634 &ack_seq, &seq) != 0) {
3635 /* probably a non-tcp ACK packet */
3639 /* check if we have this guy in our list of connections
3642 con = trbt_lookuparray32(killtcp->connections,
3643 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3645 /* no this was some other packet we can just ignore */
3649 /* This one has been tickled !
3650 now reset him and remove him from the list.
3652 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3653 ntohs(con->dst_addr.ip.sin_port),
3654 ctdb_addr_to_str(&con->src_addr),
3655 ntohs(con->src_addr.ip.sin_port)));
3657 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3662 /* when traversing the list of all tcp connections to send tickle acks to
3663 (so that we can capture the ack coming back and kill the connection
3665 this callback is called for each connection we are currently trying to kill
3667 static int tickle_connection_traverse(void *param, void *data)
3669 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3671 /* have tried too many times, just give up */
3672 if (con->count >= 5) {
3673 /* can't delete in traverse: reparent to delete_cons */
3674 talloc_steal(param, con);
3678 /* othervise, try tickling it again */
3681 (ctdb_sock_addr *)&con->dst_addr,
3682 (ctdb_sock_addr *)&con->src_addr,
3689 called every second until all sentenced connections have been reset
3691 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
3692 struct timeval t, void *private_data)
3694 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3695 void *delete_cons = talloc_new(NULL);
3697 /* loop over all connections sending tickle ACKs */
3698 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3700 /* now we've finished traverse, it's safe to do deletion. */
3701 talloc_free(delete_cons);
3703 /* If there are no more connections to kill we can remove the
3704 entire killtcp structure
3706 if ( (killtcp->connections == NULL) ||
3707 (killtcp->connections->root == NULL) ) {
3708 talloc_free(killtcp);
3712 /* try tickling them again in a seconds time
3714 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3715 ctdb_tickle_sentenced_connections, killtcp);
3719 destroy the killtcp structure
3721 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3723 struct ctdb_vnn *tmpvnn;
3725 /* verify that this vnn is still active */
3726 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3727 if (tmpvnn == killtcp->vnn) {
3732 if (tmpvnn == NULL) {
3736 if (killtcp->vnn->killtcp != killtcp) {
3740 killtcp->vnn->killtcp = NULL;
/* insert callback for the killtcp tree: always store the new
   connection (parm), whatever entry (data) was there before.

   The old entry is deliberately not freed here.
   NOTE(review): this relies on the tree owning the old entry and
   releasing it when it is replaced - confirm against
   trbt_insertarray32_callback().
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	return parm;
}
3759 add a tcp socket to the list of connections we want to RST
3761 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3765 ctdb_sock_addr src, dst;
3766 struct ctdb_kill_tcp *killtcp;
3767 struct ctdb_killtcp_con *con;
3768 struct ctdb_vnn *vnn;
3770 ctdb_canonicalize_ip(s, &src);
3771 ctdb_canonicalize_ip(d, &dst);
3773 vnn = find_public_ip_vnn(ctdb, &dst);
3775 vnn = find_public_ip_vnn(ctdb, &src);
3778 /* if it is not a public ip it could be our 'single ip' */
3779 if (ctdb->single_ip_vnn) {
3780 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3781 vnn = ctdb->single_ip_vnn;
3786 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3790 killtcp = vnn->killtcp;
3792 /* If this is the first connection to kill we must allocate
3795 if (killtcp == NULL) {
3796 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3797 CTDB_NO_MEMORY(ctdb, killtcp);
3800 killtcp->ctdb = ctdb;
3801 killtcp->capture_fd = -1;
3802 killtcp->connections = trbt_create(killtcp, 0);
3804 vnn->killtcp = killtcp;
3805 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3810 /* create a structure that describes this connection we want to
3811 RST and store it in killtcp->connections
3813 con = talloc(killtcp, struct ctdb_killtcp_con);
3814 CTDB_NO_MEMORY(ctdb, con);
3815 con->src_addr = src;
3816 con->dst_addr = dst;
3818 con->killtcp = killtcp;
3821 trbt_insertarray32_callback(killtcp->connections,
3822 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3823 add_killtcp_callback, con);
3826 If we dont have a socket to listen on yet we must create it
3828 if (killtcp->capture_fd == -1) {
3829 const char *iface = ctdb_vnn_iface_string(vnn);
3830 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3831 if (killtcp->capture_fd == -1) {
3832 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3833 "socket on iface '%s' for killtcp (%s)\n",
3834 iface, strerror(errno)));
3840 if (killtcp->fde == NULL) {
3841 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
3843 capture_tcp_handler, killtcp);
3844 tevent_fd_set_auto_close(killtcp->fde);
3846 /* We also need to set up some events to tickle all these connections
3847 until they are all reset
3849 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3850 ctdb_tickle_sentenced_connections, killtcp);
3853 /* tickle him once now */
3862 talloc_free(vnn->killtcp);
3863 vnn->killtcp = NULL;
3868 kill a TCP connection.
3870 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3872 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3874 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3878 called by a daemon to inform us of the entire list of TCP tickles for
3879 a particular public address.
3880 this control should only be sent by the node that is currently serving
3881 that public address.
3883 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3885 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3886 struct ctdb_tcp_array *tcparray;
3887 struct ctdb_vnn *vnn;
3889 /* We must at least have tickles.num or else we cant verify the size
3890 of the received data blob
3892 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3893 tickles.connections)) {
3894 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3898 /* verify that the size of data matches what we expect */
3899 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3900 tickles.connections)
3901 + sizeof(struct ctdb_tcp_connection)
3902 * list->tickles.num) {
3903 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3907 vnn = find_public_ip_vnn(ctdb, &list->addr);
3909 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3910 ctdb_addr_to_str(&list->addr)));
3915 /* remove any old ticklelist we might have */
3916 talloc_free(vnn->tcp_array);
3917 vnn->tcp_array = NULL;
3919 tcparray = talloc(vnn, struct ctdb_tcp_array);
3920 CTDB_NO_MEMORY(ctdb, tcparray);
3922 tcparray->num = list->tickles.num;
3924 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3925 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3927 memcpy(tcparray->connections, &list->tickles.connections[0],
3928 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3930 /* We now have a new fresh tickle list array for this vnn */
3931 vnn->tcp_array = tcparray;
3937 called to return the full list of tickles for the puclic address associated
3938 with the provided vnn
3940 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3942 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3943 struct ctdb_control_tcp_tickle_list *list;
3944 struct ctdb_tcp_array *tcparray;
3946 struct ctdb_vnn *vnn;
3948 vnn = find_public_ip_vnn(ctdb, addr);
3950 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3951 ctdb_addr_to_str(addr)));
3956 tcparray = vnn->tcp_array;
3958 num = tcparray->num;
3963 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3964 tickles.connections)
3965 + sizeof(struct ctdb_tcp_connection) * num;
3967 outdata->dptr = talloc_size(outdata, outdata->dsize);
3968 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3969 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3972 list->tickles.num = num;
3974 memcpy(&list->tickles.connections[0], tcparray->connections,
3975 sizeof(struct ctdb_tcp_connection) * num);
3983 set the list of all tcp tickles for a public address
3985 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3986 ctdb_sock_addr *addr,
3987 struct ctdb_tcp_array *tcparray)
3991 struct ctdb_control_tcp_tickle_list *list;
3994 num = tcparray->num;
3999 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
4000 tickles.connections) +
4001 sizeof(struct ctdb_tcp_connection) * num;
4002 data.dptr = talloc_size(ctdb, data.dsize);
4003 CTDB_NO_MEMORY(ctdb, data.dptr);
4005 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
4007 list->tickles.num = num;
4009 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
4012 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
4013 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4014 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4016 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4020 talloc_free(data.dptr);
4027 perform tickle updates if required
4029 static void ctdb_update_tcp_tickles(struct event_context *ev,
4030 struct timed_event *te,
4031 struct timeval t, void *private_data)
4033 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4035 struct ctdb_vnn *vnn;
4037 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4038 /* we only send out updates for public addresses that
4041 if (ctdb->pnn != vnn->pnn) {
4044 /* We only send out the updates if we need to */
4045 if (!vnn->tcp_update_needed) {
4048 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
4049 &vnn->public_address,
4052 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4053 ctdb_addr_to_str(&vnn->public_address)));
4055 vnn->tcp_update_needed = false;
4059 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4060 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
4061 ctdb_update_tcp_tickles, ctdb);
4066 start periodic update of tcp tickles
/*
  Start the periodic tickle update.

  Creates a fresh talloc context under ctdb, stores it in
  ctdb->tickle_update_context and registers the first run of
  ctdb_update_tcp_tickles() on that context, due
  tunable.tickle_update_interval from now. The handler re-registers
  itself on every run, which is what makes the update periodic.
  NOTE(review): the result of talloc_new() is not checked on any visible
  line - confirm.
 */
4068 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4070 ctdb->tickle_update_context = talloc_new(ctdb);
4072 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4073 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
4074 ctdb_update_tcp_tickles, ctdb);
/*
  State handed to send_gratious_arp() through the timed event.
  send_gratious_arp() also reads arp->iface and arp->count.
  NOTE(review): the declarations of those two members are not visible in
  this listing - confirm their types.
 */
4080 struct control_gratious_arp {
4081 struct ctdb_context *ctdb;	/* used to reach the event context */
4082 ctdb_sock_addr addr;	/* address passed to ctdb_sys_send_arp() */
4088 send a gratuitous ARP
/*
  Timed event handler that transmits one gratuitous ARP.

  private_data is a struct control_gratious_arp. Each run calls
  ctdb_sys_send_arp() for arp->addr on arp->iface and logs a failure
  together with strerror(errno). It then compares arp->count with
  CTDB_ARP_REPEAT and registers itself again, CTDB_ARP_INTERVAL later,
  with arp as the talloc parent of the new event.
  NOTE(review): the increment of arp->count and the body taken when the
  repeat limit is reached are not visible in this listing; presumably
  the state is freed and the handler returns without rescheduling -
  confirm.
 */
4090 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
4091 struct timeval t, void *private_data)
4094 struct control_gratious_arp *arp = talloc_get_type(private_data,
4095 struct control_gratious_arp);
4097 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4099 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4100 arp->iface, strerror(errno)));
4105 if (arp->count == CTDB_ARP_REPEAT) {
4110 event_add_timed(arp->ctdb->ev, arp,
4111 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4112 send_gratious_arp, arp);
/*
  Control handler: start a series of gratuitous ARPs.

  indata is interpreted as a struct ctdb_control_gratious_arp. Two size
  checks are made before any member beyond the header is trusted:
  indata.dsize must cover the fixed part up to the iface member, and it
  is then compared against fixed part plus gratious_arp->len.
  NOTE(review): the comparison operator of the second check sits on a
  line that is not visible here; the log text suggests an exact match is
  required - confirm.

  On success a struct control_gratious_arp is allocated under ctdb, the
  address and a copy of the interface name are stored in it, and
  send_gratious_arp() is scheduled with timeval_zero(), i.e. without
  delay.
  NOTE(review): talloc_strdup() relies on the iface bytes being NUL
  terminated inside len - confirm the sender guarantees that.
  NOTE(review): if CTDB_NO_MEMORY returns after the strdup fails, arp
  appears to stay attached to ctdb - confirm whether it should be freed.
  NOTE(review): return statements are not visible in this listing.
 */
4119 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4121 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4122 struct control_gratious_arp *arp;
4124 /* verify the size of indata */
4125 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4126 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4127 (unsigned)indata.dsize,
4128 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4132 ( offsetof(struct ctdb_control_gratious_arp, iface)
4133 + gratious_arp->len ) ){
4135 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4136 "but should be %u bytes\n",
4137 (unsigned)indata.dsize,
4138 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4143 arp = talloc(ctdb, struct control_gratious_arp);
4144 CTDB_NO_MEMORY(ctdb, arp);
4147 arp->addr = gratious_arp->addr;
/* the name is copied so it outlives indata; it is owned by arp */
4148 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4149 CTDB_NO_MEMORY(ctdb, arp->iface);
/* first transmission is due immediately */
4152 event_add_timed(arp->ctdb->ev, arp,
4153 timeval_zero(), send_gratious_arp, arp);
/*
  Control handler: add a public address to this node.

  indata is interpreted as a struct ctdb_control_ip_iface. Its size is
  checked against the fixed part up to the iface member and then against
  fixed part plus pub->len.
  NOTE(review): the line holding the second comparison is only partly
  visible in this listing - confirm the exact condition.

  The request is logged at DEBUG_NOTICE and passed on to
  ctdb_add_public_address() with the address, mask and interface string
  from the payload; the last argument is true. A failure of that call is
  logged at DEBUG_ERR.
  NOTE(review): return statements are not visible in this listing.
 */
4158 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4160 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4163 /* verify the size of indata */
4164 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4165 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4169 ( offsetof(struct ctdb_control_ip_iface, iface)
4172 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4173 "but should be %u bytes\n",
4174 (unsigned)indata.dsize,
4175 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4179 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4181 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4184 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
  State kept while a del_public_address request waits for the release
  of the address; delete_ip_callback() uses it to answer the request.
 */
4191 struct delete_ip_callback_state {
4192 struct ctdb_req_control *c;	/* pending control that gets the reply */
4196 called when releaseip event finishes for del_public_address
/*
  Completion callback for the release triggered by del_public_address.

  private_data is a struct delete_ip_callback_state. The status and
  errormsg received here are forwarded unchanged as the reply to the
  pending control state->c, so a failed release fails the delete
  request. Afterwards the state is freed.
  The data argument is not used on any visible line.
 */
4198 static void delete_ip_callback(struct ctdb_context *ctdb,
4199 int32_t status, TDB_DATA data,
4200 const char *errormsg,
4203 struct delete_ip_callback_state *state =
4204 talloc_get_type(private_data, struct delete_ip_callback_state);
4206 /* If release failed then fail. */
4207 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4208 talloc_free(private_data);
/*
  Control handler: remove a public address from this node.

  indata is interpreted as a struct ctdb_control_ip_iface and size
  checked in the same two steps as in the add handler. The list
  ctdb->vnn is then searched with ctdb_same_ip() for the address.

  Match hosted by this node (vnn->pnn equals ctdb->pnn):
    vnn->delete_pending is set, a delete_ip_callback_state is allocated
    under ctdb, a ctdb_public_ip carrying the address is allocated under
    that state and sent with CTDB_CONTROL_RELEASE_IP. The request c is
    then stolen onto the state and *async_reply is set, so the reply is
    produced later. NOTE(review): the callback argument of the send is
    not visible in this listing; presumably delete_ip_callback - confirm.
    NOTE(review): delete_pending is set before the allocations; no
    visible line resets it when they fail - confirm.
  Match hosted elsewhere:
    do_delete_ip() is called for the entry.
  No match:
    the request is logged at DEBUG_ERR as an unknown public IP address.

  NOTE(review): return statements are not visible in this listing.
 */
4211 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4212 struct ctdb_req_control *c,
4213 TDB_DATA indata, bool *async_reply)
4215 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4216 struct ctdb_vnn *vnn;
4218 /* verify the size of indata */
4219 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4220 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4224 ( offsetof(struct ctdb_control_ip_iface, iface)
4227 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4228 "but should be %u bytes\n",
4229 (unsigned)indata.dsize,
4230 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4234 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4236 /* walk over all public addresses until we find a match */
4237 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4238 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4239 if (vnn->pnn == ctdb->pnn) {
4240 struct delete_ip_callback_state *state;
4241 struct ctdb_public_ip *ip;
4245 vnn->delete_pending = true;
4247 state = talloc(ctdb,
4248 struct delete_ip_callback_state);
4249 CTDB_NO_MEMORY(ctdb, state);
/* ip hangs off state, so freeing state releases the payload too */
4252 ip = talloc(state, struct ctdb_public_ip);
4255 (__location__ " Out of memory\n"));
4260 ip->addr = pub->addr;
4262 data.dsize = sizeof(struct ctdb_public_ip);
4263 data.dptr = (unsigned char *)ip;
4265 ret = ctdb_daemon_send_control(ctdb,
4268 CTDB_CONTROL_RELEASE_IP,
4275 (__location__ "Unable to send "
4276 "CTDB_CONTROL_RELEASE_IP\n"));
/* keep the request alive until the release has finished */
4281 state->c = talloc_steal(state, c);
4282 *async_reply = true;
4284 /* This IP is not hosted on the
4285 * current node so just delete it
4287 do_delete_ip(ctdb, vnn);
4294 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4295 ctdb_addr_to_str(&pub->addr)));
/*
  State kept while the ipreallocated event runs;
  ctdb_ipreallocated_callback() uses it to answer the request.
 */
4300 struct ipreallocated_callback_state {
4301 struct ctdb_req_control *c;	/* pending control that gets the reply */
/*
  Completion callback of the ipreallocated event script.

  p is a struct ipreallocated_callback_state. A failure is logged
  together with the status, and a status of -ETIME additionally makes
  the node ban itself through ctdb_ban_self(). The status is then sent
  back as the reply to the pending control state->c.
  NOTE(review): the condition around the failure branch and the release
  of the state are not visible in this listing - confirm.
 */
4304 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4305 int status, void *p)
4307 struct ipreallocated_callback_state *state =
4308 talloc_get_type(p, struct ipreallocated_callback_state);
4312 (" \"ipreallocated\" event script failed (status %d)\n",
4314 if (status == -ETIME) {
4315 ctdb_ban_self(ctdb);
4319 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4323 /* A control to run the ipreallocated event */
/*
  Allocates an ipreallocated_callback_state under ctdb and starts the
  CTDB_EVENT_IPREALLOCATED event script with
  ctdb_ipreallocated_callback() as completion callback. The state is
  passed both as the talloc context of the script run and as the
  callback argument. A failure to start the script is logged at
  DEBUG_ERR.

  Once the script is running, the request c is stolen onto the state and
  *async_reply is set; the reply is sent from the callback.
  NOTE(review): part of the parameter list, the failure cleanup and the
  return statements are not visible in this listing.
 */
4324 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4325 struct ctdb_req_control *c,
4329 struct ipreallocated_callback_state *state;
4331 state = talloc(ctdb, struct ipreallocated_callback_state);
4332 CTDB_NO_MEMORY(ctdb, state);
4334 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4336 ret = ctdb_event_script_callback(ctdb, state,
4337 ctdb_ipreallocated_callback, state,
4338 CTDB_EVENT_IPREALLOCATED,
4342 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4347 /* tell the control that we will reply asynchronously */
4348 state->c = talloc_steal(state, c);
4349 *async_reply = true;
4355 /* This function is called from the recovery daemon to verify that a remote
4356 node has the expected ip allocation.
4357 This is verified against ctdb->ip_tree
/*
  Compare the public IP list reported by a node with ctdb->ip_tree.

  ctdb - context holding the expected assignment in ip_tree
  ips  - addresses and owners as reported by the remote node
  The node number used in the log messages is named pnn; NOTE(review):
  its declaration is on a line not visible in this listing.

  Without an ip_tree the expected assignment is unknown and the check
  is not performed. Otherwise every reported address is looked up
  with ip_key(). An address missing from the tree is logged as new or
  unknown. Entries where either side has pnn -1 get separate handling
  that is not visible here (presumably they are skipped). A differing
  pnn is logged as an inconsistent allocation.
  NOTE(review): return values are not visible in this listing.
 */
4359 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4360 struct ctdb_all_public_ips *ips,
4363 struct ctdb_public_ip_list *tmp_ip;
4366 if (ctdb->ip_tree == NULL) {
4367 /* dont know the expected allocation yet, assume remote node
4376 for (i=0; i<ips->num; i++) {
4377 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4378 if (tmp_ip == NULL) {
4379 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4383 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4387 if (tmp_ip->pnn != ips->ips[i].pnn) {
4389 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4391 ctdb_addr_to_str(&ips->ips[i].addr),
4392 ips->ips[i].pnn, tmp_ip->pnn));
/*
  Record a new owner for one public address in ctdb->ip_tree.

  ip carries the address and the pnn to store. The call logs an error
  when no ip_tree exists yet or when the address has no record in the
  tree. Otherwise the change is logged at DEBUG_NOTICE with the old and
  new pnn and the record is updated in place.
  NOTE(review): return statements are not visible in this listing.
 */
4400 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4402 struct ctdb_public_ip_list *tmp_ip;
4404 if (ctdb->ip_tree == NULL) {
4405 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4409 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4410 if (tmp_ip == NULL) {
4411 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4415 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4416 tmp_ip->pnn = ip->pnn;
/*
  Bookkeeping for one running reloadips request.
  Other code in this file also uses h->status, h->fd and h->child.
  NOTE(review): the declarations of those members are not visible in
  this listing - confirm their types.
 */
4422 struct ctdb_reloadips_handle {
4423 struct ctdb_context *ctdb;
4424 struct ctdb_req_control *c;	/* request answered by the destructor */
4428 struct fd_event *fde;	/* read event on the pipe from the child */
/*
  talloc destructor of a ctdb_reloadips_handle.

  Clears ctdb->reload_ips when it still points at this handle, sends
  h->status as the reply to the pending control h->c and sends SIGKILL
  to the child process h->child. Freeing the handle is therefore what
  completes the request.
  NOTE(review): the return statement is not visible in this listing.
 */
4431 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4433 if (h == h->ctdb->reload_ips) {
4434 h->ctdb->reload_ips = NULL;
4437 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4440 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
  Timed event handler for a reloadips request that takes too long.
  It is registered by ctdb_control_reload_public_ips() with the handle
  as private_data.
  NOTE(review): the action taken on the handle is not visible in this
  listing; presumably the handle is freed, which would run its
  destructor - confirm.
 */
4444 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4445 struct timed_event *te,
4446 struct timeval t, void *private_data)
4448 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
  fd event handler for the read end of the pipe to the reloadips child.

  Reads the single result byte written by the child. A short read or a
  non-zero byte is logged as an error of the child process.
  NOTE(review): how h->status is set and when the handle is released is
  not visible in this listing - confirm.
 */
4453 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
4454 uint16_t flags, void *private_data)
4456 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4461 ret = read(h->fd[0], &res, 1);
4462 if (ret < 1 || res != 0) {
4463 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
  Body of the reloadips child process: bring the public addresses of
  the local node in line with the public addresses file.

  Steps visible in this listing:
   1. fetch the addresses currently known to the local node with
      ctdb_ctrl_get_public_ips() (result in ips)
   2. re-read the addresses file with ctdb_set_public_addresses(), which
      fills ctdb->vnn
   3. every address in ips without a match in ctdb->vnn is removed with
      CTDB_CONTROL_DEL_PUBLIC_IP
   4. every entry of ctdb->vnn without a match in ips is added with
      CTDB_CONTROL_ADD_PUBLIC_IP; its interface names are joined into
      one comma separated string for the payload
   5. all controls are sent to CTDB_CURRENT_NODE, collected in
      async_data and awaited with ctdb_client_async_wait()

  All temporary allocations except the joined interface string hang off
  mem_ctx, which is freed on the visible exit paths.
  NOTE(review): conditions, cleanup after failed sends and the return
  statements are on lines not visible here; the caller treats the
  result as a status byte where 0 means success.
 */
4471 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4473 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4474 struct ctdb_all_public_ips *ips;
4475 struct ctdb_vnn *vnn;
4476 struct client_async_data *async_data;
4477 struct timeval timeout;
4479 struct ctdb_client_control_state *state;
4483 CTDB_NO_MEMORY(ctdb, mem_ctx);
4485 /* Read IPs from local node */
4486 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4487 CTDB_CURRENT_NODE, mem_ctx, &ips);
4490 ("Unable to fetch public IPs from local node\n"));
4491 talloc_free(mem_ctx);
4495 /* Read IPs file - this is safe since this is a child process */
4497 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4498 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4499 talloc_free(mem_ctx);
4503 async_data = talloc_zero(mem_ctx, struct client_async_data);
4504 CTDB_NO_MEMORY(ctdb, async_data);
4506 /* Compare IPs between node and file for IPs to be deleted */
4507 for (i = 0; i < ips->num; i++) {
4509 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4510 if (ctdb_same_ip(&vnn->public_address,
4511 &ips->ips[i].addr)) {
4512 /* IP is still in file */
4518 /* Delete IP ips->ips[i] */
4519 struct ctdb_control_ip_iface *pub;
4522 ("IP %s no longer configured, deleting it\n",
4523 ctdb_addr_to_str(&ips->ips[i].addr)));
/* zeroed request: only the address is filled in on visible lines */
4525 pub = talloc_zero(mem_ctx,
4526 struct ctdb_control_ip_iface);
4527 CTDB_NO_MEMORY(ctdb, pub);
4529 pub->addr = ips->ips[i].addr;
4533 timeout = TAKEOVER_TIMEOUT();
4535 data.dsize = offsetof(struct ctdb_control_ip_iface,
4537 data.dptr = (uint8_t *)pub;
4539 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4540 CTDB_CONTROL_DEL_PUBLIC_IP,
4541 0, data, async_data,
4543 if (state == NULL) {
4546 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4550 ctdb_client_async_add(async_data, state);
4554 /* Compare IPs between node and file for IPs to be added */
4556 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4557 for (i = 0; i < ips->num; i++) {
4558 if (ctdb_same_ip(&vnn->public_address,
4559 &ips->ips[i].addr)) {
4560 /* IP already on node */
/* the inner loop ran to the end, so the node does not know this IP */
4564 if (i == ips->num) {
4565 /* Add IP vnn->public_address */
4566 struct ctdb_control_ip_iface *pub;
4567 const char *ifaces = NULL;
4572 ("New IP %s configured, adding it\n",
4573 ctdb_addr_to_str(&vnn->public_address)));
/* broadcast our pnn on CTDB_SRVID_REBALANCE_NODE; per the warning
   below this is meant to force a reallocation. NOTE(review): the
   condition guarding this message is not visible in this listing */
4575 uint32_t pnn = ctdb_get_pnn(ctdb);
4577 data.dsize = sizeof(pnn);
4578 data.dptr = (uint8_t *)&pnn;
4580 ret = ctdb_client_send_message(
4582 CTDB_BROADCAST_CONNECTED,
4583 CTDB_SRVID_REBALANCE_NODE,
4586 DEBUG(DEBUG_WARNING,
4587 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/* build the comma separated interface list, first name as the seed */
4593 ifaces = vnn->ifaces[0];
4595 while (vnn->ifaces[iface] != NULL) {
/* NOTE(review): the joined string is parented to vnn, not mem_ctx, and
   its result is not checked on any visible line - confirm */
4596 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4597 vnn->ifaces[iface]);
/* len includes the terminating NUL */
4601 len = strlen(ifaces) + 1;
4602 pub = talloc_zero_size(mem_ctx,
4603 offsetof(struct ctdb_control_ip_iface, iface) + len);
4604 CTDB_NO_MEMORY(ctdb, pub);
4606 pub->addr = vnn->public_address;
4607 pub->mask = vnn->public_netmask_bits;
4609 memcpy(&pub->iface[0], ifaces, pub->len);
4611 timeout = TAKEOVER_TIMEOUT();
4613 data.dsize = offsetof(struct ctdb_control_ip_iface,
4615 data.dptr = (uint8_t *)pub;
4617 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4618 CTDB_CONTROL_ADD_PUBLIC_IP,
4619 0, data, async_data,
4621 if (state == NULL) {
4624 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4628 ctdb_client_async_add(async_data, state);
/* wait for every add and delete control queued above */
4632 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4633 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4637 talloc_free(mem_ctx);
4641 talloc_free(mem_ctx);
4645 /* This control is sent to force the node to re-read the public addresses file
4646 and drop any addresses we should no longer host, and add new addresses
4647 that we are now able to host
/*
  Control handler: reload the public addresses file in a child process.

  Only one reload is tracked at a time: a handle still stored in
  ctdb->reload_ips is freed first. A new handle is allocated under
  ctdb, a pipe is created and a child is forked.

  Child: switches to client mode, runs ctdb_reloadips_child(), writes
  the one byte result into the pipe and then keeps polling the parent
  pid with signal 0 until the parent is gone.

  Parent: steals the request c onto the handle, installs
  ctdb_reloadips_destructor(), watches the read end of the pipe with
  ctdb_reloadips_child_handler() and arms a 120 second timer running
  ctdb_reloadips_timeout_event(). *async_reply is set; the reply is
  sent by the destructor when the handle is freed.

  NOTE(review): error cleanup, the assignment of ctdb->reload_ips and
  the return statements are on lines not visible in this listing.
 */
4649 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4651 struct ctdb_reloadips_handle *h;
/* taken before the fork so the child can watch for the parent exiting */
4652 pid_t parent = getpid();
4654 if (ctdb->reload_ips != NULL) {
4655 talloc_free(ctdb->reload_ips);
4656 ctdb->reload_ips = NULL;
4659 h = talloc(ctdb, struct ctdb_reloadips_handle);
4660 CTDB_NO_MEMORY(ctdb, h);
/* NOTE(review): the log text below names ctdb_freeze_lock although this
   is the reloadips path - looks copied, confirm before changing */
4665 if (pipe(h->fd) == -1) {
4666 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4671 h->child = ctdb_fork(ctdb);
4672 if (h->child == (pid_t)-1) {
4673 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
/* child side */
4681 if (h->child == 0) {
4682 signed char res = 0;
4685 debug_extra = talloc_asprintf(NULL, "reloadips:");
4687 ctdb_set_process_name("ctdb_reloadips");
4688 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4689 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4692 res = ctdb_reloadips_child(ctdb);
4694 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
/* report the result byte to the parent.
   NOTE(review): the result of write() is ignored */
4698 write(h->fd[1], &res, 1);
4699 /* make sure we die when our parent dies */
4700 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* parent side: the handle now owns the request */
4706 h->c = talloc_steal(h, c);
4709 set_close_on_exec(h->fd[0]);
4711 talloc_set_destructor(h, ctdb_reloadips_destructor);
4714 h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4715 EVENT_FD_READ, ctdb_reloadips_child_handler,
4717 tevent_fd_set_auto_close(h->fde);
/* upper bound for the whole reload */
4719 event_add_timed(ctdb->ev, h,
4720 timeval_current_ofs(120, 0),
4721 ctdb_reloadips_timeout_event, h);
4723 /* we reply later */
4724 *async_reply = true;