From fa72d7d50beb7155234a8e15fa5c8443a3c34eaf Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg
Date: Mon, 17 Oct 2011 12:11:54 +1100
Subject: [PATCH] S1031575

When performing addip we don't allow "gratuitous failover", which can, due
to timing and depending on the order in which "ctdb addip ..." is called
on the nodes, lead to imperfect balancing of the ip addresses when adding
several at the same time.

This patch makes sure that once the ip address is added to a node, any
node, this ip address is released from the node currently hosting the
address, and there will possibly be a failover after a short while, while
performing the rebalance of the ip address.

This means that when performing "ctdb addip ..." and adding it to a new
node, this could affect/disrupt the i/o on this address to the node
currently hosting the address, but it will mean we do get a more even
distribution after the assignment.

This is based on the assumption that it will be more common to "add a
completely new ip to a set of nodes" rather than "add an ip address that
is already in service to a brand new node".
---
 tools/ctdb.c | 100 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 87 insertions(+), 13 deletions(-)

diff --git a/tools/ctdb.c b/tools/ctdb.c
index 6cada94e..d49bc8f8 100644
--- a/tools/ctdb.c
+++ b/tools/ctdb.c
@@ -1325,6 +1325,87 @@ static int control_moveip(struct ctdb_context *ctdb, int argc, const char **argv
 	return 0;
 }
 
+/*
+  Broadcast CTDB_SRVID_DISABLE_IP_CHECK (with a value of 30) to all
+  connected nodes, then send CTDB_CONTROL_RELEASE_IP for addr, with the
+  pnn set to -1, to every active node.
+  Returns 0 on success and non-zero on failure.
+ */
+static int rebalance_ip(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
+{
+	struct ctdb_public_ip ip;
+	int ret;
+	uint32_t *nodes;
+	uint32_t disable_time;
+	TDB_DATA data;
+	struct ctdb_node_map *nodemap=NULL;
+	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+	disable_time = 30;
+	data.dptr = (uint8_t*)&disable_time;
+	data.dsize = sizeof(disable_time);
+	ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_DISABLE_IP_CHECK, data);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send message to disable ipcheck\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	ip.pnn = -1;
+	ip.addr = *addr;
+
+	data.dptr = (uint8_t *)&ip;
+	data.dsize = sizeof(ip);
+
+	ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+		talloc_free(tmp_ctx);
+		return ret;
+	}
+
+	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+	ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_RELEASE_IP,
+					nodes, 0,
+					LONGTIMELIMIT(),
+					false, data,
+					NULL, NULL,
+					NULL);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,("Failed to release IP on nodes\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
+
+/*
+  release an ip from all nodes and have it re-assigned by recd
+ */
+static int control_rebalanceip(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	ctdb_sock_addr addr;
+
+	if (argc < 1) {
+		usage();
+		return -1;
+	}
+
+	if (parse_ip(argv[0], NULL, 0, &addr) == 0) {
+		DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+		return -1;
+	}
+
+	if (rebalance_ip(ctdb, &addr) != 0) {
+		DEBUG(DEBUG_ERR,("Error when trying to reassign ip\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
 void getips_store_callback(void *param, void *data)
 {
 	struct ctdb_public_ip *node_ip = (struct ctdb_public_ip *)data;
@@ -1661,7 +1742,7 @@ static int control_addip(struct ctdb_context *ctdb, int argc, const char **argv)
 
 
 	/* Dont timeout. This command waits for an ip reallocation
-	   which sometimes can take wuite a while if there has
+	   which sometimes can take quite a while if there has
 	   been a recent recovery
 	*/
 	alarm(0);
@@ -1689,18 +1770,10 @@ static int control_addip(struct ctdb_context *ctdb, int argc, const char **argv)
 		return ret;
 	}
 
-	do {
-		ret = control_ipreallocate(ctdb, argc, argv);
-		if (ret != 0) {
-			DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u. Wait 3 seconds and try again.\n", options.pnn));
-			sleep(3);
-			retries++;
-		}
-	} while (retries < 5 && ret != 0);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u. Giving up.\n", options.pnn));
-		talloc_free(tmp_ctx);
-		return ret;
+	if (rebalance_ip(ctdb, &addr) != 0) {
+		DEBUG(DEBUG_ERR,("Error when trying to reassign ip\n"));
+		talloc_free(tmp_ctx);
+		return -1;
 	}
 
 	talloc_free(tmp_ctx);
@@ -4925,6 +4998,7 @@ static const struct {
 	{ "listnodes", control_listnodes, false, true, "list all nodes in the cluster"},
 	{ "reloadnodes", control_reload_nodes_file, false, false, "reload the nodes file and restart the transport on all nodes"},
 	{ "moveip", control_moveip, false, false, "move/failover an ip address to another node", " "},
+	{ "rebalanceip", control_rebalanceip, false, false, "release an ip from the node and let recd rebalance it", ""},
 	{ "addip", control_addip, true, false, "add a ip address to a node", " "},
 	{ "delip", control_delip, false, false, "delete an ip address from a node", ""},
 	{ "eventscript", control_eventscript, true, false, "run the eventscript with the given parameters on a node", ""},
-- 
2.34.1