ctdb-takeover: Add takeover helper
author Martin Schwenke <martin@meltin.net>
Thu, 10 Nov 2016 05:47:38 +0000 (16:47 +1100)
committer Amitay Isaacs <amitay@samba.org>
Mon, 19 Dec 2016 03:07:08 +0000 (04:07 +0100)
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/packaging/RPM/ctdb.spec.in
ctdb/server/ctdb_takeover_helper.c [new file with mode: 0644]
ctdb/wscript

index d6e51287a6a9c918b125393971b0fb1d208e3eea..41573d20babb4c56824dbf1829e2e893bdc75681 100644 (file)
@@ -216,6 +216,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libexecdir}/ctdb/ctdb_eventd
 %{_libexecdir}/ctdb/ctdb_lock_helper
 %{_libexecdir}/ctdb/ctdb_recovery_helper
+%{_libexecdir}/ctdb/ctdb_takeover_helper
 %{_libexecdir}/ctdb/ctdb_mutex_fcntl_helper
 %{_libexecdir}/ctdb/ctdb_event
 %{_libexecdir}/ctdb/ctdb_natgw
diff --git a/ctdb/server/ctdb_takeover_helper.c b/ctdb/server/ctdb_takeover_helper.c
new file mode 100644 (file)
index 0000000..847a49d
--- /dev/null
@@ -0,0 +1,1206 @@
+/*
+   CTDB IP takeover helper
+
+   Copyright (C) Martin Schwenke  2016
+
+   Based on ctdb_recovery_helper.c
+   Copyright (C) Amitay Isaacs  2015
+
+   and ctdb_takeover.c
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <popt.h>
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/strv.h"
+#include "lib/util/strv_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/time.h"
+#include "lib/util/tevent_unix.h"
+
+#include "protocol/protocol.h"
+#include "protocol/protocol_api.h"
+#include "client/client.h"
+
+#include "common/logging.h"
+
+#include "server/ipalloc.h"
+
+static int takeover_timeout = 9; /* seconds; replaced by the takeover_timeout tunable once GET_ALL_TUNABLES has been parsed */
+
+#define TIMEOUT()      timeval_current_ofs(takeover_timeout, 0) /* deadline for a single control, takeover_timeout seconds from the time of the call */
+
+/*
+ * Utility functions
+ */
+
+/*
+ * Shared receive helper for requests that return no data.
+ *
+ * Returns true if the request finished without error.  Otherwise
+ * returns false and, when perr is not NULL, stores the unix error
+ * code in *perr.
+ */
+static bool generic_recv(struct tevent_req *req, int *perr)
+{
+       int unix_err;
+       bool failed;
+
+       failed = tevent_req_is_unix_error(req, &unix_err);
+       if (failed && perr != NULL) {
+               *perr = unix_err;
+       }
+
+       return !failed;
+}
+
+/*
+ * Map the ip_alloc_algorithm tunable to an IP allocation algorithm:
+ * 0 selects deterministic, 1 selects non-deterministic, and 2 or any
+ * other value selects LCP2.
+ */
+static enum ipalloc_algorithm
+determine_algorithm(const struct ctdb_tunable_list *tunables)
+{
+       if (tunables->ip_alloc_algorithm == 0) {
+               return IPALLOC_DETERMINISTIC;
+       }
+
+       if (tunables->ip_alloc_algorithm == 1) {
+               return IPALLOC_NONDETERMINISTIC;
+       }
+
+       /* 2 and every unrecognised value */
+       return IPALLOC_LCP2;
+}
+
+/**********************************************************************/
+
+struct get_public_ips_state { /* state of one GET_PUBLIC_IPS request sent to several nodes */
+       struct tevent_context *ev; /* not assigned by get_public_ips_send() */
+       struct ctdb_client_context *client; /* not assigned by get_public_ips_send() */
+       uint32_t *pnns; /* PNNs of the target nodes; the caller's pointer is stored, not copied */
+       int count; /* number of entries in pnns */
+       struct ctdb_public_ip_list *ips; /* per-node results, written at index pnn; NULL until the replies are parsed */
+};
+
+static void get_public_ips_done(struct tevent_req *subreq);
+
+/*
+ * Send a GET_PUBLIC_IPS control to each node in pnns[0..count-1].
+ *
+ * mem_ctx        - talloc parent of the returned request
+ * ev             - event context used to drive the request
+ * client         - ctdb client context the controls are sent through
+ * pnns, count    - target nodes; the pointer is stored, not copied,
+ *                  so it must stay valid until the request completes
+ * available_only - passed through to the GET_PUBLIC_IPS control
+ *
+ * Returns NULL if the request cannot be allocated.  Otherwise returns
+ * a request whose result is collected with get_public_ips_recv().
+ */
+static struct tevent_req *get_public_ips_send(
+                               TALLOC_CTX *mem_ctx,
+                               struct tevent_context *ev,
+                               struct ctdb_client_context *client,
+                               uint32_t *pnns,
+                               int count,
+                               bool available_only)
+{
+       struct tevent_req *req, *subreq;
+       struct get_public_ips_state *state;
+       struct ctdb_req_control request;
+
+       req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
+       if (req == NULL) {
+               /* There is no request to post; tevent_req_post()
+                * would dereference the NULL pointer. */
+               return NULL;
+       }
+
+       state->pnns = pnns;
+       state->count = count;
+       state->ips = NULL;
+
+       ctdb_req_control_get_public_ips(&request, available_only);
+
+       /* Parent the sub-request on state, like the other senders in
+        * this file, so that it can never outlive req. */
+       subreq = ctdb_client_control_multi_send(state, ev, client,
+                                               state->pnns,
+                                               state->count,
+                                               TIMEOUT(), &request);
+       if (tevent_req_nomem(subreq, req)) {
+               return tevent_req_post(req, ev);
+       }
+       tevent_req_set_callback(subreq, get_public_ips_done, req);
+
+       return req;
+}
+
+/*
+ * Completion handler for the GET_PUBLIC_IPS controls.
+ *
+ * On failure the request fails with the error reported by the
+ * multi-control.  On success each reply is parsed and stored in
+ * state->ips at the index given by the replying node's PNN.  A reply
+ * that cannot be parsed fails the request with EIO.
+ */
+static void get_public_ips_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct get_public_ips_state *state = tevent_req_data(
+               req, struct get_public_ips_state);
+       struct ctdb_reply_control **reply;
+       int *err_list;
+       uint32_t max_pnn = 0;
+       int ret, i;
+       bool status;
+
+       status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
+                                               &reply);
+       TALLOC_FREE(subreq);
+       if (! status) {
+               int ret2;
+               uint32_t pnn;
+
+               ret2 = ctdb_client_control_multi_error(state->pnns,
+                                                      state->count,
+                                                      err_list, &pnn);
+               if (ret2 != 0) {
+                       D_ERR("control GET_PUBLIC_IPS failed on "
+                             "node %u, ret=%d\n", pnn, ret2);
+               } else {
+                       D_ERR("control GET_PUBLIC_IPS failed, "
+                             "ret=%d\n", ret);
+               }
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       /* The result array is indexed by PNN, not by position in
+        * pnns[], so it has to reach the highest PNN queried.  Sizing
+        * it by count overflows when the PNNs are not dense (for
+        * example when only nodes 0 and 2 are queried).
+        *
+        * NOTE(review): a consumer that indexes this array by the
+        * total number of nodes would need the node count passed in
+        * by the caller - confirm against ipalloc_set_public_ips().
+        */
+       for (i = 0; i < state->count; i++) {
+               if (state->pnns[i] > max_pnn) {
+                       max_pnn = state->pnns[i];
+               }
+       }
+
+       state->ips = talloc_zero_array(state, struct ctdb_public_ip_list,
+                                      max_pnn + 1);
+       if (tevent_req_nomem(state->ips, req)) {
+               return;
+       }
+
+       for (i = 0; i < state->count; i++) {
+               uint32_t pnn;
+               struct ctdb_public_ip_list *ips;
+
+               pnn = state->pnns[i];
+               ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
+                                                       &ips);
+               if (ret != 0) {
+                       D_ERR("control GET_PUBLIC_IPS failed on "
+                             "node %u\n", pnn);
+                       tevent_req_error(req, EIO);
+                       return;
+               }
+               state->ips[pnn] = *ips;
+       }
+
+       talloc_free(reply);
+
+       tevent_req_done(req);
+}
+
+/*
+ * Collect the result of get_public_ips_send().
+ *
+ * On success returns true and moves the per-node IP list array onto
+ * mem_ctx, returning it through *ips.  On failure returns false and,
+ * when perr is not NULL, stores the unix error code in *perr.
+ */
+static bool get_public_ips_recv(struct tevent_req *req, int *perr,
+                               TALLOC_CTX *mem_ctx,
+                               struct ctdb_public_ip_list **ips)
+{
+       struct get_public_ips_state *state = tevent_req_data(
+               req, struct get_public_ips_state);
+       int unix_err;
+
+       if (! tevent_req_is_unix_error(req, &unix_err)) {
+               *ips = talloc_steal(mem_ctx, state->ips);
+               return true;
+       }
+
+       if (perr != NULL) {
+               *perr = unix_err;
+       }
+
+       return false;
+}
+
+/**********************************************************************/
+
+struct release_ip_state { /* state shared by all RELEASE_IP controls of one request */
+       int num_sent; /* number of per-address multi-controls sent */
+       int num_replies; /* number of those that have completed */
+       int num_fails; /* number of those that completed with a failure */
+       int err_any; /* most recently recorded error; reported when num_fails > 0 */
+       uint32_t *ban_credits; /* caller's array, indexed by PNN, incremented for each node that fails */
+};
+
+struct release_ip_one_state { /* state of the RELEASE_IP controls for one address */
+       struct tevent_req *req; /* the overall release_ip request */
+       uint32_t *pnns; /* nodes that are sent the release for this address */
+       int count; /* number of entries used in pnns */
+       const char *ip_str; /* printable form of the address, used in log messages */
+};
+
+static void release_ip_done(struct tevent_req *subreq);
+
+/*
+ * For every address in all_ips, send RELEASE_IP to each node in
+ * pnns[0..count-1] that is not the address's assigned node.
+ *
+ * mem_ctx     - talloc parent of the returned request
+ * ev          - event context used to drive the request
+ * client      - ctdb client context the controls are sent through
+ * pnns, count - candidate nodes
+ * timeout     - absolute deadline applied to every control
+ * all_ips     - linked list of addresses with their assigned node
+ * ban_credits - array indexed by PNN; release_ip_done() increments
+ *               the entry of every node that fails
+ *
+ * Returns NULL if the request cannot be allocated.  Otherwise returns
+ * a request that completes when every control has been answered, or
+ * immediately if there was nothing to send.  Collect the result with
+ * release_ip_recv().
+ */
+static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
+                                         struct tevent_context *ev,
+                                         struct ctdb_client_context *client,
+                                         uint32_t *pnns,
+                                         int count,
+                                         struct timeval timeout,
+                                         struct public_ip_list *all_ips,
+                                         uint32_t *ban_credits)
+{
+       struct tevent_req *req, *subreq;
+       struct release_ip_state *state;
+       struct ctdb_req_control request;
+       struct public_ip_list *tmp_ip;
+
+       req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
+       if (req == NULL) {
+               return NULL;
+       }
+
+       state->num_sent = 0;
+       state->num_replies = 0;
+       state->num_fails = 0;
+       state->err_any = 0;
+       state->ban_credits = ban_credits;
+
+       /* Send a RELEASE_IP to all nodes that should not be hosting
+        * each IP.  For each IP, all but one of these will be
+        * redundant.  However, the redundant ones are used to tell
+        * nodes which node should be hosting the IP so that commands
+        * like "ctdb ip" can display a particular node's idea of who
+        * is hosting what. */
+       for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
+               struct release_ip_one_state *substate;
+               struct ctdb_public_ip ip;
+               int i;
+
+               substate = talloc_zero(state, struct release_ip_one_state);
+               if (tevent_req_nomem(substate, req)) {
+                       return tevent_req_post(req, ev);
+               }
+
+               substate->pnns = talloc_zero_array(substate, uint32_t, count);
+               if (tevent_req_nomem(substate->pnns, req)) {
+                       return tevent_req_post(req, ev);
+               }
+
+               substate->count = 0;
+               substate->req = req;
+
+               for (i = 0; i < count; i++) {
+                       uint32_t pnn = pnns[i];
+                       /* If pnn is not the node that should be
+                        * hosting the IP then add it to the list of
+                        * nodes that need to do a release. */
+                       if (tmp_ip->pnn != pnn) {
+                               substate->pnns[substate->count] = pnn;
+                               substate->count++;
+                       }
+               }
+
+               if (substate->count == 0) {
+                       /* No node has to release this address (for
+                        * example, the only node is the one hosting
+                        * it), so there is nothing to wait for. */
+                       TALLOC_FREE(substate);
+                       continue;
+               }
+
+               substate->ip_str  = ctdb_sock_addr_to_string(substate,
+                                                            &tmp_ip->addr);
+               if (tevent_req_nomem(substate->ip_str, req)) {
+                       return tevent_req_post(req, ev);
+               }
+
+               ip.pnn = tmp_ip->pnn;
+               ip.addr = tmp_ip->addr;
+               ctdb_req_control_release_ip(&request, &ip);
+               subreq = ctdb_client_control_multi_send(state, ev, client,
+                                                       substate->pnns,
+                                                       substate->count,
+                                                       timeout,/* cumulative */
+                                                       &request);
+               if (tevent_req_nomem(subreq, req)) {
+                       return tevent_req_post(req, ev);
+               }
+               tevent_req_set_callback(subreq, release_ip_done, substate);
+
+               state->num_sent++;
+       }
+
+       /* None sent, finished...  Without this the request would
+        * never complete, because completion is otherwise signalled
+        * only from release_ip_done(). */
+       if (state->num_sent == 0) {
+               tevent_req_done(req);
+               return tevent_req_post(req, ev);
+       }
+
+       return req;
+}
+
+/*
+ * Completion handler for the RELEASE_IP controls of one address.
+ *
+ * Logs the outcome, gives a ban credit to every node that reported an
+ * error and records the failure in the shared state.  Once replies
+ * for all addresses are in, the overall request is completed: with
+ * the recorded error if any address failed, successfully otherwise.
+ */
+static void release_ip_done(struct tevent_req *subreq)
+{
+       struct release_ip_one_state *one = tevent_req_callback_data(
+               subreq, struct release_ip_one_state);
+       struct tevent_req *req = one->req;
+       struct release_ip_state *state = tevent_req_data(
+               req, struct release_ip_state);
+       int *err_list;
+       int ret, idx;
+       bool ok;
+
+       ok = ctdb_client_control_multi_recv(subreq, &ret, state,
+                                           &err_list, NULL);
+       TALLOC_FREE(subreq);
+
+       if (ok) {
+               D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
+                      one->ip_str, one->count);
+       } else {
+               bool node_error_seen = false;
+
+               /* Report each failing node and charge it a ban
+                * credit */
+               for (idx = 0; idx < one->count; idx++) {
+                       int node_err = err_list[idx];
+                       uint32_t pnn;
+
+                       if (node_err == 0) {
+                               continue;
+                       }
+
+                       pnn = one->pnns[idx];
+
+                       D_ERR("RELEASE_IP %s failed on node %u, "
+                             "ret=%d\n", one->ip_str, pnn, node_err);
+
+                       state->ban_credits[pnn]++;
+                       state->err_any = node_err;
+                       node_error_seen = true;
+               }
+
+               /* The control failed but no node reported an error */
+               if (! node_error_seen) {
+                       D_ERR("RELEASE_IP %s internal error, ret=%d\n",
+                             one->ip_str, ret);
+                       state->err_any = EIO;
+               }
+
+               state->num_fails++;
+       }
+
+       talloc_free(one);
+
+       state->num_replies++;
+       if (state->num_replies < state->num_sent) {
+               /* Still waiting for other addresses */
+               return;
+       }
+
+       if (state->num_fails > 0) {
+               tevent_req_error(req, state->err_any);
+       } else {
+               tevent_req_done(req);
+       }
+}
+
+static bool release_ip_recv(struct tevent_req *req, int *perr)
+{
+       return generic_recv(req, perr);
+}
+
+/**********************************************************************/
+
+struct take_ip_state { /* state shared by all TAKEOVER_IP controls of one request */
+       int num_sent; /* number of controls sent */
+       int num_replies; /* number of controls that have completed */
+       int num_fails; /* number of controls that failed */
+       int err_any; /* error of the most recent failure; reported when num_fails > 0 */
+       uint32_t *ban_credits; /* caller's array, indexed by PNN, incremented for each node that fails */
+};
+
+struct take_ip_one_state { /* state of the TAKEOVER_IP control for one address */
+       struct tevent_req *req; /* the overall take_ip request */
+       uint32_t pnn; /* node the control was sent to */
+       const char *ip_str; /* printable form of the address, used in log messages */
+};
+
+static void take_ip_done(struct tevent_req *subreq);
+
+/*
+ * Send a TAKEOVER_IP control for every assigned address in all_ips to
+ * the node it is assigned to.  Addresses whose pnn is -1 are
+ * unassigned and are skipped.
+ *
+ * timeout is the absolute deadline applied to every control.
+ * ban_credits is indexed by PNN; take_ip_done() increments the entry
+ * of every node that fails.
+ *
+ * Returns NULL if the request cannot be allocated.  The request
+ * completes once every control has been answered, or immediately if
+ * none was sent.  Collect the result with take_ip_recv().
+ */
+static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
+                                      struct tevent_context *ev,
+                                      struct ctdb_client_context *client,
+                                      struct timeval timeout,
+                                      struct public_ip_list *all_ips,
+                                      uint32_t *ban_credits)
+{
+       struct tevent_req *req, *subreq;
+       struct take_ip_state *state;
+       struct ctdb_req_control request;
+       struct public_ip_list *cur;
+
+       req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
+       if (req == NULL) {
+               return NULL;
+       }
+
+       state->ban_credits = ban_credits;
+       state->num_fails = 0;
+       state->num_replies = 0;
+       state->num_sent = 0;
+
+       /* Most of these controls repeat an unchanged allocation, but
+        * sending them anyway lets nodes recover from any
+        * inconsistency. */
+       for (cur = all_ips; cur != NULL; cur = cur->next) {
+               struct take_ip_one_state *one;
+               struct ctdb_public_ip ip;
+
+               if (cur->pnn == -1) {
+                       /* Address stays unassigned */
+                       continue;
+               }
+
+               one = talloc_zero(state, struct take_ip_one_state);
+               if (tevent_req_nomem(one, req)) {
+                       return tevent_req_post(req, ev);
+               }
+
+               one->pnn = cur->pnn;
+               one->req = req;
+               one->ip_str = ctdb_sock_addr_to_string(one, &cur->addr);
+               if (tevent_req_nomem(one->ip_str, req)) {
+                       return tevent_req_post(req, ev);
+               }
+
+               ip.addr = cur->addr;
+               ip.pnn = cur->pnn;
+               ctdb_req_control_takeover_ip(&request, &ip);
+
+               subreq = ctdb_client_control_send(state, ev, client,
+                                                 cur->pnn,
+                                                 timeout, /* cumulative */
+                                                 &request);
+               if (tevent_req_nomem(subreq, req)) {
+                       return tevent_req_post(req, ev);
+               }
+               tevent_req_set_callback(subreq, take_ip_done, one);
+
+               state->num_sent++;
+       }
+
+       if (state->num_sent != 0) {
+               return req;
+       }
+
+       /* Nothing was sent, so nothing will call back: finish now */
+       tevent_req_done(req);
+       return tevent_req_post(req, ev);
+}
+
+/*
+ * Completion handler for one TAKEOVER_IP control.
+ *
+ * A control that could not be delivered, or whose reply reports an
+ * error, gives the target node a ban credit and records the failure.
+ * When the last outstanding reply arrives the overall request is
+ * completed: with the recorded error if anything failed, successfully
+ * otherwise.
+ */
+static void take_ip_done(struct tevent_req *subreq)
+{
+       struct take_ip_one_state *one = tevent_req_callback_data(
+               subreq, struct take_ip_one_state);
+       struct tevent_req *req = one->req;
+       struct take_ip_state *state = tevent_req_data(
+               req, struct take_ip_state);
+       struct ctdb_reply_control *reply;
+       int ret = 0;
+       bool ok;
+
+       ok = ctdb_client_control_recv(subreq, &ret, state, &reply);
+       TALLOC_FREE(subreq);
+
+       if (! ok) {
+               D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
+                     one->ip_str, one->pnn, ret);
+       } else {
+               ret = ctdb_reply_control_takeover_ip(reply);
+               if (ret == 0) {
+                       D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
+                              one->ip_str, one->pnn);
+               } else {
+                       D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
+                             one->ip_str, one->pnn, ret);
+                       ok = false;
+               }
+       }
+
+       if (! ok) {
+               state->ban_credits[one->pnn]++;
+               state->num_fails++;
+               state->err_any = ret;
+       }
+
+       talloc_free(one);
+
+       state->num_replies++;
+       if (state->num_replies < state->num_sent) {
+               /* Other controls are still outstanding */
+               return;
+       }
+
+       if (state->num_fails > 0) {
+               tevent_req_error(req, state->err_any);
+       } else {
+               tevent_req_done(req);
+       }
+}
+
+static bool take_ip_recv(struct tevent_req *req, int *perr)
+{
+       return generic_recv(req, perr);
+}
+
+/**********************************************************************/
+
+struct ipreallocated_state { /* state of one IPREALLOCATED request sent to several nodes */
+       uint32_t *pnns; /* PNNs of the target nodes; the caller's pointer is stored, not copied */
+       int count; /* number of entries in pnns */
+       uint32_t *ban_credits; /* caller's array, indexed by PNN, incremented for each node that fails */
+};
+
+static void ipreallocated_done(struct tevent_req *subreq);
+
+/*
+ * Send an IPREALLOCATED control to each node in pnns[0..count-1].
+ *
+ * timeout is the absolute deadline for the controls.  ban_credits is
+ * indexed by PNN; ipreallocated_done() increments the entry of every
+ * node that fails.  pnns is stored, not copied.
+ *
+ * Returns NULL if the request cannot be allocated.  Collect the
+ * result with ipreallocated_recv().
+ */
+static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
+                                            struct tevent_context *ev,
+                                            struct ctdb_client_context *client,
+                                            uint32_t *pnns,
+                                            int count,
+                                            struct timeval timeout,
+                                            uint32_t *ban_credits)
+{
+       struct ipreallocated_state *state;
+       struct ctdb_req_control ctl;
+       struct tevent_req *req;
+       struct tevent_req *multi;
+
+       req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
+       if (req == NULL) {
+               return NULL;
+       }
+
+       state->ban_credits = ban_credits;
+       state->count = count;
+       state->pnns = pnns;
+
+       ctdb_req_control_ipreallocated(&ctl);
+
+       multi = ctdb_client_control_multi_send(state, ev, client,
+                                              pnns, count,
+                                              timeout, /* cumulative */
+                                              &ctl);
+       if (tevent_req_nomem(multi, req)) {
+               return tevent_req_post(req, ev);
+       }
+
+       tevent_req_set_callback(multi, ipreallocated_done, req);
+       return req;
+}
+
+/*
+ * Completion handler for the IPREALLOCATED controls.
+ *
+ * On success the request is completed.  On failure every node that
+ * reported an error is logged and given a ban credit, and the request
+ * fails with the error returned by the multi-control.
+ */
+static void ipreallocated_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct ipreallocated_state *state = tevent_req_data(
+               req, struct ipreallocated_state);
+       int *err_list = NULL;
+       bool node_error_seen = false;
+       int ret, idx;
+       bool ok;
+
+       ok = ctdb_client_control_multi_recv(subreq, &ret, state,
+                                           &err_list, NULL);
+       TALLOC_FREE(subreq);
+
+       if (! ok) {
+               /* Report each failing node and charge it a ban
+                * credit */
+               for (idx = 0; idx < state->count; idx++) {
+                       int node_err = err_list[idx];
+                       uint32_t pnn;
+
+                       if (node_err == 0) {
+                               continue;
+                       }
+
+                       pnn = state->pnns[idx];
+
+                       D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
+                             pnn, node_err);
+
+                       state->ban_credits[pnn]++;
+                       node_error_seen = true;
+               }
+
+               /* The control failed but no node reported an error */
+               if (! node_error_seen) {
+                       D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
+               }
+
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
+       tevent_req_done(req);
+}
+
+static bool ipreallocated_recv(struct tevent_req *req, int *perr)
+{
+       return generic_recv(req, perr);
+}
+
+/**********************************************************************/
+
+/*
+ * Recalculate the allocation of public IPs to nodes and have the
+ * nodes host their allocated addresses.
+ *
+ * - Get tunables
+ * - Get nodemap
+ * - Initialise IP allocation state.  Pass:
+ *   + algorithm to be used;
+ *   + various tunables (NoIPTakeover, NoIPFailback, NoIPHostOnAllDisabled)
+ *   + list of nodes to force rebalance (internal structure, currently
+ *     no way to fetch, only used by LCP2 for nodes that have had new
+ *     IP addresses added).
+ * - Set IP flags for IP allocation based on node map
+ * - Retrieve known and available IP addresses (done separately so
+ *   values can be faked in unit testing)
+ * - Use ipalloc_set_public_ips() to set known and available IP
+ *   addresses for allocation
+ * - If cluster can't host IP addresses then jump to IPREALLOCATED
+ * - Run IP allocation algorithm
+ * - Send RELEASE_IP to all nodes for IPs they should not host
+ * - Send TAKEOVER_IP to all nodes for IPs they should host
+ * - Send IPREALLOCATED to all nodes
+ */
+
+struct takeover_state { /* state carried through all stages of one takeover run */
+       struct tevent_context *ev; /* event context given to takeover_send() */
+       struct ctdb_client_context *client; /* client context given to takeover_send() */
+       struct timeval timeout; /* absolute deadline, 3 * takeover_timeout seconds ahead, passed to the RELEASE_IP and TAKEOVER_IP stages */
+       int num_nodes; /* number of entries in the node map */
+       uint32_t *pnns_connected; /* result of list_of_connected_nodes() */
+       int num_connected; /* number of entries in pnns_connected */
+       uint32_t *pnns_active; /* result of list_of_active_nodes() */
+       int num_active; /* number of entries in pnns_active */
+       uint32_t destnode; /* ctdb_client_pnn(client); target of the tunables and node map controls */
+       uint32_t *force_rebalance_nodes; /* given to takeover_send(), passed on to ipalloc_state_init() */
+       struct ctdb_tunable_list *tun_list; /* parsed GET_ALL_TUNABLES reply */
+       struct ipalloc_state *ipalloc_state; /* created by ipalloc_state_init(); stays NULL when IP failover is disabled */
+       struct ctdb_public_ip_list *known_ips; /* result of the GET_PUBLIC_IPS round with available_only false */
+       struct public_ip_list *all_ips; /* allocation returned by ipalloc() */
+       uint32_t *ban_credits; /* num_nodes counters, indexed by PNN, incremented by the stages for failing nodes */
+};
+
+static void takeover_tunables_done(struct tevent_req *subreq);
+static void takeover_nodemap_done(struct tevent_req *subreq);
+static void takeover_known_ips_done(struct tevent_req *subreq);
+static void takeover_avail_ips_done(struct tevent_req *subreq);
+static void takeover_release_ip_done(struct tevent_req *subreq);
+static void takeover_take_ip_done(struct tevent_req *subreq);
+static void takeover_ipreallocated(struct tevent_req *req);
+static void takeover_ipreallocated_done(struct tevent_req *subreq);
+static void takeover_failed(struct tevent_req *subreq, int ret);
+static void takeover_failed_done(struct tevent_req *subreq);
+
+/*
+ * Start a takeover run.
+ *
+ * Stores the arguments in the request state and starts the first
+ * stage, a GET_ALL_TUNABLES control sent to the node reported by
+ * ctdb_client_pnn().  The remaining stages are chained from the
+ * completion handlers.
+ *
+ * Returns NULL if the request cannot be allocated.
+ */
+static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
+                                       struct tevent_context *ev,
+                                       struct ctdb_client_context *client,
+                                       uint32_t *force_rebalance_nodes)
+{
+       struct takeover_state *state;
+       struct ctdb_req_control ctl;
+       struct tevent_req *req;
+       struct tevent_req *tun_req;
+
+       req = tevent_req_create(mem_ctx, &state, struct takeover_state);
+       if (req == NULL) {
+               return NULL;
+       }
+
+       state->destnode = ctdb_client_pnn(client);
+       state->force_rebalance_nodes = force_rebalance_nodes;
+       state->client = client;
+       state->ev = ev;
+
+       ctdb_req_control_get_all_tunables(&ctl);
+
+       tun_req = ctdb_client_control_send(state, ev, client,
+                                          state->destnode, TIMEOUT(),
+                                          &ctl);
+       if (tevent_req_nomem(tun_req, req)) {
+               return tevent_req_post(req, ev);
+       }
+
+       tevent_req_set_callback(tun_req, takeover_tunables_done, req);
+       return req;
+}
+
+/*
+ * Completion handler for GET_ALL_TUNABLES.
+ *
+ * Stores the tunables in the request state, adopts the
+ * takeover_timeout tunable as the timeout for later controls and
+ * then requests the node map.
+ */
+static void takeover_tunables_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct takeover_state *state = tevent_req_data(
+               req, struct takeover_state);
+       struct ctdb_reply_control *reply;
+       struct ctdb_req_control ctl;
+       struct tevent_req *nodemap_req;
+       int ret;
+       bool ok;
+
+       ok = ctdb_client_control_recv(subreq, &ret, state, &reply);
+       TALLOC_FREE(subreq);
+       if (! ok) {
+               D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       ret = ctdb_reply_control_get_all_tunables(reply, state,
+                                                 &state->tun_list);
+       if (ret != 0) {
+               D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
+               tevent_req_error(req, ret);
+               return;
+       }
+       talloc_free(reply);
+
+       /* Must be updated before TIMEOUT() is evaluated below */
+       takeover_timeout = state->tun_list->takeover_timeout;
+
+       ctdb_req_control_get_nodemap(&ctl);
+       nodemap_req = ctdb_client_control_send(state, state->ev,
+                                              state->client,
+                                              state->destnode, TIMEOUT(),
+                                              &ctl);
+       if (tevent_req_nomem(nodemap_req, req)) {
+               return;
+       }
+
+       tevent_req_set_callback(nodemap_req, takeover_nodemap_done, req);
+}
+
+/*
+ * Completion handler for GET_NODEMAP.
+ *
+ * Derives the connected and active node lists, sets the default
+ * timeout and allocates the ban credit counters.  If IP failover is
+ * disabled it goes straight to the IPREALLOCATED stage.  Otherwise it
+ * initialises the IP allocation state and asks the active nodes for
+ * their known public IPs.
+ */
+static void takeover_nodemap_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct takeover_state *state = tevent_req_data(
+               req, struct takeover_state);
+       struct ctdb_tunable_list *tun = state->tun_list;
+       struct ctdb_reply_control *reply;
+       struct ctdb_node_map *nodemap;
+       struct tevent_req *ips_req;
+       int ret;
+       bool ok;
+
+       ok = ctdb_client_control_recv(subreq, &ret, state, &reply);
+       TALLOC_FREE(subreq);
+       if (! ok) {
+               D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
+                       state->destnode, ret);
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
+       if (ret != 0) {
+               D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       state->num_nodes = nodemap->num;
+
+       state->num_connected = list_of_connected_nodes(
+               nodemap, CTDB_UNKNOWN_PNN, state, &state->pnns_connected);
+       if (state->num_connected <= 0) {
+               tevent_req_error(req, ENOMEM);
+               return;
+       }
+
+       state->num_active = list_of_active_nodes(
+               nodemap, CTDB_UNKNOWN_PNN, state, &state->pnns_active);
+       if (state->num_active <= 0) {
+               tevent_req_error(req, ENOMEM);
+               return;
+       }
+
+       /* Timeout used when jumping early to IPREALLOCATED; the
+        * factor of 3 matches the cumulative timeout set up before
+        * the RELEASE_IP stage.
+        */
+       state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
+
+       state->ban_credits = talloc_zero_array(state, uint32_t,
+                                              state->num_nodes);
+       if (tevent_req_nomem(state->ban_credits, req)) {
+               return;
+       }
+
+       if (tun->disable_ip_failover != 0) {
+               /* With IP failover switched off there is nothing to
+                * allocate: only send the ipreallocated event.
+                */
+               takeover_ipreallocated(req);
+               return;
+       }
+
+       state->ipalloc_state = ipalloc_state_init(
+               state, state->num_nodes,
+               determine_algorithm(tun),
+               (tun->no_ip_takeover != 0),
+               (tun->no_ip_failback != 0),
+               (tun->no_ip_host_on_all_disabled != 0),
+               state->force_rebalance_nodes);
+       if (tevent_req_nomem(state->ipalloc_state, req)) {
+               return;
+       }
+
+       ipalloc_set_node_flags(state->ipalloc_state, nodemap);
+
+       /* First round: all known addresses (available_only false) */
+       ips_req = get_public_ips_send(state, state->ev, state->client,
+                                     state->pnns_active, state->num_active,
+                                     false);
+       if (tevent_req_nomem(ips_req, req)) {
+               return;
+       }
+
+       tevent_req_set_callback(ips_req, takeover_known_ips_done, req);
+}
+
+/*
+ * Completion handler for the "known IPs" GET_PUBLIC_IPS round.
+ *
+ * Stores the result in state->known_ips and starts the second round,
+ * which asks the active nodes for their available IPs only.
+ */
+static void takeover_known_ips_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct takeover_state *state = tevent_req_data(
+               req, struct takeover_state);
+       struct tevent_req *avail_req;
+       int ret;
+       bool ok;
+
+       ok = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
+       TALLOC_FREE(subreq);
+       if (! ok) {
+               D_ERR("Failed to fetch known public IPs\n");
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       /* Second round: available addresses only */
+       avail_req = get_public_ips_send(state, state->ev, state->client,
+                                       state->pnns_active,
+                                       state->num_active,
+                                       true);
+       if (tevent_req_nomem(avail_req, req)) {
+               return;
+       }
+
+       tevent_req_set_callback(avail_req, takeover_avail_ips_done, req);
+}
+
+/*
+ * Completion handler for the "available IPs" GET_PUBLIC_IPS round.
+ *
+ * Hands the known and available addresses to the IP allocation
+ * state.  If no node can host addresses it goes straight to the
+ * IPREALLOCATED stage.  Otherwise it runs the allocation algorithm
+ * and starts the RELEASE_IP stage.
+ */
+static void takeover_avail_ips_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct takeover_state *state = tevent_req_data(
+               req, struct takeover_state);
+       struct ctdb_public_ip_list *avail_ips;
+       struct tevent_req *release_req;
+       int ret;
+       bool ok;
+
+       ok = get_public_ips_recv(subreq, &ret, state, &avail_ips);
+       TALLOC_FREE(subreq);
+       if (! ok) {
+               D_ERR("Failed to fetch available public IPs\n");
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       ipalloc_set_public_ips(state->ipalloc_state,
+                              state->known_ips, avail_ips);
+
+       if (! ipalloc_can_host_ips(state->ipalloc_state)) {
+               D_NOTICE("No nodes available to host public IPs yet\n");
+               takeover_ipreallocated(req);
+               return;
+       }
+
+       /* Compute the new assignment of addresses to nodes */
+       state->all_ips = ipalloc(state->ipalloc_state);
+       if (tevent_req_nomem(state->all_ips, req)) {
+               return;
+       }
+
+       /* RELEASE_IP, TAKEOVER_IP and IPREALLOCATED each notionally
+        * get TakeoverTimeout seconds.  RELEASE_IP may need longer
+        * because it kills TCP connections, so the three stages share
+        * one cumulative deadline of 3 * TakeoverTimeout seconds
+        * instead.  No expiry check is needed between stages: tevent
+        * fires a timeout even when it is already in the past.  The
+        * deadline is set here, after ipalloc(), so that the time
+        * spent computing the allocation does not count against it.
+        */
+       state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
+
+       release_req = release_ip_send(state, state->ev, state->client,
+                                     state->pnns_connected,
+                                     state->num_connected,
+                                     state->timeout, state->all_ips,
+                                     state->ban_credits);
+       if (tevent_req_nomem(release_req, req)) {
+               return;
+       }
+
+       tevent_req_set_callback(release_req, takeover_release_ip_done, req);
+}
+
+/*
+ * Completion callback for the RELEASE_IP stage.
+ *
+ * A failure is passed to takeover_failed().  On success the
+ * TAKEOVER_IP stage is started with the same deadline and banning
+ * credit array.
+ */
+static void takeover_release_ip_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct takeover_state *state = tevent_req_data(
+               req, struct takeover_state);
+       bool ok;
+       int err;
+
+       ok = release_ip_recv(subreq, &err);
+       TALLOC_FREE(subreq);
+       if (! ok) {
+               takeover_failed(req, err);
+               return;
+       }
+
+       /* Releases are complete, move on to the takeovers */
+       subreq = take_ip_send(
+               state, state->ev, state->client,
+               state->timeout, state->all_ips, state->ban_credits);
+       if (tevent_req_nomem(subreq, req)) {
+               return;
+       }
+       tevent_req_set_callback(subreq, takeover_take_ip_done, req);
+}
+
+/*
+ * Completion callback for the TAKEOVER_IP stage.
+ *
+ * A failure is passed to takeover_failed(); success leads on to the
+ * IPREALLOCATED stage.
+ */
+static void takeover_take_ip_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       int err = 0;
+       bool ok;
+
+       ok = take_ip_recv(subreq, &err);
+       TALLOC_FREE(subreq);
+       if (ok) {
+               takeover_ipreallocated(req);
+               return;
+       }
+
+       takeover_failed(req, err);
+}
+
+/*
+ * Start the IPREALLOCATED stage on the connected nodes.
+ *
+ * Completion is handled by takeover_ipreallocated_done().
+ */
+static void takeover_ipreallocated(struct tevent_req *req)
+{
+       struct tevent_req *subreq;
+       struct takeover_state *state = tevent_req_data(
+               req, struct takeover_state);
+
+       subreq = ipreallocated_send(
+               state, state->ev, state->client,
+               state->pnns_connected, state->num_connected,
+               state->timeout, state->ban_credits);
+       if (tevent_req_nomem(subreq, req)) {
+               return;
+       }
+       tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
+}
+
+/*
+ * Completion callback for the IPREALLOCATED stage.
+ *
+ * This is the last stage: success completes the takeover request, a
+ * failure is passed to takeover_failed().
+ */
+static void takeover_ipreallocated_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       bool ok;
+       int err;
+
+       ok = ipreallocated_recv(subreq, &err);
+       TALLOC_FREE(subreq);
+       if (ok) {
+               tevent_req_done(req);
+               return;
+       }
+
+       takeover_failed(req, err);
+}
+
+/*
+ * Context carried from takeover_failed() to takeover_failed_done()
+ * while the banning message is in flight.
+ */
+struct takeover_failed_state {
+       /* The takeover request that is to be failed */
+       struct tevent_req *req;
+       /* The error code the takeover request will be failed with */
+       int ret;
+};
+
+/*
+ * Fail the takeover request with error ret, first trying to assign
+ * banning credits to the worst offending node.
+ *
+ * The request is failed straight away if the enable_bans tunable is 0
+ * or if no node has accumulated any banning credits.  Otherwise a
+ * CTDB_SRVID_BANNING message is sent, naming the node with the most
+ * credits (the lowest numbered such node on a tie), and the request
+ * is failed from takeover_failed_done() once the send has completed.
+ */
+void takeover_failed(struct tevent_req *req, int ret)
+{
+       struct takeover_state *state = tevent_req_data(
+               req, struct takeover_state);
+       struct takeover_failed_state *substate;
+       struct ctdb_req_message message;
+       struct tevent_req *subreq;
+       uint32_t culprit = CTDB_UNKNOWN_PNN;
+       int highest = 0;
+       int i;
+
+       /* Nothing more to do when bans are switched off */
+       if (state->tun_list->enable_bans == 0) {
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       /* Find the node holding the most banning credits */
+       for (i = 0; i < state->num_nodes; i++) {
+               if (state->ban_credits[i] <= highest) {
+                       continue;
+               }
+               highest = state->ban_credits[i];
+               culprit = i;
+       }
+
+       /* No node to blame */
+       if (highest <= 0) {
+               tevent_req_error(req, ret);
+               return;
+       }
+
+       D_WARNING("Assigning banning credits to node %u\n", culprit);
+
+       /* Remember the request and error code for the callback */
+       substate = talloc_zero(state, struct takeover_failed_state);
+       if (tevent_req_nomem(substate, req)) {
+               return;
+       }
+       substate->req = req;
+       substate->ret = ret;
+
+       message.srvid = CTDB_SRVID_BANNING;
+       message.data.pnn = culprit;
+
+       subreq = ctdb_client_message_send(
+               state, state->ev, state->client,
+               ctdb_client_pnn(state->client), &message);
+       if (subreq == NULL) {
+               D_ERR("failed to assign banning credits\n");
+               tevent_req_error(req, ret);
+               return;
+       }
+       tevent_req_set_callback(subreq, takeover_failed_done, substate);
+}
+
+/*
+ * Completion callback for the banning message sent by
+ * takeover_failed().
+ *
+ * A failure to send the message is only logged.  Either way the
+ * takeover request is failed with the error code that was saved when
+ * the failure was first detected.
+ */
+static void takeover_failed_done(struct tevent_req *subreq)
+{
+       struct takeover_failed_state *substate = tevent_req_callback_data(
+               subreq, struct takeover_failed_state);
+       struct tevent_req *req = substate->req;
+       int saved_ret = substate->ret;
+       int msg_ret;
+       bool sent;
+
+       sent = ctdb_client_message_recv(subreq, &msg_ret);
+       TALLOC_FREE(subreq);
+       if (! sent) {
+               D_ERR("failed to assign banning credits, ret=%d\n", msg_ret);
+       }
+
+       talloc_free(substate);
+       tevent_req_error(req, saved_ret);
+}
+
+/*
+ * Collect the result of a takeover request.
+ *
+ * Delegates to generic_recv(), passing perr through unchanged.  The
+ * caller in main() treats a non-zero *perr as failure.
+ */
+static void takeover_recv(struct tevent_req *req, int *perr)
+{
+       generic_recv(req, perr);
+}
+
+/*
+ * Parse a comma-separated list of node numbers.
+ *
+ * mem_ctx: talloc context that owns the returned array
+ * s:       the list to parse, e.g. "0,2,5"
+ *
+ * Returns a talloc'd array with one uint32_t per list item, or NULL
+ * if memory allocation fails or an item is not a plain decimal number
+ * that fits in a uint32_t.  The temporary split copy of the string is
+ * freed before returning, so nothing but the result is left on
+ * mem_ctx.
+ */
+static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
+{
+       char *strv = NULL;
+       int num, i, ret;
+       char *t;
+       uint32_t *nodes;
+
+       ret = strv_split(mem_ctx, &strv, s, ",");
+       if (ret != 0) {
+               D_ERR("out of memory\n");
+               return NULL;
+       }
+
+       num = strv_count(strv);
+
+       nodes = talloc_array(mem_ctx, uint32_t, num);
+       if (nodes == NULL) {
+               D_ERR("out of memory\n");
+               TALLOC_FREE(strv);
+               return NULL;
+       }
+
+       t = NULL;
+       for (i = 0; i < num; i++) {
+               char *endptr = NULL;
+               unsigned long long val;
+
+               t = strv_next(strv, t);
+
+               /*
+                * Insist on a leading digit: strtoull() on its own
+                * would accept leading whitespace and a sign, and
+                * would quietly negate "-1" into a huge value.
+                */
+               if (t[0] < '0' || t[0] > '9') {
+                       goto invalid;
+               }
+
+               errno = 0;
+               val = strtoull(t, &endptr, 10);
+               if (errno != 0 || *endptr != '\0' || val > UINT32_MAX) {
+                       goto invalid;
+               }
+
+               nodes[i] = (uint32_t)val;
+       }
+
+       TALLOC_FREE(strv);
+       return nodes;
+
+invalid:
+       D_ERR("Invalid node \"%s\" in node list\n", t);
+       TALLOC_FREE(nodes);
+       TALLOC_FREE(strv);
+       return NULL;
+}
+
+/*
+ * Print a command-line synopsis to stderr.
+ *
+ * progname: name to show for the program, main() passes argv[0]
+ */
+static void usage(const char *progname)
+{
+       fprintf(stderr,
+               "\nUsage: %s <output-fd> <ctdb-socket-path> "
+               "[<force-rebalance-nodes>]\n",
+               progname);
+}
+
+/*
+ * Arguments - write fd, socket path and, optionally, a comma-separated
+ * list of nodes to force rebalancing on
+ *
+ * Once the arguments have been counted, every exit path writes the
+ * integer result (0 on success) to the write fd, so that the parent
+ * always receives a status.  The same value is used as the exit code.
+ */
+int main(int argc, const char *argv[])
+{
+       int write_fd;
+       const char *sockpath;
+       TALLOC_CTX *mem_ctx;
+       struct tevent_context *ev;
+       struct ctdb_client_context *client;
+       int ret;
+       struct tevent_req *req;
+       uint32_t *force_rebalance_nodes = NULL;
+
+       if (argc < 3 || argc > 4) {
+               usage(argv[0]);
+               exit(1);
+       }
+
+       write_fd = atoi(argv[1]);
+       sockpath = argv[2];
+
+       mem_ctx = talloc_new(NULL);
+       if (mem_ctx == NULL) {
+               fprintf(stderr, "talloc_new() failed\n");
+               ret = ENOMEM;
+               goto done;
+       }
+
+       if (argc == 4) {
+               force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
+               if (force_rebalance_nodes == NULL) {
+                       usage(argv[0]);
+                       /*
+                        * Report this on write_fd like every other
+                        * failure instead of exiting without telling
+                        * the parent anything.
+                        */
+                       ret = EINVAL;
+                       goto done;
+               }
+       }
+
+       logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
+
+       ev = tevent_context_init(mem_ctx);
+       if (ev == NULL) {
+               D_ERR("tevent_context_init() failed\n");
+               ret = ENOMEM;
+               goto done;
+       }
+
+       ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
+       if (ret != 0) {
+               D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
+               goto done;
+       }
+
+       req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
+       if (req == NULL) {
+               D_ERR("takeover_send() failed\n");
+               ret = 1;
+               goto done;
+       }
+
+       /* Run the event loop until the takeover request completes */
+       if (! tevent_req_poll(req, ev)) {
+               D_ERR("tevent_req_poll() failed\n");
+               ret = 1;
+               goto done;
+       }
+
+       takeover_recv(req, &ret);
+       TALLOC_FREE(req);
+       if (ret != 0) {
+               D_ERR("takeover run failed, ret=%d\n", ret);
+       }
+
+done:
+       /* Hand the result back to the parent process */
+       sys_write_v(write_fd, &ret, sizeof(ret));
+
+       talloc_free(mem_ctx);
+       return ret;
+}
index 6bc9bfd8339da31104187da7e50cbf05780bc1e4..66959cde0915062bbeebd7442ecc27c27081a6c8 100644 (file)
@@ -512,6 +512,12 @@ def build(bld):
                              samba-util sys_rw replace tdb''',
                      install_path='${CTDB_HELPER_BINDIR}')
 
+    bld.SAMBA_BINARY('ctdb_takeover_helper',
+                     source='server/ctdb_takeover_helper.c',
+                     deps='''ctdb-client2 ctdb-protocol ctdb-util
+                             samba-util sys_rw replace ctdb-ipalloc popt''',
+                     install_path='${CTDB_HELPER_BINDIR}')
+
     bld.SAMBA_BINARY('ctdb_mutex_fcntl_helper',
                      source='server/ctdb_mutex_fcntl_helper.c',
                      deps='sys_rw ctdb-system',