ctdb-takeover: Assign banning credits on failure to fetch public IPs
[sfrench/samba-autobuild/.git] / ctdb / server / ctdb_takeover_helper.c
index 3057528c8bbba29d099817b2e30d65ebff1be7e1..00b83c3687738fa9c813b40f60df811beb8300b3 100644 (file)
@@ -88,11 +88,10 @@ determine_algorithm(const struct ctdb_tunable_list *tunables)
 /**********************************************************************/
 
 struct get_public_ips_state {
-       struct tevent_context *ev;
-       struct ctdb_client_context *client;
        uint32_t *pnns;
        int count;
        struct ctdb_public_ip_list *ips;
+       uint32_t *ban_credits;
 };
 
 static void get_public_ips_done(struct tevent_req *subreq);
@@ -102,7 +101,8 @@ static struct tevent_req *get_public_ips_send(
                                struct tevent_context *ev,
                                struct ctdb_client_context *client,
                                uint32_t *pnns,
-                               int count,
+                               int count, int num_nodes,
+                               uint32_t *ban_credits,
                                bool available_only)
 {
        struct tevent_req *req, *subreq;
@@ -116,7 +116,20 @@ static struct tevent_req *get_public_ips_send(
 
        state->pnns = pnns;
        state->count = count;
-       state->ips = NULL;
+       state->ban_credits = ban_credits;
+
+       state->ips  = talloc_zero_array(state,
+                                       struct ctdb_public_ip_list,
+                                       num_nodes);
+       if (tevent_req_nomem(state->ips, req)) {
+               return tevent_req_post(req, ev);
+       }
+
+       /* Short circuit if no nodes being asked for IPs */
+       if (state->count == 0) {
+               tevent_req_done(req);
+               return tevent_req_post(req, ev);
+       }
 
        ctdb_req_control_get_public_ips(&request, available_only);
        subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
@@ -140,35 +153,30 @@ static void get_public_ips_done(struct tevent_req *subreq)
        struct ctdb_reply_control **reply;
        int *err_list;
        int ret, i;
-       bool status;
+       bool status, found_errors;
 
        status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
                                                &reply);
        TALLOC_FREE(subreq);
        if (! status) {
-               int ret2;
-               uint32_t pnn;
+               found_errors = false;
+               for (i = 0; i < state->count; i++) {
+                       if (err_list[i] != 0) {
+                               uint32_t pnn = state->pnns[i];
 
-               ret2 = ctdb_client_control_multi_error(state->pnns,
-                                                      state->count,
-                                                      err_list, &pnn);
-               if (ret2 != 0) {
-                       D_ERR("control GET_PUBLIC_IPS failed on "
-                             "node %u, ret=%d\n", pnn, ret2);
-               } else {
-                       D_ERR("control GET_PUBLIC_IPS failed, "
-                             "ret=%d\n", ret);
+                               D_ERR("control GET_PUBLIC_IPS failed on "
+                                     "node %u, ret=%d\n", pnn, err_list[i]);
+
+                               state->ban_credits[pnn]++;
+                               found_errors = true;
+                       }
                }
-               tevent_req_error(req, ret);
-               return;
-       }
 
-       state->ips = talloc_zero_array(state, struct ctdb_public_ip_list,
-                                      state->count);
-       if (tevent_req_nomem(state->ips, req)) {
+               tevent_req_error(req, ret);
                return;
        }
 
+       found_errors = false;
        for (i = 0; i < state->count; i++) {
                uint32_t pnn;
                struct ctdb_public_ip_list *ips;
@@ -179,12 +187,20 @@ static void get_public_ips_done(struct tevent_req *subreq)
                if (ret != 0) {
                        D_ERR("control GET_PUBLIC_IPS failed on "
                              "node %u\n", pnn);
-                       tevent_req_error(req, EIO);
-                       return;
+                       state->ban_credits[pnn]++;
+                       found_errors = true;
+                       continue;
                }
+
+               D_INFO("Fetched public IPs from node %u\n", pnn);
                state->ips[pnn] = *ips;
        }
 
+       if (found_errors) {
+               tevent_req_error(req, EIO);
+               return;
+       }
+
        talloc_free(reply);
 
        tevent_req_done(req);
@@ -294,6 +310,12 @@ static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
                        }
                }
 
+               if (substate->count == 0) {
+                       /* No releases to send for this address... */
+                       TALLOC_FREE(substate);
+                       continue;
+               }
+
                ip.pnn = tmp_ip->pnn;
                ip.addr = tmp_ip->addr;
                ctdb_req_control_release_ip(&request, &ip);
@@ -310,6 +332,12 @@ static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
                state->num_sent++;
        }
 
+       /* None sent, finished... */
+       if (state->num_sent == 0) {
+               tevent_req_done(req);
+               return tevent_req_post(req, ev);
+       }
+
        return req;
 }
 
@@ -832,6 +860,7 @@ static void takeover_nodemap_done(struct tevent_req *subreq)
 
        subreq = get_public_ips_send(state, state->ev, state->client,
                                     state->pnns_active, state->num_active,
+                                    state->num_nodes, state->ban_credits,
                                     false);
        if (tevent_req_nomem(subreq, req)) {
                return;
@@ -848,18 +877,39 @@ static void takeover_known_ips_done(struct tevent_req *subreq)
                req, struct takeover_state);
        int ret;
        bool status;
+       uint32_t *pnns = NULL;
+       int count, i;
 
        status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
        TALLOC_FREE(subreq);
 
        if (! status) {
                D_ERR("Failed to fetch known public IPs\n");
-               tevent_req_error(req, ret);
+               takeover_failed(req, ret);
                return;
        }
 
+       /* Get available IPs from active nodes that actually have known IPs */
+
+       pnns = talloc_zero_array(state, uint32_t, state->num_active);
+       if (tevent_req_nomem(pnns, req)) {
+               return;
+       }
+
+       count = 0;
+       for (i = 0; i < state->num_active; i++) {
+               uint32_t pnn = state->pnns_active[i];
+
+               /* If pnn has IPs then fetch available IPs from it */
+               if (state->known_ips[pnn].num > 0) {
+                       pnns[count] = pnn;
+                       count++;
+               }
+       }
+
        subreq = get_public_ips_send(state, state->ev, state->client,
-                                    state->pnns_active, state->num_active,
+                                    pnns, count,
+                                    state->num_nodes, state->ban_credits,
                                     true);
        if (tevent_req_nomem(subreq, req)) {
                return;
@@ -883,7 +933,7 @@ static void takeover_avail_ips_done(struct tevent_req *subreq)
 
        if (! status) {
                D_ERR("Failed to fetch available public IPs\n");
-               tevent_req_error(req, ret);
+               takeover_failed(req, ret);
                return;
        }
 
@@ -1160,11 +1210,17 @@ int main(int argc, const char *argv[])
                force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
                if (force_rebalance_nodes == NULL) {
                        usage(argv[0]);
-                       exit(1);
+                       ret = EINVAL;
+                       goto done;
                }
        }
 
-       logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
+       ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
+       if (ret != 0) {
+               fprintf(stderr,
+                       "ctdb-takeover: Unable to initialize logging\n");
+               goto done;
+       }
 
        ev = tevent_context_init(mem_ctx);
        if (ev == NULL) {