recoverd: Remove an orphaned comment
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
index a071861a82f71354602e48bef4796283ccdf69f8..130df8a3484f775c7ddffc80be498449a1dc7501 100644 (file)
@@ -861,7 +861,7 @@ static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr
                                        (unsigned)client->pid,
                                        ctdb_addr_to_str(addr),
                                        ip->client_id));
-                               ctdb_kill(ctdb, client->pid, SIGKILL);
+                               kill(client->pid, SIGKILL);
                        }
                }
        }
@@ -1309,6 +1309,12 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
        return 0;
 }
 
+struct ctdb_public_ip_list { /* One public IP entry; entries chain together through 'next'. */
+       struct ctdb_public_ip_list *next; /* link to another entry of the same type */
+       uint32_t pnn; /* node number the address is assigned to; verify_remote_ip_allocation() compares it with what remote nodes report */
+       ctdb_sock_addr addr; /* the address this entry describes */
+};
+
 /* Given a physical node, return the number of
    public addresses that is currently assigned to this node.
 */
@@ -2622,6 +2628,40 @@ static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
        }
 }
 
+struct takeover_callback_data { /* State handed to takeover_run_fail_callback() as its callback_data. */
+       bool *node_failed; /* one flag per node, indexed like nodemap->nodes[]; set once a failure has been forwarded */
+       client_async_callback fail_callback; /* wrapped callback, invoked only for a node's first failure */
+       void *fail_callback_data; /* opaque argument passed through to fail_callback */
+       struct ctdb_node_map *nodemap; /* used to translate a PNN into an index into node_failed */
+};
+
+static void takeover_run_fail_callback(struct ctdb_context *ctdb,
+                                      uint32_t node_pnn, int32_t res,
+                                      TDB_DATA outdata, void *callback_data)
+{ /* Relay a node's failure to cd->fail_callback, at most once per node. */
+       struct takeover_callback_data *cd =
+               talloc_get_type_abort(callback_data,
+                                     struct takeover_callback_data);
+       int i;
+
+       for (i = 0; i < cd->nodemap->num; i++) { /* locate node_pnn's slot in the node map */
+               if (node_pnn == cd->nodemap->nodes[i].pnn) {
+                       break;
+               }
+       }
+
+       if (i == cd->nodemap->num) { /* loop ran off the end: PNN is not in the node map */
+               DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
+               return;
+       }
+
+       if (!cd->node_failed[i]) { /* later failures of the same node are swallowed */
+               cd->node_failed[i] = true;
+               cd->fail_callback(ctdb, node_pnn, res, outdata,
+                                 cd->fail_callback_data);
+       }
+}
+
 /*
   make any IP alias changes for public addresses that are necessary 
  */
@@ -2640,6 +2680,7 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        uint32_t disable_timeout;
        struct ctdb_ipflags *ipflags;
+       struct takeover_callback_data *takeover_data;
        struct iprealloc_callback_data iprealloc_data;
        bool *retry_data;
 
@@ -2663,10 +2704,6 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
        /* Do the IP reassignment calculations */
        ctdb_takeover_run_core(ctdb, ipflags, &all_ips);
 
-       /* The IP flags need to be cleared because they should never
-        * be seen outside the IP allocation code.
-        */
-
        /* The recovery daemon does regular sanity checks of the IPs.
         * However, sometimes it is overzealous and thinks changes are
         * required when they're already underway.  This stops the
@@ -2680,14 +2717,25 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
                DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
        }
 
-       /* now tell all nodes to delete any alias that they should not
-          have.  This will be a NOOP on nodes that don't currently
-          hold the given alias */
+       /* Now tell all nodes to release any public IPs they should not
+        * host.  This will be a NOOP on nodes that don't currently
+        * hold the given IP.
+        */
+       takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
+
+       takeover_data->node_failed = talloc_zero_array(tmp_ctx,
+                                                      bool, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
+       takeover_data->fail_callback = fail_callback;
+       takeover_data->fail_callback_data = callback_data;
+       takeover_data->nodemap = nodemap;
+
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
 
-       async_data->fail_callback = fail_callback;
-       async_data->callback_data = callback_data;
+       async_data->fail_callback = takeover_run_fail_callback;
+       async_data->callback_data = takeover_data;
 
        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */
@@ -3211,6 +3259,7 @@ void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
 void ctdb_release_all_ips(struct ctdb_context *ctdb)
 {
        struct ctdb_vnn *vnn;
+       int count = 0;
 
        for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
                if (!ctdb_sys_have_ip(&vnn->public_address)) {
@@ -3220,13 +3269,22 @@ void ctdb_release_all_ips(struct ctdb_context *ctdb)
                if (!vnn->iface) {
                        continue;
                }
+
+               DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
+                                   ctdb_addr_to_str(&vnn->public_address),
+                                   vnn->public_netmask_bits,
+                                   ctdb_vnn_iface_string(vnn)));
+
                ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
                                  ctdb_vnn_iface_string(vnn),
                                  ctdb_addr_to_str(&vnn->public_address),
                                  vnn->public_netmask_bits);
                release_kill_clients(ctdb, &vnn->public_address);
                ctdb_vnn_unassign_iface(ctdb, vnn);
+               count++;
        }
+
+       DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
 }
 
 
@@ -4257,7 +4315,9 @@ int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
    node has the expected ip allocation.
    This is verified against ctdb->ip_tree
 */
-int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
+int verify_remote_ip_allocation(struct ctdb_context *ctdb,
+                               struct ctdb_all_public_ips *ips,
+                               uint32_t pnn)
 {
        struct ctdb_public_ip_list *tmp_ip; 
        int i;
@@ -4275,7 +4335,7 @@ int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_publi
        for (i=0; i<ips->num; i++) {
                tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
                if (tmp_ip == NULL) {
-                       DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
+                       DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
                        return -1;
                }
 
@@ -4284,7 +4344,11 @@ int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_publi
                }
 
                if (tmp_ip->pnn != ips->ips[i].pnn) {
-                       DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
+                       DEBUG(DEBUG_ERR,
+                             ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
+                              pnn,
+                              ctdb_addr_to_str(&ips->ips[i].addr),
+                              ips->ips[i].pnn, tmp_ip->pnn));
                        return -1;
                }
        }
@@ -4370,6 +4434,8 @@ static int ctdb_reloadips_child(struct ctdb_context *ctdb)
        struct ctdb_vnn *vnn;
        int i, ret;
 
+       CTDB_NO_MEMORY(ctdb, mem_ctx);
+
        /* read the ip allocation from the local node */
        ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
        if (ret != 0) {
@@ -4384,7 +4450,7 @@ static int ctdb_reloadips_child(struct ctdb_context *ctdb)
                DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
                talloc_free(mem_ctx);
                return -1;
-       }               
+       }
 
 
        /* check the previous list of ips and scan for ips that have been
@@ -4408,6 +4474,7 @@ static int ctdb_reloadips_child(struct ctdb_context *ctdb)
 
                        ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
                        if (ret != 0) {
+                               talloc_free(mem_ctx);
                                DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
                                return -1;
                        }
@@ -4423,15 +4490,15 @@ static int ctdb_reloadips_child(struct ctdb_context *ctdb)
                        }
                }
                if (i == ips->num) {
-                       struct ctdb_control_ip_iface pub;
+                       struct ctdb_control_ip_iface *pub;
                        const char *ifaces = NULL;
                        int iface = 0;
 
                        DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
 
-                       pub.addr  = vnn->public_address;
-                       pub.mask  = vnn->public_netmask_bits;
-
+                       pub = talloc_zero(mem_ctx, struct ctdb_control_ip_iface);
+                       pub->addr  = vnn->public_address;
+                       pub->mask  = vnn->public_netmask_bits;
 
                        ifaces = vnn->ifaces[0];
                        iface = 1;
@@ -4439,17 +4506,27 @@ static int ctdb_reloadips_child(struct ctdb_context *ctdb)
                                ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
                                iface++;
                        }
-                       pub.len   = strlen(ifaces)+1;
-                       memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
+                       pub->len   = strlen(ifaces)+1;
+                       pub = talloc_realloc_size(mem_ctx, pub,
+                               offsetof(struct ctdb_control_ip_iface, iface) + pub->len);
+                       if (pub == NULL) {
+                               DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory\n"));
+                               talloc_free(mem_ctx);
+                               return -1;
+                       }
+                       memcpy(&pub->iface[0], ifaces, pub->len);
 
-                       ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
+                       ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(),
+                                                     CTDB_CURRENT_NODE, pub);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
+                               talloc_free(mem_ctx);
                                return -1;
                        }
                }
        }
 
+       talloc_free(mem_ctx);
        return 0;
 }
 
@@ -4495,6 +4572,7 @@ int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_re
                close(h->fd[0]);
                debug_extra = talloc_asprintf(NULL, "reloadips:");
 
+               ctdb_set_process_name("ctdb_reloadips");
                if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
                        DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
                        res = -1;