ctdb-ipalloc: ctdb_takeover_run_core() takes ipalloc_state
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
index 6462de8aa4348c8386e59f790013be6ba9e2ffe7..d99f9aa08e0a4ab57c2d34ec3d2297eddb6cd1a6 100644 (file)
 #include "lib/util/dlinklist.h"
 #include "lib/util/debug.h"
 #include "lib/util/samba_util.h"
+#include "lib/util/util_process.h"
 
 #include "ctdb_private.h"
 #include "ctdb_client.h"
-#include "ctdb_logging.h"
 
 #include "common/rb_tree.h"
 #include "common/reqid.h"
 #include "common/system.h"
 #include "common/common.h"
+#include "common/logging.h"
 
 
 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
 struct ctdb_ipflags {
        bool noiptakeover; /* set by set_ipflags_internal(); per its header comment, from the per-node NoIPTakeover tunable */
        bool noiphost; /* node must not host public IPs; set e.g. for NODE_FLAGS_INACTIVE nodes */
-       enum ctdb_runstate runstate;
+};
+
+enum ipalloc_algorithm {
+       IPALLOC_DETERMINISTIC, /* tunable deterministic_public_ips == 1 (and LCP2 not selected); runs ip_alloc_deterministic_ips() */
+       IPALLOC_NONDETERMINISTIC, /* fallback when neither tunable is 1; runs ip_alloc_nondeterministic_ips() */
+       IPALLOC_LCP2, /* tunable lcp2_public_ip_assignment == 1, checked first; runs ip_alloc_lcp2() */
+};
+
+struct ipalloc_state {
+       uint32_t num; /* number of slots in each per-node array; set from ctdb->num_nodes */
+
+       /* Per-node arrays of num entries, indexed by PNN.  A slot stays
+        * NULL until a list has been fetched for that node (inactive
+        * nodes are never queried). */
+       struct ctdb_public_ip_list_old **known_public_ips; /* fetched with flags 0 */
+       struct ctdb_public_ip_list_old **available_public_ips; /* fetched with CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE */
+
+       enum ipalloc_algorithm algorithm; /* chosen once in ipalloc_state_init() from the tunables */
+       uint32_t no_ip_failback; /* copy of ctdb->tunable.no_ip_failback; 1 skips the failback/rebalance step */
 };
 
 struct ctdb_interface {
@@ -261,6 +278,11 @@ static bool ctdb_vnn_available(struct ctdb_context *ctdb,
 {
        int i;
 
+       /* Nodes that are not RUNNING can not host IPs */
+       if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
+               return false;
+       }
+
        if (vnn->delete_pending) {
                return false;
        }
@@ -1237,8 +1259,7 @@ struct public_ip_list {
 /* Given a physical node, return the number of
    public addresses that is currently assigned to this node.
 */
-static int node_ip_coverage(struct ctdb_context *ctdb, int32_t pnn,
-                           struct public_ip_list *ips)
+static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
 {
        int num=0;
 
@@ -1254,7 +1275,8 @@ static int node_ip_coverage(struct ctdb_context *ctdb, int32_t pnn,
 /* Can the given node host the given IP: is the public IP known to the
  * node and is NOIPHOST unset?
 */
-static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
+static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
+                            int32_t pnn,
                             struct ctdb_ipflags ipflags,
                             struct public_ip_list *ip)
 {
@@ -1265,7 +1287,7 @@ static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
                return false;
        }
 
-       public_ips = ctdb->nodes[pnn]->available_public_ips;
+       public_ips = ipalloc_state->available_public_ips[pnn];
 
        if (public_ips == NULL) {
                return false;
@@ -1281,7 +1303,8 @@ static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
        return false;
 }
 
-static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
+static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+                                int32_t pnn,
                                 struct ctdb_ipflags ipflags,
                                 struct public_ip_list *ip)
 {
@@ -1289,14 +1312,14 @@ static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
                return false;
        }
 
-       return can_node_host_ip(ctdb, pnn, ipflags, ip);
+       return can_node_host_ip(ipalloc_state, pnn, ipflags, ip);
 }
 
 /* search the node lists list for a node to takeover this ip.
    pick the node that currently are serving the least number of ips
    so that the ips get spread out evenly.
 */
-static int find_takeover_node(struct ctdb_context *ctdb,
+static int find_takeover_node(struct ipalloc_state *ipalloc_state,
                              struct ctdb_ipflags *ipflags,
                              struct public_ip_list *ip,
                              struct public_ip_list *all_ips)
@@ -1308,12 +1331,12 @@ static int find_takeover_node(struct ctdb_context *ctdb,
        pnn    = -1;
        for (i=0; i<numnodes; i++) {
                /* verify that this node can serve this ip */
-               if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
+               if (!can_node_takeover_ip(ipalloc_state, i, ipflags[i], ip)) {
                        /* no it couldnt   so skip to the next node */
                        continue;
                }
 
-               num = node_ip_coverage(ctdb, i, all_ips);
+               num = node_ip_coverage(i, all_ips);
                /* was this the first node we checked ? */
                if (pnn == -1) {
                        pnn = i;
@@ -1324,7 +1347,7 @@ static int find_takeover_node(struct ctdb_context *ctdb,
                                min  = num;
                        }
                }
-       }       
+       }
        if (pnn == -1) {
                DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
                        ctdb_addr_to_str(&ip->addr)));
@@ -1388,21 +1411,80 @@ static int getips_count_callback(void *param, void *data)
        return 0;
 }
 
+static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
+                                      struct ctdb_public_ip_list_old *ips,
+                                      uint32_t pnn);
+
+/*
+ * Fetch the public IP lists of every active node into ipalloc_state.
+ *
+ * For each node in nodemap that is not flagged NODE_FLAGS_INACTIVE the
+ * node's known public IPs and its currently available public IPs are
+ * requested and stored in ipalloc_state->known_public_ips[pnn] and
+ * ipalloc_state->available_public_ips[pnn].  Slots belonging to
+ * inactive nodes are left untouched.
+ *
+ * The lists are requested with the per-node arrays as memory context
+ * (assumes the 4th argument of ctdb_ctrl_get_public_ips_flags() is the
+ * talloc parent of the returned list - confirm against its prototype),
+ * so they are released together with ipalloc_state instead of
+ * accumulating under ctdb->nodes on every takeover run.
+ *
+ * Returns 0 on success, -1 if ipalloc_state and nodemap disagree on the
+ * number of nodes or if any active node fails to return a list.
+ */
+static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
+                                        struct ipalloc_state *ipalloc_state,
+                                        struct ctdb_node_map_old *nodemap)
+{
+       uint32_t j;
+       int ret;
+
+       if (ipalloc_state->num != nodemap->num) {
+               DEBUG(DEBUG_ERR,
+                     (__location__
+                      " ipalloc_state->num (%u) != nodemap->num (%u) invalid param\n",
+                      (unsigned int)ipalloc_state->num,
+                      (unsigned int)nodemap->num));
+               return -1;
+       }
+
+       /* The counts were just checked to be equal, so iterating up to
+        * ipalloc_state->num visits exactly the nodemap entries. */
+       for (j=0; j<ipalloc_state->num; j++) {
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               /* Retrieve the list of known public IPs from the node */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       TAKEOVER_TIMEOUT(),
+                                       j,
+                                       ipalloc_state->known_public_ips,
+                                       0,
+                                       &ipalloc_state->known_public_ips[j]);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to read known public IPs from node: %u\n",
+                              j));
+                       return -1;
+               }
+
+               /* The result of the check is not acted upon here */
+               if (ctdb->do_checkpublicip) {
+                       verify_remote_ip_allocation(ctdb,
+                                                   ipalloc_state->known_public_ips[j],
+                                                   j);
+               }
+
+               /* Retrieve the list of available public IPs from the node */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       TAKEOVER_TIMEOUT(),
+                                       j,
+                                       ipalloc_state->available_public_ips,
+                                       CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
+                                       &ipalloc_state->available_public_ips[j]);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to read available public IPs from node: %u\n",
+                              j));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
 static struct public_ip_list *
-create_merged_ip_list(struct ctdb_context *ctdb)
+create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
 {
        int i, j;
        struct public_ip_list *ip_list;
        struct ctdb_public_ip_list_old *public_ips;
 
-       if (ctdb->ip_tree != NULL) {
-               talloc_free(ctdb->ip_tree);
-               ctdb->ip_tree = NULL;
-       }
+       TALLOC_FREE(ctdb->ip_tree);
        ctdb->ip_tree = trbt_create(ctdb, 0);
 
-       for (i=0;i<ctdb->num_nodes;i++) {
-               public_ips = ctdb->nodes[i]->known_public_ips;
+       for (i=0; i < ctdb->num_nodes; i++) {
+               public_ips = ipalloc_state->known_public_ips[i];
 
                if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
                        continue;
@@ -1411,9 +1493,9 @@ create_merged_ip_list(struct ctdb_context *ctdb)
                /* there were no public ips for this node */
                if (public_ips == NULL) {
                        continue;
-               }               
+               }
 
-               for (j=0;j<public_ips->num;j++) {
+               for (j=0; j < public_ips->num; j++) {
                        struct public_ip_list *tmp_ip;
 
                        tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
@@ -1543,20 +1625,22 @@ static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
 /* Allocate any unassigned IPs just by looping through the IPs and
  * finding the best node for each.
  */
-static void basic_allocate_unassigned(struct ctdb_context *ctdb,
+static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
                                      struct ctdb_ipflags *ipflags,
                                      struct public_ip_list *all_ips)
 {
        struct public_ip_list *tmp_ip;
 
-       /* loop over all ip's and find a physical node to cover for 
+       /* loop over all ip's and find a physical node to cover for
           each unassigned ip.
        */
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
-                       if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
-                               DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
-                                       ctdb_addr_to_str(&tmp_ip->addr)));
+                       if (find_takeover_node(ipalloc_state, ipflags,
+                                              tmp_ip, all_ips)) {
+                               DEBUG(DEBUG_WARNING,
+                                     ("Failed to find node to cover ip %s\n",
+                                      ctdb_addr_to_str(&tmp_ip->addr)));
                        }
                }
        }
@@ -1564,7 +1648,7 @@ static void basic_allocate_unassigned(struct ctdb_context *ctdb,
 
 /* Basic non-deterministic rebalancing algorithm.
  */
-static void basic_failback(struct ctdb_context *ctdb,
+static void basic_failback(struct ipalloc_state *ipalloc_state,
                           struct ctdb_ipflags *ipflags,
                           struct public_ip_list *all_ips,
                           int num_ips)
@@ -1597,12 +1681,13 @@ try_again:
                minnode = -1;
                for (i=0; i<numnodes; i++) {
                        /* only check nodes that can actually serve this ip */
-                       if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
+                       if (!can_node_takeover_ip(ipalloc_state, i,
+                                                 ipflags[i], tmp_ip)) {
                                /* no it couldnt   so skip to the next node */
                                continue;
                        }
 
-                       num = node_ip_coverage(ctdb, i, all_ips);
+                       num = node_ip_coverage(i, all_ips);
                        if (maxnode == -1) {
                                maxnode = i;
                                maxnum  = num;
@@ -1642,7 +1727,10 @@ try_again:
                        /* Reassign one of maxnode's VNNs */
                        for (tmp=all_ips;tmp;tmp=tmp->next) {
                                if (tmp->pnn == maxnode) {
-                                       (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
+                                       (void)find_takeover_node(ipalloc_state,
+                                                                ipflags,
+                                                                tmp,
+                                                                all_ips);
                                        retries++;
                                        goto try_again;;
                                }
@@ -1651,7 +1739,7 @@ try_again:
        }
 }
 
-static void lcp2_init(struct ctdb_context *tmp_ctx,
+static void lcp2_init(TALLOC_CTX *tmp_ctx,
                      struct ctdb_ipflags *ipflags,
                      struct public_ip_list *all_ips,
                      uint32_t *force_rebalance_nodes,
@@ -1709,7 +1797,7 @@ static void lcp2_init(struct ctdb_context *tmp_ctx,
 /* Allocate any unassigned addresses using the LCP2 algorithm to find
  * the IP/node combination that will cost the least.
  */
-static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
+static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
                                     struct ctdb_ipflags *ipflags,
                                     struct public_ip_list *all_ips,
                                     uint32_t *lcp2_imbalances)
@@ -1744,7 +1832,8 @@ static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
 
                        for (dstnode=0; dstnode<numnodes; dstnode++) {
                                /* only check nodes that can actually takeover this ip */
-                               if (!can_node_takeover_ip(ctdb, dstnode,
+                               if (!can_node_takeover_ip(ipalloc_state,
+                                                         dstnode,
                                                          ipflags[dstnode],
                                                          tmp_ip)) {
                                        /* no it couldnt   so skip to the next node */
@@ -1807,7 +1896,7 @@ static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
  * to move IPs from, determines the best IP/destination node
  * combination to move from the source node.
  */
-static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
+static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
                                    struct ctdb_ipflags *ipflags,
                                    struct public_ip_list *all_ips,
                                    int srcnode,
@@ -1855,7 +1944,7 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
                        }
 
                        /* only check nodes that can actually takeover this ip */
-                       if (!can_node_takeover_ip(ctdb, dstnode,
+                       if (!can_node_takeover_ip(ipalloc_state, dstnode,
                                                  ipflags[dstnode], tmp_ip)) {
                                /* no it couldnt   so skip to the next node */
                                continue;
@@ -1924,7 +2013,7 @@ static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
  * node with the highest LCP2 imbalance, and then determines the best
  * IP/destination node combination to move from the source node.
  */
-static void lcp2_failback(struct ctdb_context *ctdb,
+static void lcp2_failback(struct ipalloc_state *ipalloc_state,
                          struct ctdb_ipflags *ipflags,
                          struct public_ip_list *all_ips,
                          uint32_t *lcp2_imbalances,
@@ -1943,7 +2032,7 @@ try_again:
         */
        DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
        DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
-       lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
+       lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
        for (i=0; i<numnodes; i++) {
                lips[i].imbalance = lcp2_imbalances[i];
                lips[i].pnn = i;
@@ -1961,7 +2050,7 @@ try_again:
                        break;
                }
 
-               if (lcp2_failback_candidate(ctdb,
+               if (lcp2_failback_candidate(ipalloc_state,
                                            ipflags,
                                            all_ips,
                                            lips[i].pnn,
@@ -1978,7 +2067,7 @@ try_again:
        }
 }
 
-static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
+static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
                                    struct ctdb_ipflags *ipflags,
                                    struct public_ip_list *all_ips)
 {
@@ -1991,7 +2080,7 @@ static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
                if (tmp_ip->pnn == -1) {
                        continue;
                }
-               if (!can_node_host_ip(ctdb, tmp_ip->pnn,
+               if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
                                      ipflags[tmp_ip->pnn], tmp_ip) != 0) {
                        /* this node can not serve this ip. */
                        DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
@@ -2002,7 +2091,7 @@ static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
        }
 }
 
-static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
+static void ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
                                       struct ctdb_ipflags *ipflags,
                                       struct public_ip_list *all_ips)
 {
@@ -2025,18 +2114,18 @@ static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
         * IPs, since the modulo step above implicitly fails
         * back IPs to their "home" node.
         */
-       if (1 == ctdb->tunable.no_ip_failback) {
+       if (1 == ipalloc_state->no_ip_failback) {
                DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
        }
 
-       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
+       unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
 
-       basic_allocate_unassigned(ctdb, ipflags, all_ips);
+       basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
 
        /* No failback here! */
 }
 
-static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
+static void ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
                                          struct ctdb_ipflags *ipflags,
                                          struct public_ip_list *all_ips)
 {
@@ -2047,22 +2136,22 @@ static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
                num_ips++;
        }
 
-       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
+       unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
 
-       basic_allocate_unassigned(ctdb, ipflags, all_ips);
+       basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
 
        /* If we don't want IPs to fail back then don't rebalance IPs. */
-       if (1 == ctdb->tunable.no_ip_failback) {
+       if (1 == ipalloc_state->no_ip_failback) {
                return;
        }
 
        /* Now, try to make sure the ip adresses are evenly distributed
           across the nodes.
        */
-       basic_failback(ctdb, ipflags, all_ips, num_ips);
+       basic_failback(ipalloc_state, ipflags, all_ips, num_ips);
 }
 
-static void ip_alloc_lcp2(struct ctdb_context *ctdb,
+static void ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
                          struct ctdb_ipflags *ipflags,
                          struct public_ip_list *all_ips,
                          uint32_t *force_rebalance_nodes)
@@ -2071,17 +2160,17 @@ static void ip_alloc_lcp2(struct ctdb_context *ctdb,
        bool *rebalance_candidates;
        int numnodes, num_rebalance_candidates, i;
 
-       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       TALLOC_CTX *tmp_ctx = talloc_new(ipalloc_state);
 
-       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
+       unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
 
        lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
                  &lcp2_imbalances, &rebalance_candidates);
 
-       lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
+       lcp2_allocate_unassigned(ipalloc_state, ipflags, all_ips, lcp2_imbalances);
 
        /* If we don't want IPs to fail back then don't rebalance IPs. */
-       if (1 == ctdb->tunable.no_ip_failback) {
+       if (1 == ipalloc_state->no_ip_failback) {
                goto finished;
        }
 
@@ -2103,7 +2192,7 @@ static void ip_alloc_lcp2(struct ctdb_context *ctdb,
        /* Now, try to make sure the ip adresses are evenly distributed
           across the nodes.
        */
-       lcp2_failback(ctdb, ipflags, all_ips,
+       lcp2_failback(ipalloc_state, ipflags, all_ips,
                      lcp2_imbalances, rebalance_candidates);
 
 finished:
@@ -2125,27 +2214,22 @@ static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
 }
 
 /* The calculation part of the IP allocation algorithm. */
-static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
+static void ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
                                   struct ctdb_ipflags *ipflags,
-                                  struct public_ip_list **all_ips_p,
+                                  struct public_ip_list *all_ips,
                                   uint32_t *force_rebalance_nodes)
 {
-       /* since nodes only know about those public addresses that
-          can be served by that particular node, no single node has
-          a full list of all public addresses that exist in the cluster.
-          Walk over all node structures and create a merged list of
-          all public addresses that exist in the cluster.
-
-          keep the tree of ips around as ctdb->ip_tree
-       */
-       *all_ips_p = create_merged_ip_list(ctdb);
-
-        if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
-               ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
-       } else if (1 == ctdb->tunable.deterministic_public_ips) {
-               ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
-       } else {
-               ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
+       switch (ipalloc_state->algorithm) {
+       case IPALLOC_LCP2:
+               ip_alloc_lcp2(ipalloc_state, ipflags, all_ips,
+                             force_rebalance_nodes);
+               break;
+       case IPALLOC_DETERMINISTIC:
+               ip_alloc_deterministic_ips(ipalloc_state, ipflags, all_ips);
+               break;
+       case IPALLOC_NONDETERMINISTIC:
+               ip_alloc_nondeterministic_ips(ipalloc_state, ipflags, all_ips);
+               break;
        }
 
        /* at this point ->pnn is the node which will own each IP
@@ -2267,98 +2351,6 @@ static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
        return tvals;
 }
 
-struct get_runstate_callback_data {
-       enum ctdb_runstate *out;
-       bool fatal;
-};
-
-static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
-                                 int32_t res, TDB_DATA outdata,
-                                 void *callback_data)
-{
-       struct get_runstate_callback_data *cd =
-               (struct get_runstate_callback_data *)callback_data;
-       int size;
-
-       if (res != 0) {
-               /* Already handled in fail callback */
-               return;
-       }
-
-       if (outdata.dsize != sizeof(uint32_t)) {
-               DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
-                                pnn, (int)sizeof(uint32_t),
-                                (int)outdata.dsize));
-               cd->fatal = true;
-               return;
-       }
-
-       size = talloc_array_length(cd->out);
-       if (pnn >= size) {
-               DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
-                                pnn, size));
-               return;
-       }
-
-       cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
-}
-
-static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
-                                      int32_t res, TDB_DATA outdata,
-                                      void *callback)
-{
-       struct get_runstate_callback_data *cd =
-               (struct get_runstate_callback_data *)callback;
-
-       switch (res) {
-       case -ETIME:
-               DEBUG(DEBUG_ERR,
-                     ("Timed out getting runstate from node %d\n", pnn));
-               cd->fatal = true;
-               break;
-       default:
-               DEBUG(DEBUG_WARNING,
-                     ("Error getting runstate from node %d - assuming runstates not supported\n",
-                      pnn));
-       }
-}
-
-static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
-                                                   TALLOC_CTX *tmp_ctx,
-                                                   struct ctdb_node_map_old *nodemap,
-                                                   enum ctdb_runstate default_value)
-{
-       uint32_t *nodes;
-       enum ctdb_runstate *rs;
-       struct get_runstate_callback_data callback_data;
-       int i;
-
-       rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
-       CTDB_NO_MEMORY_NULL(ctdb, rs);
-       for (i=0; i<nodemap->num; i++) {
-               rs[i] = default_value;
-       }
-
-       callback_data.out = rs;
-       callback_data.fatal = false;
-
-       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
-       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
-                                     nodes, 0, TAKEOVER_TIMEOUT(),
-                                     true, tdb_null,
-                                     get_runstate_callback,
-                                     get_runstate_fail_callback,
-                                     &callback_data) != 0) {
-               if (callback_data.fatal) {
-                       free(rs);
-                       rs = NULL;
-               }
-       }
-       talloc_free(nodes);
-
-       return rs;
-}
-
 /* Set internal flags for IP allocation:
  *   Clear ip flags
  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
@@ -2373,8 +2365,7 @@ set_ipflags_internal(struct ctdb_context *ctdb,
                     TALLOC_CTX *tmp_ctx,
                     struct ctdb_node_map_old *nodemap,
                     uint32_t *tval_noiptakeover,
-                    uint32_t *tval_noiphostonalldisabled,
-                    enum ctdb_runstate *runstate)
+                    uint32_t *tval_noiphostonalldisabled)
 {
        int i;
        struct ctdb_ipflags *ipflags;
@@ -2389,17 +2380,10 @@ set_ipflags_internal(struct ctdb_context *ctdb,
                        ipflags[i].noiptakeover = true;
                }
 
-               /* Can not host IPs on node not in RUNNING state */
-               if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
-                       ipflags[i].noiphost = true;
-                       continue;
-               }
                /* Can not host IPs on INACTIVE node */
                if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
                        ipflags[i].noiphost = true;
                }
-               /* Remember the runstate */
-               ipflags[i].runstate = runstate[i];
        }
 
        if (all_nodes_are_disabled(nodemap)) {
@@ -2432,7 +2416,6 @@ static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
        uint32_t *tval_noiptakeover;
        uint32_t *tval_noiphostonalldisabled;
        struct ctdb_ipflags *ipflags;
-       enum ctdb_runstate *runstate;
 
 
        tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
@@ -2449,29 +2432,59 @@ static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
                return NULL;
        }
 
-       /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
-        * will default to CTDB_RUNSTATE_RUNNING.  This ensures
-        * reasonable behaviour on a mixed cluster during upgrade.
-        */
-       runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
-                                          CTDB_RUNSTATE_RUNNING);
-       if (runstate == NULL) {
-               /* Caller frees tmp_ctx */
-               return NULL;
-       }
-
        ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
                                       tval_noiptakeover,
-                                      tval_noiphostonalldisabled,
-                                      runstate);
+                                      tval_noiphostonalldisabled);
 
        talloc_free(tval_noiptakeover);
        talloc_free(tval_noiphostonalldisabled);
-       talloc_free(runstate);
 
        return ipflags;
 }
 
+/*
+ * Create the state object used for one IP allocation run.
+ *
+ * The object is a talloc child of mem_ctx.  It carries one (initially
+ * NULL) known/available public IP list slot per node in ctdb, the
+ * allocation algorithm selected from the tunables and a copy of the
+ * no_ip_failback tunable.
+ *
+ * Returns the new state, or NULL after logging if memory runs out; a
+ * partially built state is freed before returning.
+ */
+static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
+                                                TALLOC_CTX *mem_ctx)
+{
+       struct ipalloc_state *state;
+
+       state = talloc_zero(mem_ctx, struct ipalloc_state);
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+               return NULL;
+       }
+
+       state->num = ctdb->num_nodes;
+
+       /* Both arrays hang off the state so one talloc_free() drops all */
+       state->known_public_ips = talloc_zero_array(
+               state, struct ctdb_public_ip_list_old *, state->num);
+       if (state->known_public_ips == NULL) {
+               goto nomem;
+       }
+
+       state->available_public_ips = talloc_zero_array(
+               state, struct ctdb_public_ip_list_old *, state->num);
+       if (state->available_public_ips == NULL) {
+               goto nomem;
+       }
+
+       /* LCP2 takes precedence over deterministic; anything else
+        * falls back to the non-deterministic algorithm. */
+       state->algorithm = IPALLOC_NONDETERMINISTIC;
+       if (ctdb->tunable.lcp2_public_ip_assignment == 1) {
+               state->algorithm = IPALLOC_LCP2;
+       } else if (ctdb->tunable.deterministic_public_ips == 1) {
+               state->algorithm = IPALLOC_DETERMINISTIC;
+       }
+
+       state->no_ip_failback = ctdb->tunable.no_ip_failback;
+
+       return state;
+
+nomem:
+       DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+       talloc_free(state);
+       return NULL;
+}
+
 struct iprealloc_callback_data {
        bool *retry_nodes;
        int retry_count;
@@ -2585,6 +2598,7 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
        struct ctdb_client_control_state *state;
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        struct ctdb_ipflags *ipflags;
+       struct ipalloc_state *ipalloc_state;
        struct takeover_callback_data *takeover_data;
        struct iprealloc_callback_data iprealloc_data;
        bool *retry_data;
@@ -2598,6 +2612,12 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                goto ipreallocated;
        }
 
+       ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
+       if (ipalloc_state == NULL) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
        ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
        if (ipflags == NULL) {
                DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
@@ -2605,11 +2625,17 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                return -1;
        }
 
-       /* Short-circuit IP allocation if no nodes are in the RUNNING
-        * runstate yet, since no nodes will be able to host IPs */
+       /* Fetch known/available public IPs from each active node */
+       ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
+       if (ret != 0) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* Short-circuit IP allocation if no node has available IPs */
        can_host_ips = false;
-       for (i=0; i<nodemap->num; i++) {
-               if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
+       for (i=0; i < ipalloc_state->num; i++) {
+               if (ipalloc_state->available_public_ips[i] != NULL) {
                        can_host_ips = true;
                }
        }
@@ -2618,8 +2644,19 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                return 0;
        }
 
+       /* since nodes only know about those public addresses that
+          can be served by that particular node, no single node has
+          a full list of all public addresses that exist in the cluster.
+          Walk over all node structures and create a merged list of
+          all public addresses that exist in the cluster.
+
+          keep the tree of ips around as ctdb->ip_tree
+       */
+       all_ips = create_merged_ip_list(ctdb, ipalloc_state);
+
        /* Do the IP reassignment calculations */
-       ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
+       ctdb_takeover_run_core(ipalloc_state, ipflags,
+                              all_ips, force_rebalance_nodes);
 
        /* Now tell all nodes to release any public IPs should not
         * host.  This will be a NOOP on nodes that don't currently
@@ -4219,9 +4256,9 @@ int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
    node has the expected ip allocation.
    This is verified against ctdb->ip_tree
 */
-int verify_remote_ip_allocation(struct ctdb_context *ctdb,
-                               struct ctdb_public_ip_list_old *ips,
-                               uint32_t pnn)
+static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
+                                      struct ctdb_public_ip_list_old *ips,
+                                      uint32_t pnn)
 {
        struct public_ip_list *tmp_ip;
        int i;
@@ -4556,7 +4593,7 @@ int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_re
                close(h->fd[0]);
                debug_extra = talloc_asprintf(NULL, "reloadips:");
 
-               ctdb_set_process_name("ctdb_reloadips");
+               prctl_set_comment("ctdb_reloadips");
                if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
                        DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
                        res = -1;