4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #include "system/network.h"
25 #include "lib/util/debug.h"
26 #include "common/logging.h"
28 #include "protocol/protocol_api.h"
30 #include "server/ipalloc_private.h"
33 * This is the length of the longtest common prefix between the IPs.
34 * It is calculated by XOR-ing the 2 IPs together and counting the
35 * number of leading zeroes. The implementation means that all
36 * addresses end up being 128 bits long.
38 * FIXME? Should we consider IPv4 and IPv6 separately given that the
39 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
40 * lots of nodes and IP addresses?
42 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
44 uint32_t ip1_k[IP_KEYLEN];
49 uint32_t distance = 0;
51 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
53 for (i=0; i<IP_KEYLEN; i++) {
58 /* Count number of leading zeroes.
59 * FIXME? This could be optimised...
61 while ((x & (1 << 31)) == 0) {
71 /* Calculate the IP distance for the given IP relative to IPs on the
72 given node. The ips argument is generally the all_ips variable
73 used in the main part of the algorithm.
75 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
76 struct public_ip_list *ips,
79 struct public_ip_list *t;
84 for (t = ips; t != NULL; t = t->next) {
89 /* Optimisation: We never calculate the distance
90 * between an address and itself. This allows us to
91 * calculate the effect of removing an address from a
92 * node by simply calculating the distance between
93 * that address and all of the exitsing addresses.
94 * Moreover, we assume that we're only ever dealing
95 * with addresses from all_ips so we can identify an
96 * address via a pointer rather than doing a more
97 * expensive address comparison. */
98 if (&(t->addr) == ip) {
102 d = ip_distance(ip, &(t->addr));
103 sum += d * d; /* Cheaper than pulling in math.h :-) */
109 /* Return the LCP2 imbalance metric for addresses currently assigned
112 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
114 struct public_ip_list *t;
116 uint32_t imbalance = 0;
118 for (t = all_ips; t != NULL; t = t->next) {
122 /* Pass the rest of the IPs rather than the whole
125 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
131 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
132 uint32_t **lcp2_imbalances,
133 bool **rebalance_candidates)
136 struct public_ip_list *t;
138 numnodes = ipalloc_state->num;
140 *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
141 if (*rebalance_candidates == NULL) {
142 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
145 *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
146 if (*lcp2_imbalances == NULL) {
147 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
151 for (i=0; i<numnodes; i++) {
152 (*lcp2_imbalances)[i] =
153 lcp2_imbalance(ipalloc_state->all_ips, i);
154 /* First step: assume all nodes are candidates */
155 (*rebalance_candidates)[i] = true;
158 /* 2nd step: if a node has IPs assigned then it must have been
159 * healthy before, so we remove it from consideration. This
160 * is overkill but is all we have because we don't maintain
161 * state between takeover runs. An alternative would be to
162 * keep state and invalidate it every time the recovery master
165 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
167 (*rebalance_candidates)[t->pnn] = false;
171 /* 3rd step: if a node is forced to re-balance then
172 we allow failback onto the node */
173 if (ipalloc_state->force_rebalance_nodes == NULL) {
177 i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
179 uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
180 if (pnn >= numnodes) {
182 (__location__ "unknown node %u\n", pnn));
187 ("Forcing rebalancing of IPs to node %u\n", pnn));
188 (*rebalance_candidates)[pnn] = true;
194 /* Allocate any unassigned addresses using the LCP2 algorithm to find
195 * the IP/node combination that will cost the least.
197 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
198 uint32_t *lcp2_imbalances)
200 struct public_ip_list *t;
201 int dstnode, numnodes;
204 uint32_t mindsum, dstdsum, dstimbl;
205 uint32_t minimbl = 0;
206 struct public_ip_list *minip;
208 bool should_loop = true;
209 bool have_unassigned = true;
211 numnodes = ipalloc_state->num;
213 while (have_unassigned && should_loop) {
216 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
217 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
223 /* loop over each unassigned ip. */
224 for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
229 for (dstnode = 0; dstnode < numnodes; dstnode++) {
230 /* only check nodes that can actually takeover this ip */
231 if (!can_node_takeover_ip(ipalloc_state,
234 /* no it couldnt so skip to the next node */
238 dstdsum = ip_distance_2_sum(&(t->addr),
239 ipalloc_state->all_ips,
241 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
243 (" %s -> %d [+%d]\n",
244 ctdb_sock_addr_to_string(ipalloc_state,
247 dstimbl - lcp2_imbalances[dstnode]));
250 if ((minnode == -1) || (dstdsum < mindsum)) {
260 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
262 /* If we found one then assign it to the given node. */
264 minip->pnn = minnode;
265 lcp2_imbalances[minnode] = minimbl;
266 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
267 ctdb_sock_addr_to_string(
274 /* There might be a better way but at least this is clear. */
275 have_unassigned = false;
276 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
278 have_unassigned = true;
283 /* We know if we have an unassigned addresses so we might as
286 if (have_unassigned) {
287 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
290 ("Failed to find node to cover ip %s\n",
291 ctdb_sock_addr_to_string(ipalloc_state,
298 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
299 * to move IPs from, determines the best IP/destination node
300 * combination to move from the source node.
302 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
304 uint32_t *lcp2_imbalances,
305 bool *rebalance_candidates)
307 int dstnode, mindstnode, numnodes;
308 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
309 uint32_t minsrcimbl, mindstimbl;
310 struct public_ip_list *minip;
311 struct public_ip_list *t;
313 /* Find an IP and destination node that best reduces imbalance. */
320 numnodes = ipalloc_state->num;
322 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
323 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
324 srcnode, lcp2_imbalances[srcnode]));
326 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
327 /* Only consider addresses on srcnode. */
328 if (t->pnn != srcnode) {
332 /* What is this IP address costing the source node? */
333 srcdsum = ip_distance_2_sum(&(t->addr),
334 ipalloc_state->all_ips,
336 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
338 /* Consider this IP address would cost each potential
339 * destination node. Destination nodes are limited to
340 * those that are newly healthy, since we don't want
341 * to do gratuitous failover of IPs just to make minor
342 * balance improvements.
344 for (dstnode = 0; dstnode < numnodes; dstnode++) {
345 if (!rebalance_candidates[dstnode]) {
349 /* only check nodes that can actually takeover this ip */
350 if (!can_node_takeover_ip(ipalloc_state, dstnode,
352 /* no it couldnt so skip to the next node */
356 dstdsum = ip_distance_2_sum(&(t->addr),
357 ipalloc_state->all_ips,
359 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
360 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
362 ctdb_sock_addr_to_string(
363 ipalloc_state, &(t->addr)),
366 if ((dstimbl < lcp2_imbalances[srcnode]) &&
367 (dstdsum < srcdsum) && \
368 ((mindstnode == -1) || \
369 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
372 minsrcimbl = srcimbl;
373 mindstnode = dstnode;
374 mindstimbl = dstimbl;
378 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
380 if (mindstnode != -1) {
381 /* We found a move that makes things better... */
383 ("%d [%d] -> %s -> %d [+%d]\n",
384 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
385 ctdb_sock_addr_to_string(ipalloc_state, &(minip->addr)),
386 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
389 lcp2_imbalances[srcnode] = minsrcimbl;
390 lcp2_imbalances[mindstnode] = mindstimbl;
391 minip->pnn = mindstnode;
399 struct lcp2_imbalance_pnn {
404 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
406 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
407 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
409 if (lipa->imbalance > lipb->imbalance) {
411 } else if (lipa->imbalance == lipb->imbalance) {
418 /* LCP2 algorithm for rebalancing the cluster. This finds the source
419 * node with the highest LCP2 imbalance, and then determines the best
420 * IP/destination node combination to move from the source node.
422 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
423 uint32_t *lcp2_imbalances,
424 bool *rebalance_candidates)
427 struct lcp2_imbalance_pnn * lips;
430 numnodes = ipalloc_state->num;
433 /* Put the imbalances and nodes into an array, sort them and
434 * iterate through candidates. Usually the 1st one will be
435 * used, so this doesn't cost much...
437 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
438 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
439 lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
440 for (i = 0; i < numnodes; i++) {
441 lips[i].imbalance = lcp2_imbalances[i];
443 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
445 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
446 lcp2_cmp_imbalance_pnn);
449 for (i = 0; i < numnodes; i++) {
450 /* This means that all nodes had 0 or 1 addresses, so
451 * can't be imbalanced.
453 if (lips[i].imbalance == 0) {
457 if (lcp2_failback_candidate(ipalloc_state,
460 rebalance_candidates)) {
472 bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
474 uint32_t *lcp2_imbalances;
475 bool *rebalance_candidates;
476 int numnodes, num_rebalance_candidates, i;
479 unassign_unsuitable_ips(ipalloc_state);
481 if (!lcp2_init(ipalloc_state,
482 &lcp2_imbalances, &rebalance_candidates)) {
487 lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
489 /* If we don't want IPs to fail back then don't rebalance IPs. */
490 if (ipalloc_state->no_ip_failback) {
494 /* It is only worth continuing if we have suitable target
495 * nodes to transfer IPs to. This check is much cheaper than
498 numnodes = ipalloc_state->num;
499 num_rebalance_candidates = 0;
500 for (i=0; i<numnodes; i++) {
501 if (rebalance_candidates[i]) {
502 num_rebalance_candidates++;
505 if (num_rebalance_candidates == 0) {
509 /* Now, try to make sure the ip adresses are evenly distributed
512 lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);