ctdb-docs: Promote CTDB_INIT_STYLE to global options section
[samba.git] / ctdb / server / ipalloc_lcp2.c
1 /*
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include "replace.h"
23 #include "system/network.h"
24
25 #include "lib/util/debug.h"
26 #include "common/logging.h"
27
28 #include "protocol/protocol_util.h"
29
30 #include "server/ipalloc_private.h"
31
32 /*
33  * This is the length of the longtest common prefix between the IPs.
34  * It is calculated by XOR-ing the 2 IPs together and counting the
35  * number of leading zeroes.  The implementation means that all
36  * addresses end up being 128 bits long.
37  *
38  * FIXME? Should we consider IPv4 and IPv6 separately given that the
39  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
40  * lots of nodes and IP addresses?
41  */
42 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
43 {
44         uint32_t ip1_k[IP_KEYLEN];
45         uint32_t *t;
46         int i;
47         uint32_t x;
48
49         uint32_t distance = 0;
50
51         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
52         t = ip_key(ip2);
53         for (i=0; i<IP_KEYLEN; i++) {
54                 x = ip1_k[i] ^ t[i];
55                 if (x == 0) {
56                         distance += 32;
57                 } else {
58                         /* Count number of leading zeroes.
59                          * FIXME? This could be optimised...
60                          */
61                         while ((x & (1 << 31)) == 0) {
62                                 x <<= 1;
63                                 distance += 1;
64                         }
65                 }
66         }
67
68         return distance;
69 }
70
71 /* Calculate the IP distance for the given IP relative to IPs on the
72    given node.  The ips argument is generally the all_ips variable
73    used in the main part of the algorithm.
74  */
75 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
76                                   struct public_ip_list *ips,
77                                   int pnn)
78 {
79         struct public_ip_list *t;
80         uint32_t d;
81
82         uint32_t sum = 0;
83
84         for (t = ips; t != NULL; t = t->next) {
85                 if (t->pnn != pnn) {
86                         continue;
87                 }
88
89                 /* Optimisation: We never calculate the distance
90                  * between an address and itself.  This allows us to
91                  * calculate the effect of removing an address from a
92                  * node by simply calculating the distance between
93                  * that address and all of the exitsing addresses.
94                  * Moreover, we assume that we're only ever dealing
95                  * with addresses from all_ips so we can identify an
96                  * address via a pointer rather than doing a more
97                  * expensive address comparison. */
98                 if (&(t->addr) == ip) {
99                         continue;
100                 }
101
102                 d = ip_distance(ip, &(t->addr));
103                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
104         }
105
106         return sum;
107 }
108
109 /* Return the LCP2 imbalance metric for addresses currently assigned
110    to the given node.
111  */
112 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
113 {
114         struct public_ip_list *t;
115
116         uint32_t imbalance = 0;
117
118         for (t = all_ips; t != NULL; t = t->next) {
119                 if (t->pnn != pnn) {
120                         continue;
121                 }
122                 /* Pass the rest of the IPs rather than the whole
123                    all_ips input list.
124                 */
125                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
126         }
127
128         return imbalance;
129 }
130
131 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
132                       uint32_t **lcp2_imbalances,
133                       bool **rebalance_candidates)
134 {
135         int i, numnodes;
136         struct public_ip_list *t;
137
138         numnodes = ipalloc_state->num;
139
140         *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
141         if (*rebalance_candidates == NULL) {
142                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
143                 return false;
144         }
145         *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
146         if (*lcp2_imbalances == NULL) {
147                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
148                 return false;
149         }
150
151         for (i=0; i<numnodes; i++) {
152                 (*lcp2_imbalances)[i] =
153                         lcp2_imbalance(ipalloc_state->all_ips, i);
154                 /* First step: assume all nodes are candidates */
155                 (*rebalance_candidates)[i] = true;
156         }
157
158         /* 2nd step: if a node has IPs assigned then it must have been
159          * healthy before, so we remove it from consideration.  This
160          * is overkill but is all we have because we don't maintain
161          * state between takeover runs.  An alternative would be to
162          * keep state and invalidate it every time the recovery master
163          * changes.
164          */
165         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
166                 if (t->pnn != -1) {
167                         (*rebalance_candidates)[t->pnn] = false;
168                 }
169         }
170
171         /* 3rd step: if a node is forced to re-balance then
172            we allow failback onto the node */
173         if (ipalloc_state->force_rebalance_nodes == NULL) {
174                 return true;
175         }
176         for (i = 0;
177              i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
178              i++) {
179                 uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
180                 if (pnn >= numnodes) {
181                         DEBUG(DEBUG_ERR,
182                               (__location__ "unknown node %u\n", pnn));
183                         continue;
184                 }
185
186                 DEBUG(DEBUG_NOTICE,
187                       ("Forcing rebalancing of IPs to node %u\n", pnn));
188                 (*rebalance_candidates)[pnn] = true;
189         }
190
191         return true;
192 }
193
194 /* Allocate any unassigned addresses using the LCP2 algorithm to find
195  * the IP/node combination that will cost the least.
196  */
197 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
198                                      uint32_t *lcp2_imbalances)
199 {
200         struct public_ip_list *t;
201         int dstnode, numnodes;
202
203         int minnode;
204         uint32_t mindsum, dstdsum, dstimbl;
205         uint32_t minimbl = 0;
206         struct public_ip_list *minip;
207
208         bool should_loop = true;
209         bool have_unassigned = true;
210
211         numnodes = ipalloc_state->num;
212
213         while (have_unassigned && should_loop) {
214                 should_loop = false;
215
216                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
217                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
218
219                 minnode = -1;
220                 mindsum = 0;
221                 minip = NULL;
222
223                 /* loop over each unassigned ip. */
224                 for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
225                         if (t->pnn != -1) {
226                                 continue;
227                         }
228
229                         for (dstnode = 0; dstnode < numnodes; dstnode++) {
230                                 /* only check nodes that can actually takeover this ip */
231                                 if (!can_node_takeover_ip(ipalloc_state,
232                                                           dstnode,
233                                                           t)) {
234                                         /* no it couldnt   so skip to the next node */
235                                         continue;
236                                 }
237
238                                 dstdsum = ip_distance_2_sum(&(t->addr),
239                                                             ipalloc_state->all_ips,
240                                                             dstnode);
241                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
242                                 DEBUG(DEBUG_DEBUG,
243                                       (" %s -> %d [+%d]\n",
244                                        ctdb_sock_addr_to_string(ipalloc_state,
245                                                                 &(t->addr),
246                                                                 false),
247                                        dstnode,
248                                        dstimbl - lcp2_imbalances[dstnode]));
249
250
251                                 if ((minnode == -1) || (dstdsum < mindsum)) {
252                                         minnode = dstnode;
253                                         minimbl = dstimbl;
254                                         mindsum = dstdsum;
255                                         minip = t;
256                                         should_loop = true;
257                                 }
258                         }
259                 }
260
261                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
262
263                 /* If we found one then assign it to the given node. */
264                 if (minnode != -1) {
265                         minip->pnn = minnode;
266                         lcp2_imbalances[minnode] = minimbl;
267                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
268                                           ctdb_sock_addr_to_string(
269                                                   ipalloc_state,
270                                                   &(minip->addr), false),
271                                           minnode,
272                                           mindsum));
273                 }
274
275                 /* There might be a better way but at least this is clear. */
276                 have_unassigned = false;
277                 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
278                         if (t->pnn == -1) {
279                                 have_unassigned = true;
280                         }
281                 }
282         }
283
284         /* We know if we have an unassigned addresses so we might as
285          * well optimise.
286          */
287         if (have_unassigned) {
288                 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
289                         if (t->pnn == -1) {
290                                 DEBUG(DEBUG_WARNING,
291                                       ("Failed to find node to cover ip %s\n",
292                                        ctdb_sock_addr_to_string(ipalloc_state,
293                                                                 &t->addr,
294                                                                 false)));
295                         }
296                 }
297         }
298 }
299
300 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
301  * to move IPs from, determines the best IP/destination node
302  * combination to move from the source node.
303  */
304 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
305                                     int srcnode,
306                                     uint32_t *lcp2_imbalances,
307                                     bool *rebalance_candidates)
308 {
309         int dstnode, mindstnode, numnodes;
310         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
311         uint32_t minsrcimbl, mindstimbl;
312         struct public_ip_list *minip;
313         struct public_ip_list *t;
314
315         /* Find an IP and destination node that best reduces imbalance. */
316         srcimbl = 0;
317         minip = NULL;
318         minsrcimbl = 0;
319         mindstnode = -1;
320         mindstimbl = 0;
321
322         numnodes = ipalloc_state->num;
323
324         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
325         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
326                            srcnode, lcp2_imbalances[srcnode]));
327
328         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
329                 /* Only consider addresses on srcnode. */
330                 if (t->pnn != srcnode) {
331                         continue;
332                 }
333
334                 /* What is this IP address costing the source node? */
335                 srcdsum = ip_distance_2_sum(&(t->addr),
336                                             ipalloc_state->all_ips,
337                                             srcnode);
338                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
339
340                 /* Consider this IP address would cost each potential
341                  * destination node.  Destination nodes are limited to
342                  * those that are newly healthy, since we don't want
343                  * to do gratuitous failover of IPs just to make minor
344                  * balance improvements.
345                  */
346                 for (dstnode = 0; dstnode < numnodes; dstnode++) {
347                         if (!rebalance_candidates[dstnode]) {
348                                 continue;
349                         }
350
351                         /* only check nodes that can actually takeover this ip */
352                         if (!can_node_takeover_ip(ipalloc_state, dstnode,
353                                                   t)) {
354                                 /* no it couldnt   so skip to the next node */
355                                 continue;
356                         }
357
358                         dstdsum = ip_distance_2_sum(&(t->addr),
359                                                     ipalloc_state->all_ips,
360                                                     dstnode);
361                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
362                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
363                                            srcnode, -srcdsum,
364                                            ctdb_sock_addr_to_string(
365                                                    ipalloc_state,
366                                                    &(t->addr), false),
367                                            dstnode, dstdsum));
368
369                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
370                             (dstdsum < srcdsum) &&                      \
371                             ((mindstnode == -1) ||                              \
372                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
373
374                                 minip = t;
375                                 minsrcimbl = srcimbl;
376                                 mindstnode = dstnode;
377                                 mindstimbl = dstimbl;
378                         }
379                 }
380         }
381         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
382
383         if (mindstnode != -1) {
384                 /* We found a move that makes things better... */
385                 DEBUG(DEBUG_INFO,
386                       ("%d [%d] -> %s -> %d [+%d]\n",
387                        srcnode, minsrcimbl - lcp2_imbalances[srcnode],
388                        ctdb_sock_addr_to_string(ipalloc_state,
389                                                 &(minip->addr), false),
390                        mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
391
392
393                 lcp2_imbalances[srcnode] = minsrcimbl;
394                 lcp2_imbalances[mindstnode] = mindstimbl;
395                 minip->pnn = mindstnode;
396
397                 return true;
398         }
399
400         return false;
401 }
402
403 struct lcp2_imbalance_pnn {
404         uint32_t imbalance;
405         int pnn;
406 };
407
408 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
409 {
410         const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
411         const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
412
413         if (lipa->imbalance > lipb->imbalance) {
414                 return -1;
415         } else if (lipa->imbalance == lipb->imbalance) {
416                 return 0;
417         } else {
418                 return 1;
419         }
420 }
421
422 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
423  * node with the highest LCP2 imbalance, and then determines the best
424  * IP/destination node combination to move from the source node.
425  */
426 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
427                           uint32_t *lcp2_imbalances,
428                           bool *rebalance_candidates)
429 {
430         int i, numnodes;
431         struct lcp2_imbalance_pnn * lips;
432         bool again;
433
434         numnodes = ipalloc_state->num;
435
436 try_again:
437         /* Put the imbalances and nodes into an array, sort them and
438          * iterate through candidates.  Usually the 1st one will be
439          * used, so this doesn't cost much...
440          */
441         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
442         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
443         lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
444         for (i = 0; i < numnodes; i++) {
445                 lips[i].imbalance = lcp2_imbalances[i];
446                 lips[i].pnn = i;
447                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
448         }
449         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
450               lcp2_cmp_imbalance_pnn);
451
452         again = false;
453         for (i = 0; i < numnodes; i++) {
454                 /* This means that all nodes had 0 or 1 addresses, so
455                  * can't be imbalanced.
456                  */
457                 if (lips[i].imbalance == 0) {
458                         break;
459                 }
460
461                 if (lcp2_failback_candidate(ipalloc_state,
462                                             lips[i].pnn,
463                                             lcp2_imbalances,
464                                             rebalance_candidates)) {
465                         again = true;
466                         break;
467                 }
468         }
469
470         talloc_free(lips);
471         if (again) {
472                 goto try_again;
473         }
474 }
475
476 bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
477 {
478         uint32_t *lcp2_imbalances;
479         bool *rebalance_candidates;
480         int numnodes, num_rebalance_candidates, i;
481         bool ret = true;
482
483         unassign_unsuitable_ips(ipalloc_state);
484
485         if (!lcp2_init(ipalloc_state,
486                        &lcp2_imbalances, &rebalance_candidates)) {
487                 ret = false;
488                 goto finished;
489         }
490
491         lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
492
493         /* If we don't want IPs to fail back then don't rebalance IPs. */
494         if (ipalloc_state->no_ip_failback) {
495                 goto finished;
496         }
497
498         /* It is only worth continuing if we have suitable target
499          * nodes to transfer IPs to.  This check is much cheaper than
500          * continuing on...
501          */
502         numnodes = ipalloc_state->num;
503         num_rebalance_candidates = 0;
504         for (i=0; i<numnodes; i++) {
505                 if (rebalance_candidates[i]) {
506                         num_rebalance_candidates++;
507                 }
508         }
509         if (num_rebalance_candidates == 0) {
510                 goto finished;
511         }
512
513         /* Now, try to make sure the ip adresses are evenly distributed
514            across the nodes.
515         */
516         lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
517
518 finished:
519         return ret;
520 }