ce9cc115e6a4ca7f4b3384335d60b2ec8125b832
[metze/ctdb/wip.git] / tests / takeover / ctdb_takeover.py
1 #!/usr/bin/env python
2
3 # ctdb ip takeover code
4
5 # Copyright (C) Ronnie Sahlberg  2007
6 # Copyright (C) Andrew Tridgell  2007
7 #
8 # Python version (C) Martin Schwenke 2010
9
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either version 3 of the License, or
13 # (at your option) any later version.
14
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 # GNU General Public License for more details.
19
20 # You should have received a copy of the GNU General Public License
21 # along with this program; if not, see <http://www.gnu.org/licenses/>.
22
23
24 import os
25 import sys
26 from optparse import OptionParser
27 import copy
28 import random
29
def process_args(argv=None):
    """Parse the command-line options for the takeover simulation.

    argv: optional list of argument strings to parse.  When None (the
    default) optparse falls back to sys.argv[1:], so existing callers
    that pass nothing behave as before.

    Returns the optparse values object holding the attributes
    deterministic_public_ips, no_ip_failback, verbose, diff, show,
    hack, retries, iterations, balance and exit.

    Positional arguments are not accepted: if any are present
    parser.error() is called, which reports the problem and exits.
    """
    parser = OptionParser(usage="usage: %prog [options]")

    parser.add_option("--nd",
                      action="store_false", dest="deterministic_public_ips",
                      default=True,
                      help="turn off deterministic_public_ips")
    parser.add_option("--ni",
                      action="store_true", dest="no_ip_failback", default=False,
                      help="turn on no_ip_failback")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print information and actions taken to stdout")
    parser.add_option("-d", "--diff",
                      action="store_true", dest="diff", default=False,
                      help="after each recovery show IP address movements")
    parser.add_option("-n", "--no-print",
                      action="store_false", dest="show", default=True,
                      help="after each recovery don't print IP address layout")
    parser.add_option("--hack",
                      action="store", type="int", dest="hack", default=0,
                      help="apply a hack (see the code!!!)")
    parser.add_option("-r", "--retries",
                      action="store", type="int", dest="retries", default=5,
                      help="number of retry loops for rebalancing")
    parser.add_option("-i", "--iterations",
                      action="store", type="int", dest="iterations",
                      default=1000,
                      help="number of iterations to run in test")
    parser.add_option("-b", "--balance",
                      action="store_true", dest="balance", default=False,
                      help="show (im)balance information")
    parser.add_option("-x", "--exit",
                      action="store_true", dest="exit", default=False,
                      help="exit on the 1st gratuitous IP move")

    # Use local names that do not shadow the module-level "options".
    (opts, leftover) = parser.parse_args(argv)

    if leftover:
        parser.error("too many arguments")

    return opts
74
def print_begin(t):
    """Print a section header: a 40-character "=" rule, then "<t>:".

    t is formatted with %s, so any object with a string form works.
    """
    # Single-argument parenthesized print emits the same text on
    # Python 2 and is also valid on Python 3.
    print("=" * 40)
    # The explicit 1-tuple keeps a tuple-valued title from being
    # unpacked as multiple format arguments.
    print("%s:" % (t,))
78
def print_end():
    """Print a section footer: a 40-character "-" rule."""
    # Single-argument parenthesized print emits the same text on
    # Python 2 and is also valid on Python 3.
    print("-" * 40)
81
def verbose_begin(t):
    """Emit the section header for t via print_begin, in verbose mode only.

    Does nothing when the module-level options.verbose is false.
    """
    if not options.verbose:
        return
    print_begin(t)
85
def verbose_end():
    """Emit the section footer via print_end, in verbose mode only.

    Does nothing when the module-level options.verbose is false.
    """
    if not options.verbose:
        return
    print_end()
89
def verbose_print(t):
    """Print t in verbose mode; do nothing otherwise.

    t may be a single value or a list of values.  Each value is
    converted with str() and printed on its own line.  An empty list
    prints nothing at all (not even a blank line).
    """
    if not options.verbose:
        return

    # isinstance() also recognises list subclasses, which an exact
    # type comparison would print as a single repr.
    items = t if isinstance(t, list) else [t]
    if items:
        # Single-argument parenthesized print: same output on Python 2,
        # and valid on Python 3.
        print("\n".join([str(i) for i in items]))
96
97
class Node(object):
    """A cluster node: the addresses it may host, those it hosts now,
    and a health flag."""

    def __init__(self, public_addresses):
        """Create a healthy node that currently hosts no addresses.

        public_addresses: iterable of the addresses this node is able
        to serve; duplicates collapse because it is stored as a set.
        """
        self.healthy = True
        self.current_addresses = set()
        self.public_addresses = set(public_addresses)

    def can_node_serve_ip(self, ip):
        """Return True if ip is one of this node's public addresses."""
        servable = self.public_addresses
        return ip in servable

    def node_ip_coverage(self):
        """Return the number of addresses currently held by this node."""
        held = self.current_addresses
        return len(held)
109
class Cluster(object):
    """A list of nodes plus the IP takeover algorithm that spreads the
    public addresses across them.

    Reads the module-level "options" object for its configuration and
    uses the module-level "prev" as the snapshot of the cluster taken
    at the end of the previous recover() call.
    """

    def __init__(self):
        self.nodes = []
        self.deterministic_public_ips = options.deterministic_public_ips
        self.no_ip_failback = options.no_ip_failback
        self.all_public_ips = set()

        # Statistics: recover() appends one imbalance entry per call,
        # and one ip_moves/grat_ip_moves entry whenever a previous
        # snapshot exists to diff against.
        self.ip_moves = []
        self.grat_ip_moves = []
        self.imbalance = []
        # Incremented by every ctdb_takeover_run(); starting at -1
        # means the first run leaves the count at 0.
        self.events = -1

    def __str__(self):
        """One line per node: index, status flag, sorted current IPs.

        The flag is "*" for a node with no public addresses, " " for a
        healthy node and "#" for an unhealthy one.
        """
        lines = []
        for (i, n) in enumerate(self.nodes):
            if len(n.public_addresses) == 0:
                flag = "*"
            elif n.healthy:
                flag = " "
            else:
                flag = "#"
            lines.append("%2d %s %s" % (i, flag, sorted(n.current_addresses)))
        return "\n".join(lines)

    def print_statistics(self):
        """Print the accumulated event, IP move and imbalance totals."""
        # self.imbalance is empty until recover() has run at least
        # once; report 0 rather than failing in max() or on [-1].
        max_imbalance = max(self.imbalance) if self.imbalance else 0
        final_imbalance = self.imbalance[-1] if self.imbalance else 0

        print_begin("STATISTICS")
        print("Events:              %6d" % self.events)
        print("Total IP moves:      %6d" % sum(self.ip_moves))
        print("Gratuitous IP moves: %6d" % sum(self.grat_ip_moves))
        print("Max imbalance:       %6d" % max_imbalance)
        print("Final imbalance:     %6d" % final_imbalance)
        print_end()

    def find_pnn_with_ip(self, ip):
        """Return the index of the node currently holding ip, or -1."""
        for (i, n) in enumerate(self.nodes):
            if ip in n.current_addresses:
                return i
        return -1

    def quietly_remove_ip(self, ip):
        """Remove ip from the node that holds it, if any, without logging."""
        old = self.find_pnn_with_ip(ip)
        if old != -1:
            self.nodes[old].current_addresses.remove(ip)

    def add_node(self, node):
        """Append node and add its public addresses to the cluster-wide set."""
        self.nodes.append(node)
        self.all_public_ips |= node.public_addresses

    def _set_health(self, label, is_healthy, pnns):
        """Set the health flag of each node in pnns, logging under label."""
        verbose_begin(label)

        for pnn in pnns:
            self.nodes[pnn].healthy = is_healthy
            verbose_print(pnn)

        verbose_end()

    def healthy(self, *pnns):
        """Mark the nodes with the given indices as healthy."""
        self._set_health("HEALTHY", True, pnns)

    def unhealthy(self, *pnns):
        """Mark the nodes with the given indices as unhealthy."""
        self._set_health("UNHEALTHY", False, pnns)

    def do_something_random(self):
        """Make a random node healthy or unhealthy.

        If all nodes are healthy or unhealthy, then invert one of
        them.  Otherwise, there's a 1/4 chance of making another node
        unhealthy; the rest of the time an unhealthy node is made
        healthy."""

        num_nodes = len(self.nodes)
        # Work with ordered index lists so that, for a given random
        # seed, the chosen node does not depend on set ordering.
        healthy_pnns = [i for (i, n) in enumerate(self.nodes) if n.healthy]
        unhealthy_pnns = [i for (i, n) in enumerate(self.nodes)
                          if not n.healthy]

        if not unhealthy_pnns:
            self.unhealthy(random.randint(0, num_nodes - 1))
        elif not healthy_pnns:
            self.healthy(random.randint(0, num_nodes - 1))
        elif random.randint(1, 4) == 1:
            self.unhealthy(random.choice(healthy_pnns))
        else:
            self.healthy(random.choice(unhealthy_pnns))

    def random_iterations(self):
        """Run options.iterations random health changes, recovering
        after each, then print the statistics.

        Stops early when options.exit is set and a recovery reports at
        least one gratuitous IP move.
        """
        for i in range(1, options.iterations + 1):
            verbose_begin("EVENT %d" % i)
            verbose_end()
            self.do_something_random()
            # recover() returns the number of gratuitous moves.
            if self.recover() and options.exit:
                break

        self.print_statistics()

    def _capable_coverage(self, ip):
        """Return [(pnn, address count)] for every healthy node that can
        serve ip, in node order."""
        return [(i, n.node_ip_coverage())
                for (i, n) in enumerate(self.nodes)
                if n.healthy and n.can_node_serve_ip(ip)]

    def calculate_imbalance(self):
        """Return the worst imbalance over all currently assigned IPs.

        For each assigned IP, look at the healthy nodes able to serve
        it and take the difference between the largest and smallest
        address counts among them.  A difference below 2 counts as 0.
        """
        imbalance = 0

        for n in self.nodes:
            for ip in n.current_addresses:
                counts = [num for (_, num) in self._capable_coverage(ip)]
                if not counts:
                    continue

                spread = max(counts) - min(counts)
                if spread < 2:
                    spread = 0
                imbalance = max(imbalance, spread)

        return imbalance

    def diff(self, prev):
        """Calculate differences in IP assignments between self and prev.

        Returns a tuple (ip_moves, grat_ip_moves, details):

        ip_moves is the number of IPs whose node differs from prev
        (including IPs that were unassigned in prev), grat_ip_moves is
        how many of those were gratuitous, and details is a list of
        "<prefix> <ip>: <old> -> <new>" strings, one per move.

        A move is gratuitous when the IP was assigned in prev and both
        the old and the new node are healthy in prev and in self; such
        moves are prefixed by "!!"."""

        ip_moves = 0
        grat_ip_moves = 0
        details = []

        for (new, n) in enumerate(self.nodes):
            for ip in n.current_addresses:
                old = prev.find_pnn_with_ip(ip)
                if old == new:
                    continue

                ip_moves += 1
                if old != -1 and \
                        prev.nodes[new].healthy and \
                        self.nodes[new].healthy and \
                        self.nodes[old].healthy and \
                        prev.nodes[old].healthy:
                    prefix = "!!"
                    grat_ip_moves += 1
                else:
                    prefix = "  "
                details.append("%s %s: %d -> %d" %
                               (prefix, ip, old, new))

        return (ip_moves, grat_ip_moves, details)

    def _assign_to_least_loaded(self, ip, healthy_only):
        """Give ip to the capable node holding the fewest addresses.

        When healthy_only is true, unhealthy nodes are not considered.
        Ties go to the lowest node index.  Returns True if a node was
        found and False (after logging) otherwise.
        """
        candidates = [(n.node_ip_coverage(), i)
                      for (i, n) in enumerate(self.nodes)
                      if n.can_node_serve_ip(ip)
                      and (n.healthy or not healthy_only)]

        if not candidates:
            verbose_print("Could not find node to take over public address %s" % ip)
            return False

        # Tuples compare by count first, then by index, so min() picks
        # the first node that has the lowest count.
        (_, pnn) = min(candidates)
        self.nodes[pnn].current_addresses.add(ip)

        verbose_print("%s -> %d" % (ip, pnn))
        return True

    def find_least_loaded_node(self, ip):
        """Just like find_takeover_node but doesn't care about health."""
        return self._assign_to_least_loaded(ip, False)

    def find_takeover_node(self, ip):
        """Assign ip to the least loaded healthy node that can serve it.

        Returns True on success, False if no such node exists."""
        return self._assign_to_least_loaded(ip, True)

    def ctdb_takeover_run(self):
        """Run one pass of the takeover algorithm over the cluster.

        Steps: optional deterministic remap of every address, removal
        of addresses from unhealthy or incapable nodes, assignment of
        unassigned addresses, and (unless no_ip_failback is set)
        rebalancing limited to options.retries reallocations.
        """
        self.events += 1

        # Don't bother with the num_healthy stuff.  It is an
        # irrelevant detail.

        # We just keep the allocated IPs in the current_addresses field
        # of the node.  This needs to be readable, not efficient!

        if self.deterministic_public_ips:
            # Remap everything.
            for (i, ip) in enumerate(sorted(self.all_public_ips)):
                if options.hack == 1:
                    # Hack 1: least loaded capable node, ignoring health.
                    self.quietly_remove_ip(ip)
                    self.find_least_loaded_node(ip)
                    continue

                pnn = i % len(self.nodes)
                # Hack 2: only move the address if the round-robin
                # target is able to serve it; otherwise leave it alone.
                if options.hack == 2 and \
                        ip not in self.nodes[pnn].public_addresses:
                    continue

                self.quietly_remove_ip(ip)
                # Add addresses to new node.
                self.nodes[pnn].current_addresses.add(ip)
                verbose_print("%s -> %d" % (ip, pnn))

        # Remove public addresses from unhealthy nodes.
        for (pnn, n) in enumerate(self.nodes):
            if not n.healthy:
                verbose_print(["%s <- %d" % (ip, pnn)
                               for ip in n.current_addresses])
                n.current_addresses = set()

        # If a node can't serve an assigned address then remove it.
        # Enumerate here so the log names the node actually losing the
        # address rather than an index left over from the loop above.
        for (pnn, n) in enumerate(self.nodes):
            verbose_print(["%s <- %d" % (ip, pnn)
                           for ip in n.current_addresses - n.public_addresses])
            n.current_addresses &= n.public_addresses

        # We'll only retry the balancing act up to options.retries times.
        retries = 0
        should_loop = True
        while should_loop:
            should_loop = False

            assigned = set(ip for n in self.nodes
                           for ip in n.current_addresses)

            for ip in sorted(self.all_public_ips - assigned):
                self.find_takeover_node(ip)

            if self.no_ip_failback:
                break

            assigned = sorted(ip
                              for n in self.nodes
                              for ip in n.current_addresses)
            for ip in assigned:
                coverage = self._capable_coverage(ip)

                if not coverage:
                    print("Could not maxnode. May not be able to serve ip %s"
                          % (ip,))
                    continue

                if self.deterministic_public_ips:
                    continue

                # max() returns the first maximal entry, i.e. the
                # lowest-numbered node among the most heavily loaded.
                (maxnode, maxnum) = max(coverage, key=lambda c: c[1])
                minnum = min(num for (_, num) in coverage)

                if maxnum > minnum + 1 and retries < options.retries:
                    # Remove the 1st ip from maxnode
                    realloc = min(self.nodes[maxnode].current_addresses)
                    verbose_print("%s <- %d" % (realloc, maxnode))
                    self.nodes[maxnode].current_addresses.remove(realloc)
                    retries += 1
                    # Redo the outer loop.
                    should_loop = True
                    break

    def recover(self):
        """Run a takeover, record statistics and report as configured.

        Compares the result with the module-level snapshot "prev" (when
        one exists), prints the DIFF, IMBALANCE and STATE sections
        according to options.diff, options.balance and options.show,
        then replaces "prev" with a deep copy of this cluster.

        Returns the number of gratuitous IP moves (0 when there was no
        previous snapshot).
        """
        global prev

        verbose_begin("TAKEOVER")

        self.ctdb_takeover_run()

        verbose_end()

        grat_ip_moves = 0

        if prev is not None:
            (ip_moves, grat_ip_moves, details) = self.diff(prev)
            self.ip_moves.append(ip_moves)
            self.grat_ip_moves.append(grat_ip_moves)

            if options.diff:
                print_begin("DIFF")
                print("\n".join(details))
                print_end()

        imbalance = self.calculate_imbalance()
        self.imbalance.append(imbalance)
        if options.balance:
            print_begin("IMBALANCE")
            print(imbalance)
            print_end()

        if options.show:
            print_begin("STATE")
            print(self)
            print_end()

        prev = copy.deepcopy(self)

        return grat_ip_moves
488
489
490 ############################################################
491
# Snapshot of the cluster taken at the end of the most recent
# Cluster.recover() call (a deep copy); recover() diffs the new state
# against it.  None until the first recovery has run.
prev = None

# Parsed at import time from the command line.  Read as a global by
# the verbose_* helpers and by Cluster.
options = process_args()
495