Eventscripts: print a message when reconfiguring a service.
[sahlberg/ctdb.git] / tests / takeover / ctdb_takeover.py
1 #!/usr/bin/env python
2
3 # ctdb ip takeover code
4
5 # Copyright (C) Martin Schwenke 2010
6
7 # Based on original CTDB C code:
8 #
9 # Copyright (C) Ronnie Sahlberg  2007
10 # Copyright (C) Andrew Tridgell  2007
11
12 # This program is free software; you can redistribute it and/or modify
13 # it under the terms of the GNU General Public License as published by
14 # the Free Software Foundation; either version 3 of the License, or
15 # (at your option) any later version.
16
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 # GNU General Public License for more details.
21
22 # You should have received a copy of the GNU General Public License
23 # along with this program; if not, see <http://www.gnu.org/licenses/>.
24
25
26 import os
27 import sys
28 # Use optparse since newer argparse not available in RHEL5/EPEL.
29 from optparse import OptionParser
30 import copy
31 import random
32
# Parsed command-line options; filled in by process_args().
options = None

def process_args(extra_options=None):
    """Parse the command line and store the result in the global options.

    extra_options, if given, is a list of optparse Option objects
    that are registered in addition to the options defined here.

    Positional arguments are not accepted: if any are present the
    parser reports an error.  Passing -s/--seed seeds the random
    module as a side effect of parsing.
    """
    global options

    # The default is None rather than [] so that no list object is
    # shared between calls; OptionParser treats None as "no extras".
    parser = OptionParser(option_list=extra_options)

    parser.add_option("--nd",
                      action="store_false", dest="deterministic_public_ips",
                      default=True,
                      help="turn off deterministic_public_ips")
    parser.add_option("--ni",
                      action="store_true", dest="no_ip_failback", default=False,
                      help="turn on no_ip_failback")
    parser.add_option("-b", "--balance",
                      action="store_true", dest="balance", default=False,
                      help="show (im)balance information after each event")
    parser.add_option("-d", "--diff",
                      action="store_true", dest="diff", default=False,
                      help="show IP address movements for each event")
    parser.add_option("-n", "--no-print",
                      action="store_false", dest="show", default=True,
                      help="don't show IP address layout after each event")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print information and actions taken to stdout")
    parser.add_option("--hack",
                      action="store", type="int", dest="hack", default=0,
                      help="apply a hack (see the code!!!)")
    parser.add_option("-r", "--retries",
                      action="store", type="int", dest="retries", default=5,
                      help="number of retry loops for rebalancing [default: %default]")
    parser.add_option("-i", "--iterations",
                      action="store", type="int", dest="iterations",
                      default=1000,
                      help="number of iterations to run in test [default: %default]")
    parser.add_option("-o", "--odds",
                      action="store", type="int", dest="odds", default=4,
                      help="make the chances of a failover 1 in ODDS [default: %default]")

    # The seed is applied immediately while parsing instead of being
    # stored in the options object.
    def seed_callback(option, opt, value, parser):
        random.seed(value)
    parser.add_option("-s", "--seed",
                      action="callback", type="int", callback=seed_callback,
                      help="initial random number seed for random events")

    parser.add_option("-x", "--exit",
                      action="store_true", dest="exit", default=False,
                      help="exit on the 1st gratuitous IP move")

    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error("too many arguments")
87
def print_begin(t):
    """Print a rule of 40 "=" characters followed by the heading "t:"."""
    rule = "=" * 40
    print(rule)
    heading = "%s:" % t
    print(heading)
91
def print_end():
    """Print a rule of 40 "-" characters, closing a section."""
    rule = "-" * 40
    print(rule)
94
def verbose_begin(t):
    """Print the section header for t, but only when options.verbose is set."""
    if not options.verbose:
        return
    print_begin(t)
98
def verbose_end():
    """Print the section footer, but only when options.verbose is set."""
    if not options.verbose:
        return
    print_end()
102
def verbose_print(t):
    """Print t to stdout when options.verbose is set.

    t is either a single value or a list of values.  A list is
    printed one item per line, each item converted with str(); an
    empty list prints nothing at all.  Any other value is printed on
    a line of its own.
    """
    if not options.verbose:
        return

    # isinstance rather than an exact type comparison, so that list
    # subclasses are also printed item by item.
    items = t if isinstance(t, list) else [t]
    if items:
        print("\n".join([str(i) for i in items]))
109
110
class Node(object):
    """One cluster node: the addresses it may host, those it hosts, its health."""

    def __init__(self, public_addresses):
        """Create a healthy node that may serve public_addresses.

        public_addresses is any iterable; it is stored as a set.  The
        node starts out with no addresses assigned to it.
        """
        self.healthy = True
        # Addresses currently assigned to this node.
        self.current_addresses = set()
        # Addresses this node is able to serve.
        self.public_addresses = set(public_addresses)

    def can_node_serve_ip(self, ip):
        """Return True if ip is one of this node's public addresses."""
        servable = self.public_addresses
        return ip in servable

    def node_ip_coverage(self):
        """Return how many addresses are currently assigned to this node."""
        assigned = self.current_addresses
        return len(assigned)
122
123 class Cluster(object):
124     def __init__(self):
125         self.nodes = []
126         self.deterministic_public_ips = options.deterministic_public_ips
127         self.no_ip_failback = options.no_ip_failback
128         self.all_public_ips = set()
129
130         # Statistics
131         self.ip_moves = []
132         self.grat_ip_moves = []
133         self.imbalance = []
134         self.events = -1
135         self.num_unhealthy = []
136
137         self.prev = None
138
139     def __str__(self):
140         return "\n".join(["%2d %s %s" %
141                           (i,
142                            "*" if len(n.public_addresses) == 0 else \
143                                (" " if n.healthy else "#"),
144                            sorted(list(n.current_addresses)))
145                           for (i, n) in enumerate(self.nodes)])
146
147     def print_statistics(self):
148         print_begin("STATISTICS")
149         print "Events:              %6d" % self.events
150         print "Total IP moves:      %6d" % sum(self.ip_moves)
151         print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
152         print "Max imbalance:       %6d" % max(self.imbalance)
153         print "Final imbalance:     %6d" % self.imbalance[-1]
154         print "Maximum unhealthy:   %6d" % max(self.num_unhealthy)
155         print_end()
156
157     def find_pnn_with_ip(self, ip):
158         for (i, n) in enumerate(self.nodes):
159             if ip in n.current_addresses:
160                 return i
161         return -1
162
163     def quietly_remove_ip(self, ip):
164         # Remove address from old node.
165         old = self.find_pnn_with_ip(ip)
166         if old != -1:
167             self.nodes[old].current_addresses.remove(ip)
168
169     def add_node(self, node):
170         self.nodes.append(node)
171         self.all_public_ips |= node.public_addresses
172
173     def healthy(self, *pnns):
174         verbose_begin("HEALTHY")
175
176         for pnn in pnns:
177             self.nodes[pnn].healthy = True
178             verbose_print(pnn)
179
180         verbose_end()
181         
182     def unhealthy(self, *pnns):
183
184         verbose_begin("UNHEALTHY")
185
186         for pnn in pnns:
187             self.nodes[pnn].healthy = False
188             verbose_print(pnn)
189
190         verbose_end()
191
192     def do_something_random(self):
193
194
195         """Make a random node healthy or unhealthy.
196
197         If all nodes are healthy or unhealthy, then invert one of
198         them.  Otherwise, there's a 1 in options.odds chance of making
199         another node unhealthy."""
200
201         num_nodes = len(self.nodes)
202         healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
203         num_healthy = len(healthy_pnns)
204
205         if num_nodes == num_healthy:
206             self.unhealthy(random.randint(0, num_nodes-1))
207         elif num_healthy == 0:
208             self.healthy(random.randint(0, num_nodes-1))
209         elif random.randint(1, options.odds) == 1:
210             self.unhealthy(random.choice(healthy_pnns))
211         else:
212             all_pnns = range(num_nodes)
213             unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
214             self.healthy(random.choice(unhealthy_pnns))
215
216     def random_iterations(self):
217         i = 1
218         while i <= options.iterations:
219             verbose_begin("EVENT %d" % i)
220             verbose_end()
221             self.do_something_random()
222             if self.recover() and options.exit > 0:
223                 break
224             i += 1
225
226         self.print_statistics()
227
228     def calculate_imbalance(self):
229
230         imbalance = 0
231
232         assigned = sorted([ip
233                            for n in self.nodes
234                            for ip in n.current_addresses])
235
236         for ip in assigned:
237
238             num_capable = 0
239             maxnode = -1
240             minnode = -1
241             for (i, n) in enumerate(self.nodes):
242                 if not n.healthy:
243                     continue
244
245                 if not n.can_node_serve_ip(ip):
246                     continue
247
248                 num_capable += 1
249
250                 num = n.node_ip_coverage()
251
252                 if maxnode == -1 or num > maxnum:
253                     maxnode = i
254                     maxnum = num
255
256                 if minnode == -1 or num < minnum:
257                     minnode = i
258                     minnum = num
259             
260             if maxnode == -1:
261                 continue
262
263             i = maxnum - minnum
264             if maxnum - minnum < 2:
265                 i = 0
266             imbalance = max([imbalance, i])
267
268         return imbalance
269
270     def diff(self):
271         """Calculate differences in IP assignments between self and prev.
272
273         Gratuitous IP moves (from a healthy node to a healthy node)
274         are prefix by !!.  Any gratuitous IP moves cause this function
275         to return False.  If there are no gratuitous moves then it
276         will return True."""
277
278         ip_moves = 0
279         grat_ip_moves = 0
280         details = []
281
282         for (new, n) in enumerate(self.nodes):
283             for ip in n.current_addresses:
284                 old = self.prev.find_pnn_with_ip(ip)
285                 if old != new:
286                     ip_moves += 1
287                     if old != -1 and \
288                             self.prev.nodes[new].healthy and \
289                             self.nodes[new].healthy and \
290                             self.nodes[old].healthy and \
291                             self.prev.nodes[old].healthy:
292                         prefix = "!!"
293                         grat_ip_moves += 1
294                     else:
295                         prefix = "  "
296                     details.append("%s %s: %d -> %d" %
297                                    (prefix, ip, old, new))
298
299         return (ip_moves, grat_ip_moves, details)
300                     
301     def find_least_loaded_node(self, ip):
302         """Just like find_takeover_node but doesn't care about health."""
303         pnn = -1
304         min = 0
305         for (i, n) in enumerate(self.nodes):
306             if not n.can_node_serve_ip(ip):
307                 continue
308
309             num = n.node_ip_coverage()
310
311             if (pnn == -1):
312                 pnn = i
313                 min = num
314             else:
315                 if num < min:
316                     pnn = i
317                     min = num
318
319         if pnn == -1:
320             verbose_print("Could not find node to take over public address %s" % ip)
321             return False
322
323         self.nodes[pnn].current_addresses.add(ip)
324
325         verbose_print("%s -> %d" % (ip, pnn))
326         return True
327
328     def find_takeover_node(self, ip):
329
330         pnn = -1
331         min = 0
332         for (i, n) in enumerate(self.nodes):
333             if not n.healthy:
334                 continue
335
336             if not n.can_node_serve_ip(ip):
337                 continue
338
339             num = n.node_ip_coverage()
340
341             if (pnn == -1):
342                 pnn = i
343                 min = num
344             else:
345                 if num < min:
346                     pnn = i
347                     min = num
348
349         if pnn == -1:
350             verbose_print("Could not find node to take over public address %s" % ip)
351             return False
352
353         self.nodes[pnn].current_addresses.add(ip)
354
355         verbose_print("%s -> %d" % (ip, pnn))
356         return True
357
358     def ctdb_takeover_run(self):
359
360         self.events += 1
361
362         # Don't bother with the num_healthy stuff.  It is an
363         # irrelevant detail.
364
365         # We just keep the allocate IPs in the current_addresses field
366         # of the node.  This needs to readable, not efficient!
367
368         if self.deterministic_public_ips:
369             # Remap everything.
370             addr_list = sorted(list(self.all_public_ips))
371             for (i, ip) in enumerate(addr_list):
372                 if options.hack == 1:
373                     self.quietly_remove_ip(ip)
374                     self.find_least_loaded_node(ip)
375                 elif options.hack == 2:
376                     pnn = i % len(self.nodes)
377                     if ip in self.nodes[pnn].public_addresses:
378                         self.quietly_remove_ip(ip)
379                         # Add addresses to new node.
380                         self.nodes[pnn].current_addresses.add(ip)
381                         verbose_print("%s -> %d" % (ip, pnn))
382                 else:
383                     self.quietly_remove_ip(ip)
384                     # Add addresses to new node.
385                     pnn = i % len(self.nodes)
386                     self.nodes[pnn].current_addresses.add(ip)
387                     verbose_print("%s -> %d" % (ip, pnn))
388
389         # Remove public addresses from unhealthy nodes.
390         for (pnn, n) in enumerate(self.nodes):
391             if not n.healthy:
392                 verbose_print(["%s <- %d" % (ip, pnn)
393                                for ip in n.current_addresses])
394                 n.current_addresses = set()
395
396         # If a node can't serve an assigned address then remove it.
397         for n in self.nodes:
398             verbose_print(["%s <- %d" % (ip, pnn)
399                            for ip in n.current_addresses - n.public_addresses])
400             n.current_addresses &= n.public_addresses
401
402         # We'll only retry the balancing act up to 5 times.
403         retries = 0
404         should_loop = True
405         while should_loop:
406             should_loop = False
407
408             assigned = set([ip for n in self.nodes for ip in n.current_addresses])
409             unassigned = sorted(list(self.all_public_ips - assigned))
410
411             for ip in unassigned:
412                 self.find_takeover_node(ip)
413
414             if self.no_ip_failback:
415                 break
416
417             assigned = sorted([ip
418                                for n in self.nodes
419                                for ip in n.current_addresses])
420             for ip in assigned:
421
422                 maxnode = -1
423                 minnode = -1
424                 for (i, n) in enumerate(self.nodes):
425                     if not n.healthy:
426                         continue
427
428                     if not n.can_node_serve_ip(ip):
429                         continue
430
431                     num = n.node_ip_coverage()
432
433                     if maxnode == -1:
434                         maxnode = i
435                         maxnum = num
436                     else:
437                         if num > maxnum:
438                             maxnode = i
439                             maxnum = num
440                     if minnode == -1:
441                         minnode = i
442                         minnum = num
443                     else:
444                         if num < minnum:
445                             minnode = i
446                             minnum = num
447
448                 if maxnode == -1:
449                     print "Could not maxnode. May not be able to serve ip", ip
450                     continue
451
452                 if self.deterministic_public_ips:
453                     continue
454
455                 if maxnum > minnum + 1 and retries < options.retries:
456                     # Remove the 1st ip from maxnode
457                     t = sorted(list(self.nodes[maxnode].current_addresses))
458                     realloc = t[0]
459                     verbose_print("%s <- %d" % (realloc, maxnode))
460                     self.nodes[maxnode].current_addresses.remove(realloc)
461                     retries += 1
462                     # Redo the outer loop.
463                     should_loop = True
464                     break
465
466     def recover(self):
467         verbose_begin("TAKEOVER")
468
469         self.ctdb_takeover_run()
470
471         verbose_end()
472
473         grat_ip_moves = 0
474
475         if self.prev is not None:
476             (ip_moves, grat_ip_moves, details) = self.diff()
477             self.ip_moves.append(ip_moves)
478             self.grat_ip_moves.append(grat_ip_moves)
479
480             if options.diff:
481                 print_begin("DIFF")
482                 print "\n".join(details)
483                 print_end()
484
485         imbalance = self.calculate_imbalance()
486         self.imbalance.append(imbalance)
487         if options.balance:
488             print_begin("IMBALANCE")
489             print imbalance
490             print_end()
491
492         num_unhealthy = len(self.nodes) - \
493             len([n for n in self.nodes if n.healthy])
494         self.num_unhealthy.append(num_unhealthy)
495
496         if options.show:
497             print_begin("STATE")
498             print self
499             print_end()
500
501         self.prev = None
502         self.prev = copy.deepcopy(self)
503
504         return grat_ip_moves