3 # ctdb ip takeover code
5 # Copyright (C) Martin Schwenke 2010
7 # Based on original CTDB C code:
9 # Copyright (C) Ronnie Sahlberg 2007
10 # Copyright (C) Andrew Tridgell 2007
12 # This program is free software; you can redistribute it and/or modify
13 # it under the terms of the GNU General Public License as published by
14 # the Free Software Foundation; either version 3 of the License, or
15 # (at your option) any later version.
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # GNU General Public License for more details.
22 # You should have received a copy of the GNU General Public License
23 # along with this program; if not, see <http://www.gnu.org/licenses/>.
28 # Use optparse since newer argparse not available in RHEL5/EPEL.
29 from optparse import OptionParser
35 def process_args(extra_options=[]):
38 parser = OptionParser(option_list=extra_options)
40 parser.add_option("--nd",
41 action="store_false", dest="deterministic_public_ips",
43 help="turn off deterministic_public_ips")
44 parser.add_option("--ni",
45 action="store_true", dest="no_ip_failback", default=False,
46 help="turn on no_ip_failback")
47 parser.add_option("-b", "--balance",
48 action="store_true", dest="balance", default=False,
49 help="show (im)balance information after each event")
50 parser.add_option("-d", "--diff",
51 action="store_true", dest="diff", default=False,
52 help="show IP address movements for each event")
53 parser.add_option("-n", "--no-print",
54 action="store_false", dest="show", default=True,
55 help="don't show IP address layout after each event")
56 parser.add_option("-v", "--verbose",
57 action="store_true", dest="verbose", default=False,
58 help="print information and actions taken to stdout")
59 parser.add_option("--hack",
60 action="store", type="int", dest="hack", default=0,
61 help="apply a hack (see the code!!!)")
62 parser.add_option("-r", "--retries",
63 action="store", type="int", dest="retries", default=5,
64 help="number of retry loops for rebalancing [default: %default]")
65 parser.add_option("-i", "--iterations",
66 action="store", type="int", dest="iterations",
68 help="number of iterations to run in test [default: %default]")
69 parser.add_option("-o", "--odds",
70 action="store", type="int", dest="odds", default=4,
71 help="make the chances of a failover 1 in ODDS [default: %default]")
73 def seed_callback(option, opt, value, parser):
75 parser.add_option("-s", "--seed",
76 action="callback", type="int", callback=seed_callback,
77 help="initial random number seed for random events")
79 parser.add_option("-x", "--exit",
80 action="store_true", dest="exit", default=False,
81 help="exit on the 1st gratuitous IP move")
83 (options, args) = parser.parse_args()
86 parser.error("too many argumentss")
103 def verbose_print(t):
105 if not type(t) == list:
108 print "\n".join([str(i) for i in t])
112 def __init__(self, public_addresses):
113 self.public_addresses = set(public_addresses)
114 self.current_addresses = set()
# Report whether this node may host ip: the result is true exactly when
# ip is a member of the node's public_addresses set.
117 def can_node_serve_ip(self, ip):
118 return ip in self.public_addresses
# Return the number of addresses this node currently holds, i.e. the
# size of its current_addresses set.
120 def node_ip_coverage(self):
121 return len(self.current_addresses)
123 class Cluster(object):
126 self.deterministic_public_ips = options.deterministic_public_ips
127 self.no_ip_failback = options.no_ip_failback
128 self.all_public_ips = set()
132 self.grat_ip_moves = []
135 self.num_unhealthy = []
140 return "\n".join(["%2d %s %s" %
142 "*" if len(n.public_addresses) == 0 else \
143 (" " if n.healthy else "#"),
144 sorted(list(n.current_addresses)))
145 for (i, n) in enumerate(self.nodes)])
# Print a summary of the run under a "STATISTICS" banner: the event
# count, the totals of the ip_moves and grat_ip_moves lists, the largest
# and the most recent entries of the imbalance list, and the largest
# entry of num_unhealthy.
# NOTE(review): max() and the [-1] index both raise on an empty list, so
# this presumably expects at least one recorded event -- confirm against
# the callers.
147 def print_statistics(self):
148 print_begin("STATISTICS")
149 print "Events: %6d" % self.events
150 print "Total IP moves: %6d" % sum(self.ip_moves)
151 print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
152 print "Max imbalance: %6d" % max(self.imbalance)
153 print "Final imbalance: %6d" % self.imbalance[-1]
154 print "Maximum unhealthy: %6d" % max(self.num_unhealthy)
157 def find_pnn_with_ip(self, ip):
158 for (i, n) in enumerate(self.nodes):
159 if ip in n.current_addresses:
163 def quietly_remove_ip(self, ip):
164 # Remove address from old node.
165 old = self.find_pnn_with_ip(ip)
167 self.nodes[old].current_addresses.remove(ip)
# Register node with the cluster: append it to self.nodes and merge its
# public_addresses into the cluster-wide all_public_ips set.
169 def add_node(self, node):
170 self.nodes.append(node)
171 self.all_public_ips |= node.public_addresses
173 def healthy(self, *pnns):
174 verbose_begin("HEALTHY")
177 self.nodes[pnn].healthy = True
182 def unhealthy(self, *pnns):
184 verbose_begin("UNHEALTHY")
187 self.nodes[pnn].healthy = False
192 def do_something_random(self):
195 """Make a random node healthy or unhealthy.
197 If all nodes are healthy or unhealthy, then invert one of
198 them. Otherwise, there's a 1 in options.odds chance of making
199 another node unhealthy."""
201 num_nodes = len(self.nodes)
202 healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
203 num_healthy = len(healthy_pnns)
205 if num_nodes == num_healthy:
206 self.unhealthy(random.randint(0, num_nodes-1))
207 elif num_healthy == 0:
208 self.healthy(random.randint(0, num_nodes-1))
209 elif random.randint(1, options.odds) == 1:
210 self.unhealthy(random.choice(healthy_pnns))
212 all_pnns = range(num_nodes)
213 unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
214 self.healthy(random.choice(unhealthy_pnns))
216 def random_iterations(self):
218 while i <= options.iterations:
219 verbose_begin("EVENT %d" % i)
221 self.do_something_random()
222 if self.recover() and options.exit > 0:
226 self.print_statistics()
228 def calculate_imbalance(self):
232 assigned = sorted([ip
234 for ip in n.current_addresses])
241 for (i, n) in enumerate(self.nodes):
245 if not n.can_node_serve_ip(ip):
250 num = n.node_ip_coverage()
252 if maxnode == -1 or num > maxnum:
256 if minnode == -1 or num < minnum:
264 if maxnum - minnum < 2:
266 imbalance = max([imbalance, i])
271 """Calculate differences in IP assignments between self and prev.
273 Gratuitous IP moves (from a healthy node to a healthy node)
274 are prefixed by !!. Any gratuitous IP moves cause this function
275 to return False. If there are no gratuitous moves then it
282 for (new, n) in enumerate(self.nodes):
283 for ip in n.current_addresses:
284 old = self.prev.find_pnn_with_ip(ip)
288 self.prev.nodes[new].healthy and \
289 self.nodes[new].healthy and \
290 self.nodes[old].healthy and \
291 self.prev.nodes[old].healthy:
296 details.append("%s %s: %d -> %d" %
297 (prefix, ip, old, new))
299 return (ip_moves, grat_ip_moves, details)
301 def find_least_loaded_node(self, ip):
302 """Just like find_takeover_node but doesn't care about health."""
305 for (i, n) in enumerate(self.nodes):
306 if not n.can_node_serve_ip(ip):
309 num = n.node_ip_coverage()
320 verbose_print("Could not find node to take over public address %s" % ip)
323 self.nodes[pnn].current_addresses.add(ip)
325 verbose_print("%s -> %d" % (ip, pnn))
328 def find_takeover_node(self, ip):
332 for (i, n) in enumerate(self.nodes):
336 if not n.can_node_serve_ip(ip):
339 num = n.node_ip_coverage()
350 verbose_print("Could not find node to take over public address %s" % ip)
353 self.nodes[pnn].current_addresses.add(ip)
355 verbose_print("%s -> %d" % (ip, pnn))
358 def ctdb_takeover_run(self):
362 # Don't bother with the num_healthy stuff. It is an
365 # We just keep the allocated IPs in the current_addresses field
366 # of the node. This needs to be readable, not efficient!
368 if self.deterministic_public_ips:
370 addr_list = sorted(list(self.all_public_ips))
371 for (i, ip) in enumerate(addr_list):
372 if options.hack == 1:
373 self.quietly_remove_ip(ip)
374 self.find_least_loaded_node(ip)
375 elif options.hack == 2:
376 pnn = i % len(self.nodes)
377 if ip in self.nodes[pnn].public_addresses:
378 self.quietly_remove_ip(ip)
379 # Add addresses to new node.
380 self.nodes[pnn].current_addresses.add(ip)
381 verbose_print("%s -> %d" % (ip, pnn))
383 self.quietly_remove_ip(ip)
384 # Add addresses to new node.
385 pnn = i % len(self.nodes)
386 self.nodes[pnn].current_addresses.add(ip)
387 verbose_print("%s -> %d" % (ip, pnn))
389 # Remove public addresses from unhealthy nodes.
390 for (pnn, n) in enumerate(self.nodes):
392 verbose_print(["%s <- %d" % (ip, pnn)
393 for ip in n.current_addresses])
394 n.current_addresses = set()
396 # If a node can't serve an assigned address then remove it.
398 verbose_print(["%s <- %d" % (ip, pnn)
399 for ip in n.current_addresses - n.public_addresses])
400 n.current_addresses &= n.public_addresses
402 # We'll only retry the balancing act up to options.retries times (default 5).
408 assigned = set([ip for n in self.nodes for ip in n.current_addresses])
409 unassigned = sorted(list(self.all_public_ips - assigned))
411 for ip in unassigned:
412 self.find_takeover_node(ip)
414 if self.no_ip_failback:
417 assigned = sorted([ip
419 for ip in n.current_addresses])
424 for (i, n) in enumerate(self.nodes):
428 if not n.can_node_serve_ip(ip):
431 num = n.node_ip_coverage()
449 print "Could not maxnode. May not be able to serve ip", ip
452 if self.deterministic_public_ips:
455 if maxnum > minnum + 1 and retries < options.retries:
456 # Remove the 1st ip from maxnode
457 t = sorted(list(self.nodes[maxnode].current_addresses))
459 verbose_print("%s <- %d" % (realloc, maxnode))
460 self.nodes[maxnode].current_addresses.remove(realloc)
462 # Redo the outer loop.
467 verbose_begin("TAKEOVER")
469 self.ctdb_takeover_run()
475 if self.prev is not None:
476 (ip_moves, grat_ip_moves, details) = self.diff()
477 self.ip_moves.append(ip_moves)
478 self.grat_ip_moves.append(grat_ip_moves)
482 print "\n".join(details)
485 imbalance = self.calculate_imbalance()
486 self.imbalance.append(imbalance)
488 print_begin("IMBALANCE")
492 num_unhealthy = len(self.nodes) - \
493 len([n for n in self.nodes if n.healthy])
494 self.num_unhealthy.append(num_unhealthy)
502 self.prev = copy.deepcopy(self)