3 # ctdb ip takeover code
5 # Copyright (C) Ronnie Sahlberg 2007
6 # Copyright (C) Andrew Tridgell 2007
8 # Python version (C) Martin Schwenke 2010
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either version 3 of the License, or
13 # (at your option) any later version.
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
20 # You should have received a copy of the GNU General Public License
21 # along with this program; if not, see <http://www.gnu.org/licenses/>.
26 from optparse import OptionParser
31 usage = "usage: %prog [options]"
33 parser = OptionParser(usage=usage)
35 parser.add_option("--nd",
36 action="store_false", dest="deterministic_public_ips",
38 help="turn off deterministic_public_ips")
39 parser.add_option("--ni",
40 action="store_true", dest="no_ip_failback", default=False,
41 help="turn on no_ip_failback")
42 parser.add_option("-v", "--verbose",
43 action="store_true", dest="verbose", default=False,
44 help="print information and actions taken to stdout")
45 parser.add_option("-d", "--diff",
46 action="store_true", dest="diff", default=False,
47 help="after each recovery show IP address movements")
48 parser.add_option("-n", "--no-print",
49 action="store_false", dest="show", default=True,
50 help="after each recovery don't print IP address layout")
51 parser.add_option("--hack",
52 action="store", type="int", dest="hack", default=0,
53 help="apply a hack (see the code!!!)")
54 parser.add_option("-r", "--retries",
55 action="store", type="int", dest="retries", default=5,
56 help="number of retry loops for rebalancing")
57 parser.add_option("-i", "--iterations",
58 action="store", type="int", dest="iterations",
60 help="number of iterations to run in test")
61 parser.add_option("-b", "--balance",
62 action="store_true", dest="balance", default=False,
63 help="show (im)balance information")
64 parser.add_option("-x", "--exit",
65 action="store_true", dest="exit", default=False,
66 help="exit on the 1st gratuitous IP move")
68 (options, args) = parser.parse_args()
71 parser.error("too many argumentss")
92 if not type(t) == list:
95 print "\n".join([str(i) for i in t])
99 def __init__(self, public_addresses):
100 self.public_addresses = set(public_addresses)
101 self.current_addresses = set()
104 def can_node_serve_ip(self, ip):
# Return True when `ip` is a member of this node's public_addresses set,
# i.e. the node is configured to host that address; False otherwise.
105 return ip in self.public_addresses
107 def node_ip_coverage(self):
# Return how many addresses are currently assigned to this node
# (the size of the current_addresses set).
108 return len(self.current_addresses)
110 class Cluster(object):
115 self.deterministic_public_ips = options.deterministic_public_ips
116 self.no_ip_failback = options.no_ip_failback
117 self.all_public_ips = set()
120 self.grat_ip_moves = []
125 return "\n".join(["%2d %s %s" %
127 "*" if len(n.public_addresses) == 0 else \
128 (" " if n.healthy else "#"),
129 sorted(list(n.current_addresses)))
130 for (i, n) in enumerate(self.nodes)])
132 def print_statistics(self):
# Print a summary of the run to stdout under a "STATISTICS" heading.
# ip_moves, grat_ip_moves and imbalance are per-recovery lists (one entry is
# appended to each after every takeover run), so totals are sums over them.
133 print_begin("STATISTICS")
134 print "Events: %6d" % self.events
135 print "Total IP moves: %6d" % sum(self.ip_moves)
136 print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
# Worst imbalance seen across all recoveries, then the most recent one.
# NOTE(review): max() and [-1] raise on an empty list - presumably this is
# only called after at least one recovery; confirm against callers.
137 print "Max imbalance: %6d" % max(self.imbalance)
138 print "Final imbalance: %6d" % self.imbalance[-1]
141 def find_pnn_with_ip(self, ip):
142 for (i, n) in enumerate(self.nodes):
143 if ip in n.current_addresses:
147 def quietly_remove_ip(self, ip):
148 # Remove address from old node.
149 old = self.find_pnn_with_ip(ip)
151 self.nodes[old].current_addresses.remove(ip)
153 def add_node(self, node):
# Append `node` to the cluster's node list; its position in self.nodes is
# what the rest of the code uses as the node number (pnn).
154 self.nodes.append(node)
# Keep the cluster-wide set of public addresses as the union of every
# node's public_addresses.
155 self.all_public_ips |= node.public_addresses
157 def healthy(self, *pnns):
160 verbose_begin("HEALTHY")
163 self.nodes[pnn].healthy = True
168 def unhealthy(self, *pnns):
170 verbose_begin("UNHEALTHY")
173 self.nodes[pnn].healthy = False
178 def do_something_random(self):
181 """Make a random node healthy or unhealthy.
183 If all nodes are healthy or unhealthy, then invert one of
184 them. Otherwise, there's a 1/4 chance of making another node
187 num_nodes = len(self.nodes)
188 healthy_nodes = [n for n in self.nodes if n.healthy]
189 num_healthy = len(healthy_nodes)
191 if num_nodes == num_healthy:
192 self.unhealthy(random.randint(0, num_nodes-1))
193 elif num_healthy == 0:
194 self.healthy(random.randint(0, num_nodes-1))
195 elif random.randint(1, 4) == 1:
196 self.unhealthy(self.nodes.index(random.choice(healthy_nodes)))
198 self.healthy(self.nodes.index(random.choice(list(set(self.nodes) - set(healthy_nodes)))))
200 def random_iterations(self):
202 while i <= options.iterations:
203 verbose_begin("EVENT %d" % i)
205 self.do_something_random()
206 if self.recover() and options.exit > 0:
210 self.print_statistics()
212 def calculate_imbalance(self):
216 assigned = sorted([ip
218 for ip in n.current_addresses])
225 for (i, n) in enumerate(self.nodes):
229 if not n.can_node_serve_ip(ip):
234 num = n.node_ip_coverage()
236 if maxnode == -1 or num > maxnum:
240 if minnode == -1 or num < minnum:
248 if maxnum - minnum < 2:
250 imbalance = max([imbalance, i])
254 def diff(self, prev):
255 """Calculate differences in IP assignments between self and prev.
257 Gratuitous IP moves (from a healthy node to a healthy node)
258 are prefixed by !!. Any gratuitous IP moves cause this function
259 to return False. If there are no gratuitous moves then it
266 for (new, n) in enumerate(self.nodes):
267 for ip in n.current_addresses:
268 old = prev.find_pnn_with_ip(ip)
272 prev.nodes[new].healthy and \
273 self.nodes[new].healthy and \
274 self.nodes[old].healthy and \
275 prev.nodes[old].healthy:
280 details.append("%s %s: %d -> %d" %
281 (prefix, ip, old, new))
283 return (ip_moves, grat_ip_moves, details)
285 def find_least_loaded_node(self, ip):
286 """Just like find_takeover_node but doesn't care about health."""
289 for (i, n) in enumerate(self.nodes):
290 if not n.can_node_serve_ip(ip):
293 num = n.node_ip_coverage()
304 verbose_print("Could not find node to take over public address %s" % ip)
307 self.nodes[pnn].current_addresses.add(ip)
309 verbose_print("%s -> %d" % (ip, pnn))
312 def find_takeover_node(self, ip):
316 for (i, n) in enumerate(self.nodes):
320 if not n.can_node_serve_ip(ip):
323 num = n.node_ip_coverage()
334 verbose_print("Could not find node to take over public address %s" % ip)
337 self.nodes[pnn].current_addresses.add(ip)
339 verbose_print("%s -> %d" % (ip, pnn))
342 def ctdb_takeover_run(self):
348 # Don't bother with the num_healthy stuff. It is an
351 # We just keep the allocated IPs in the current_addresses field
352 # of the node. This needs to be readable, not efficient!
354 if self.deterministic_public_ips:
356 addr_list = sorted(list(self.all_public_ips))
357 for (i, ip) in enumerate(addr_list):
358 if options.hack == 1:
359 self.quietly_remove_ip(ip)
360 self.find_least_loaded_node(ip)
361 elif options.hack == 2:
362 pnn = i % len(self.nodes)
363 if ip in self.nodes[pnn].public_addresses:
364 self.quietly_remove_ip(ip)
365 # Add addresses to new node.
366 self.nodes[pnn].current_addresses.add(ip)
367 verbose_print("%s -> %d" % (ip, pnn))
369 self.quietly_remove_ip(ip)
370 # Add addresses to new node.
371 pnn = i % len(self.nodes)
372 self.nodes[pnn].current_addresses.add(ip)
373 verbose_print("%s -> %d" % (ip, pnn))
375 # Remove public addresses from unhealthy nodes.
376 for (pnn, n) in enumerate(self.nodes):
378 verbose_print(["%s <- %d" % (ip, pnn)
379 for ip in n.current_addresses])
380 n.current_addresses = set()
382 # If a node can't serve an assigned address then remove it.
384 verbose_print(["%s <- %d" % (ip, pnn)
385 for ip in n.current_addresses - n.public_addresses])
386 n.current_addresses &= n.public_addresses
388 # We'll only retry the balancing act up to options.retries times (default 5).
394 assigned = set([ip for n in self.nodes for ip in n.current_addresses])
395 unassigned = sorted(list(self.all_public_ips - assigned))
397 for ip in unassigned:
398 self.find_takeover_node(ip)
400 if self.no_ip_failback:
403 assigned = sorted([ip
405 for ip in n.current_addresses])
410 for (i, n) in enumerate(self.nodes):
414 if not n.can_node_serve_ip(ip):
417 num = n.node_ip_coverage()
435 print "Could not maxnode. May not be able to serve ip", ip
438 if self.deterministic_public_ips:
441 if maxnum > minnum + 1 and retries < options.retries:
442 # Remove the 1st ip from maxnode
443 t = sorted(list(self.nodes[maxnode].current_addresses))
445 verbose_print("%s <- %d" % (realloc, maxnode))
446 self.nodes[maxnode].current_addresses.remove(realloc)
448 # Redo the outer loop.
455 verbose_begin("TAKEOVER")
457 self.ctdb_takeover_run()
464 (ip_moves, grat_ip_moves, details) = self.diff(prev)
465 self.ip_moves.append(ip_moves)
466 self.grat_ip_moves.append(grat_ip_moves)
470 print "\n".join(details)
473 imbalance = self.calculate_imbalance()
474 self.imbalance.append(imbalance)
476 print_begin("IMBALANCE")
485 prev = copy.deepcopy(self)
490 ############################################################
494 options = process_args()