ctdb-takeover: Add takeover helper
[samba.git] / ctdb / server / ctdb_takeover_helper.c
1 /*
2    CTDB IP takeover helper
3
4    Copyright (C) Martin Schwenke  2016
5
6    Based on ctdb_recovery_helper.c
7    Copyright (C) Amitay Isaacs  2015
8
9    and ctdb_takeover.c
10    Copyright (C) Ronnie Sahlberg  2007
11    Copyright (C) Andrew Tridgell  2007
12    Copyright (C) Martin Schwenke  2011
13
14    This program is free software; you can redistribute it and/or modify
15    it under the terms of the GNU General Public License as published by
16    the Free Software Foundation; either version 3 of the License, or
17    (at your option) any later version.
18
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License for more details.
23
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28 #include "replace.h"
29 #include "system/network.h"
30 #include "system/filesys.h"
31
32 #include <popt.h>
33 #include <talloc.h>
34 #include <tevent.h>
35
36 #include "lib/util/debug.h"
37 #include "lib/util/strv.h"
38 #include "lib/util/strv_util.h"
39 #include "lib/util/sys_rw.h"
40 #include "lib/util/time.h"
41 #include "lib/util/tevent_unix.h"
42
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
45 #include "client/client.h"
46
47 #include "common/logging.h"
48
49 #include "server/ipalloc.h"
50
/* Timeout, in seconds, applied to individual controls.  Starts at 9
 * and is replaced by the takeover_timeout tunable once the tunables
 * have been fetched (see takeover_tunables_done()). */
static int takeover_timeout = 9;

/* Evaluates timeval_current_ofs() with the current value of
 * takeover_timeout each time it is used */
#define TIMEOUT()       timeval_current_ofs(takeover_timeout, 0)
54
55 /*
56  * Utility functions
57  */
58
/*
 * Common receive function for requests that carry no result data.
 *
 * Returns true when the request did not finish with a unix error.
 * Otherwise returns false and, if perr is not NULL, stores the error
 * in *perr.
 */
static bool generic_recv(struct tevent_req *req, int *perr)
{
        int unix_err;
        bool failed;

        failed = tevent_req_is_unix_error(req, &unix_err);
        if (! failed) {
                return true;
        }

        if (perr != NULL) {
                *perr = unix_err;
        }

        return false;
}
72
73 static enum ipalloc_algorithm
74 determine_algorithm(const struct ctdb_tunable_list *tunables)
75 {
76         switch (tunables->ip_alloc_algorithm) {
77         case 0:
78                 return IPALLOC_DETERMINISTIC;
79         case 1:
80                 return IPALLOC_NONDETERMINISTIC;
81         case 2:
82                 return IPALLOC_LCP2;
83         default:
84                 return IPALLOC_LCP2;
85         };
86 }
87
88 /**********************************************************************/
89
/* State for fetching public IP lists from a set of nodes */
struct get_public_ips_state {
        struct tevent_context *ev;
        struct ctdb_client_context *client;
        uint32_t *pnns;         /* nodes to query, as passed by the caller */
        int count;              /* number of entries in pnns */
        struct ctdb_public_ip_list *ips; /* results, indexed by PNN */
};

static void get_public_ips_done(struct tevent_req *subreq);
99
100 static struct tevent_req *get_public_ips_send(
101                                 TALLOC_CTX *mem_ctx,
102                                 struct tevent_context *ev,
103                                 struct ctdb_client_context *client,
104                                 uint32_t *pnns,
105                                 int count,
106                                 bool available_only)
107 {
108         struct tevent_req *req, *subreq;
109         struct get_public_ips_state *state;
110         struct ctdb_req_control request;
111
112         req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
113         if (req == NULL) {
114                 return tevent_req_post(req, ev);
115         }
116
117         state->pnns = pnns;
118         state->count = count;
119         state->ips = NULL;
120
121         ctdb_req_control_get_public_ips(&request, available_only);
122         subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
123                                                 state->pnns,
124                                                 state->count,
125                                                 TIMEOUT(), &request);
126         if (tevent_req_nomem(subreq, req)) {
127                 return tevent_req_post(req, ev);
128         }
129         tevent_req_set_callback(subreq, get_public_ips_done, req);
130
131         return req;
132 }
133
134 static void get_public_ips_done(struct tevent_req *subreq)
135 {
136         struct tevent_req *req = tevent_req_callback_data(
137                 subreq, struct tevent_req);
138         struct get_public_ips_state *state = tevent_req_data(
139                 req, struct get_public_ips_state);
140         struct ctdb_reply_control **reply;
141         int *err_list;
142         int ret, i;
143         bool status;
144
145         status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
146                                                 &reply);
147         TALLOC_FREE(subreq);
148         if (! status) {
149                 int ret2;
150                 uint32_t pnn;
151
152                 ret2 = ctdb_client_control_multi_error(state->pnns,
153                                                        state->count,
154                                                        err_list, &pnn);
155                 if (ret2 != 0) {
156                         D_ERR("control GET_PUBLIC_IPS failed on "
157                               "node %u, ret=%d\n", pnn, ret2);
158                 } else {
159                         D_ERR("control GET_PUBLIC_IPS failed, "
160                               "ret=%d\n", ret);
161                 }
162                 tevent_req_error(req, ret);
163                 return;
164         }
165
166         state->ips = talloc_zero_array(state, struct ctdb_public_ip_list,
167                                        state->count);
168         if (tevent_req_nomem(state->ips, req)) {
169                 return;
170         }
171
172         for (i = 0; i < state->count; i++) {
173                 uint32_t pnn;
174                 struct ctdb_public_ip_list *ips;
175
176                 pnn = state->pnns[i];
177                 ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
178                                                         &ips);
179                 if (ret != 0) {
180                         D_ERR("control GET_PUBLIC_IPS failed on "
181                               "node %u\n", pnn);
182                         tevent_req_error(req, EIO);
183                         return;
184                 }
185                 state->ips[pnn] = *ips;
186         }
187
188         talloc_free(reply);
189
190         tevent_req_done(req);
191 }
192
193 static bool get_public_ips_recv(struct tevent_req *req, int *perr,
194                                 TALLOC_CTX *mem_ctx,
195                                 struct ctdb_public_ip_list **ips)
196 {
197         struct get_public_ips_state *state = tevent_req_data(
198                 req, struct get_public_ips_state);
199         int err;
200
201         if (tevent_req_is_unix_error(req, &err)) {
202                 if (perr != NULL) {
203                         *perr = err;
204                 }
205                 return false;
206         }
207
208         *ips = talloc_steal(mem_ctx, state->ips);
209
210         return true;
211 }
212
213 /**********************************************************************/
214
/* Overall state for sending RELEASE_IP controls for a list of IPs */
struct release_ip_state {
        int num_sent;           /* per-IP multi-node controls started */
        int num_replies;        /* per-IP controls that have completed */
        int num_fails;          /* per-IP controls that failed */
        int err_any;            /* error recorded from a failure */
        uint32_t *ban_credits;  /* caller's failure counters, by PNN */
};

/* State for the RELEASE_IP controls sent for a single IP */
struct release_ip_one_state {
        struct tevent_req *req; /* the overall release_ip request */
        uint32_t *pnns;         /* nodes asked to release this IP */
        int count;              /* number of entries in pnns */
        const char *ip_str;     /* printable address, used in logs */
};

static void release_ip_done(struct tevent_req *subreq);
231
232 static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
233                                           struct tevent_context *ev,
234                                           struct ctdb_client_context *client,
235                                           uint32_t *pnns,
236                                           int count,
237                                           struct timeval timeout,
238                                           struct public_ip_list *all_ips,
239                                           uint32_t *ban_credits)
240 {
241         struct tevent_req *req, *subreq;
242         struct release_ip_state *state;
243         struct ctdb_req_control request;
244         struct public_ip_list *tmp_ip;
245
246         req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
247         if (req == NULL) {
248                 return NULL;
249         }
250
251         state->num_sent = 0;
252         state->num_replies = 0;
253         state->num_fails = 0;
254         state->ban_credits = ban_credits;
255
256         /* Send a RELEASE_IP to all nodes that should not be hosting
257          * each IP.  For each IP, all but one of these will be
258          * redundant.  However, the redundant ones are used to tell
259          * nodes which node should be hosting the IP so that commands
260          * like "ctdb ip" can display a particular nodes idea of who
261          * is hosting what. */
262         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
263                 struct release_ip_one_state *substate;
264                 struct ctdb_public_ip ip;
265                 int i;
266
267                 substate = talloc_zero(state, struct release_ip_one_state);
268                 if (tevent_req_nomem(substate, req)) {
269                         return tevent_req_post(req, ev);
270                 }
271
272                 substate->pnns = talloc_zero_array(substate, uint32_t, count);
273                 if (tevent_req_nomem(substate->pnns, req)) {
274                         return tevent_req_post(req, ev);
275                 }
276
277                 substate->count = 0;
278                 substate->req = req;
279
280                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
281                                                              &tmp_ip->addr);
282                 if (tevent_req_nomem(substate->ip_str, req)) {
283                         return tevent_req_post(req, ev);
284                 }
285
286                 for (i = 0; i < count; i++) {
287                         uint32_t pnn = pnns[i];
288                         /* If pnn is not the node that should be
289                          * hosting the IP then add it to the list of
290                          * nodes that need to do a release. */
291                         if (tmp_ip->pnn != pnn) {
292                                 substate->pnns[substate->count] = pnn;
293                                 substate->count++;
294                         }
295                 }
296
297                 ip.pnn = tmp_ip->pnn;
298                 ip.addr = tmp_ip->addr;
299                 ctdb_req_control_release_ip(&request, &ip);
300                 subreq = ctdb_client_control_multi_send(state, ev, client,
301                                                         substate->pnns,
302                                                         substate->count,
303                                                         timeout,/* cumulative */
304                                                         &request);
305                 if (tevent_req_nomem(subreq, req)) {
306                         return tevent_req_post(req, ev);
307                 }
308                 tevent_req_set_callback(subreq, release_ip_done, substate);
309
310                 state->num_sent++;
311         }
312
313         return req;
314 }
315
316 static void release_ip_done(struct tevent_req *subreq)
317 {
318         struct release_ip_one_state *substate = tevent_req_callback_data(
319                 subreq, struct release_ip_one_state);
320         struct tevent_req *req = substate->req;
321         struct release_ip_state *state = tevent_req_data(
322                 req, struct release_ip_state);
323         int ret, i;
324         int *err_list;
325         bool status, found_errors;
326
327         status = ctdb_client_control_multi_recv(subreq, &ret, state,
328                                                 &err_list, NULL);
329         TALLOC_FREE(subreq);
330
331         if (status) {
332                 D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
333                        substate->ip_str, substate->count);
334                 goto done;
335         }
336
337         /* Get some clear error messages out of err_list and count
338          * banning credits
339          */
340         found_errors = false;
341         for (i = 0; i < substate->count; i++) {
342                 int err = err_list[i];
343                 if (err != 0) {
344                         uint32_t pnn = substate->pnns[i];
345
346                         D_ERR("RELEASE_IP %s failed on node %u, "
347                               "ret=%d\n", substate->ip_str, pnn, err);
348
349                         state->ban_credits[pnn]++;
350                         state->err_any = err;
351                         found_errors = true;
352                 }
353         }
354         if (! found_errors) {
355                 D_ERR("RELEASE_IP %s internal error, ret=%d\n",
356                       substate->ip_str, ret);
357                 state->err_any = EIO;
358         }
359
360         state->num_fails++;
361
362 done:
363         talloc_free(substate);
364
365         state->num_replies++;
366
367         if (state->num_replies < state->num_sent) {
368                 /* Not all replies received, don't go further */
369                 return;
370         }
371
372         if (state->num_fails > 0) {
373                 tevent_req_error(req, state->err_any);
374                 return;
375         }
376
377         tevent_req_done(req);
378 }
379
/* Collect the result of release_ip_send().  Return value and perr are
 * as for generic_recv(). */
static bool release_ip_recv(struct tevent_req *req, int *perr)
{
        return generic_recv(req, perr);
}
384
385 /**********************************************************************/
386
/* Overall state for sending TAKEOVER_IP controls for a list of IPs */
struct take_ip_state {
        int num_sent;           /* controls started */
        int num_replies;        /* controls that have completed */
        int num_fails;          /* controls that failed */
        int err_any;            /* error recorded from a failure */
        uint32_t *ban_credits;  /* caller's failure counters, by PNN */
};

/* State for the TAKEOVER_IP control sent for a single IP */
struct take_ip_one_state {
        struct tevent_req *req; /* the overall take_ip request */
        uint32_t pnn;           /* node asked to take the IP */
        const char *ip_str;     /* printable address, used in logs */
};

static void take_ip_done(struct tevent_req *subreq);
402
403 static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
404                                        struct tevent_context *ev,
405                                        struct ctdb_client_context *client,
406                                        struct timeval timeout,
407                                        struct public_ip_list *all_ips,
408                                        uint32_t *ban_credits)
409 {
410         struct tevent_req *req, *subreq;
411         struct take_ip_state *state;
412         struct ctdb_req_control request;
413         struct public_ip_list *tmp_ip;
414
415         req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
416         if (req == NULL) {
417                 return NULL;
418         }
419
420         state->num_sent = 0;
421         state->num_replies = 0;
422         state->num_fails = 0;
423         state->ban_credits = ban_credits;
424
425         /* For each IP, send a TAKOVER_IP to the node that should be
426          * hosting it.  Many of these will often be redundant (since
427          * the allocation won't have changed) but they can be useful
428          * to recover from inconsistencies. */
429         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
430                 struct take_ip_one_state *substate;
431                 struct ctdb_public_ip ip;
432
433                 if (tmp_ip->pnn == -1) {
434                         /* IP will be unassigned */
435                         continue;
436                 }
437
438                 substate = talloc_zero(state, struct take_ip_one_state);
439                 if (tevent_req_nomem(substate, req)) {
440                         return tevent_req_post(req, ev);
441                 }
442
443                 substate->req = req;
444                 substate->pnn = tmp_ip->pnn;
445
446                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
447                                                              &tmp_ip->addr);
448                 if (tevent_req_nomem(substate->ip_str, req)) {
449                         return tevent_req_post(req, ev);
450                 }
451
452                 ip.pnn = tmp_ip->pnn;
453                 ip.addr = tmp_ip->addr;
454                 ctdb_req_control_takeover_ip(&request, &ip);
455                 subreq = ctdb_client_control_send(
456                                         state, ev, client, tmp_ip->pnn,
457                                         timeout, /* cumulative */
458                                         &request);
459                 if (tevent_req_nomem(subreq, req)) {
460                         return tevent_req_post(req, ev);
461                 }
462                 tevent_req_set_callback(subreq, take_ip_done, substate);
463
464                 state->num_sent++;
465         }
466
467         /* None sent, finished... */
468         if (state->num_sent == 0) {
469                 tevent_req_done(req);
470                 return tevent_req_post(req, ev);
471         }
472
473         return req;
474 }
475
476 static void take_ip_done(struct tevent_req *subreq)
477 {
478         struct take_ip_one_state *substate = tevent_req_callback_data(
479                 subreq, struct take_ip_one_state);
480         struct tevent_req *req = substate->req;
481         struct ctdb_reply_control *reply;
482         struct take_ip_state *state = tevent_req_data(
483                 req, struct take_ip_state);
484         int ret = 0;
485         bool status;
486
487         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
488         TALLOC_FREE(subreq);
489
490         if (! status) {
491                 D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
492                       substate->ip_str, substate->pnn, ret);
493                 goto fail;
494         }
495
496         ret = ctdb_reply_control_takeover_ip(reply);
497         if (ret != 0) {
498                 D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
499                       substate->ip_str, substate->pnn, ret);
500                 goto fail;
501         }
502
503         D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
504                substate->ip_str, substate->pnn);
505         goto done;
506
507 fail:
508         state->ban_credits[substate->pnn]++;
509         state->num_fails++;
510         state->err_any = ret;
511
512 done:
513         talloc_free(substate);
514
515         state->num_replies++;
516
517         if (state->num_replies < state->num_sent) {
518                 /* Not all replies received, don't go further */
519                 return;
520         }
521
522         if (state->num_fails > 0) {
523                 tevent_req_error(req, state->err_any);
524                 return;
525         }
526
527         tevent_req_done(req);
528 }
529
/* Collect the result of take_ip_send().  Return value and perr are as
 * for generic_recv(). */
static bool take_ip_recv(struct tevent_req *req, int *perr)
{
        return generic_recv(req, perr);
}
534
535 /**********************************************************************/
536
/* State for sending the IPREALLOCATED control to a set of nodes */
struct ipreallocated_state {
        uint32_t *pnns;         /* nodes the control is sent to */
        int count;              /* number of entries in pnns */
        uint32_t *ban_credits;  /* caller's failure counters, by PNN */
};

static void ipreallocated_done(struct tevent_req *subreq);
544
545 static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
546                                              struct tevent_context *ev,
547                                              struct ctdb_client_context *client,
548                                              uint32_t *pnns,
549                                              int count,
550                                              struct timeval timeout,
551                                              uint32_t *ban_credits)
552 {
553         struct tevent_req *req, *subreq;
554         struct ipreallocated_state *state;
555         struct ctdb_req_control request;
556
557         req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
558         if (req == NULL) {
559                 return NULL;
560         }
561
562         state->pnns = pnns;
563         state->count = count;
564         state->ban_credits = ban_credits;
565
566         ctdb_req_control_ipreallocated(&request);
567         subreq = ctdb_client_control_multi_send(state, ev, client,
568                                                 pnns, count,
569                                                 timeout, /* cumulative */
570                                                 &request);
571         if (tevent_req_nomem(subreq, req)) {
572                 return tevent_req_post(req, ev);
573         }
574         tevent_req_set_callback(subreq, ipreallocated_done, req);
575
576         return req;
577 }
578
579 static void ipreallocated_done(struct tevent_req *subreq)
580 {
581         struct tevent_req *req = tevent_req_callback_data(
582                 subreq, struct tevent_req);
583         struct ipreallocated_state *state = tevent_req_data(
584                 req, struct ipreallocated_state);
585         int *err_list = NULL;
586         int ret, i;
587         bool status, found_errors;
588
589         status = ctdb_client_control_multi_recv(subreq, &ret, state,
590                                                 &err_list, NULL);
591         TALLOC_FREE(subreq);
592
593         if (status) {
594                 D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
595                 tevent_req_done(req);
596                 return;
597         }
598
599         /* Get some clear error messages out of err_list and count
600          * banning credits
601          */
602         found_errors = false;
603         for (i = 0; i < state->count; i++) {
604                 int err = err_list[i];
605                 if (err != 0) {
606                         uint32_t pnn = state->pnns[i];
607
608                         D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
609                               pnn, err);
610
611                         state->ban_credits[pnn]++;
612                         found_errors = true;
613                 }
614         }
615
616         if (! found_errors) {
617                 D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
618         }
619
620         tevent_req_error(req, ret);
621 }
622
/* Collect the result of ipreallocated_send().  Return value and perr
 * are as for generic_recv(). */
static bool ipreallocated_recv(struct tevent_req *req, int *perr)
{
        return generic_recv(req, perr);
}
627
628 /**********************************************************************/
629
630 /*
631  * Recalculate the allocation of public IPs to nodes and have the
632  * nodes host their allocated addresses.
633  *
634  * - Get tunables
635  * - Get nodemap
636  * - Initialise IP allocation state.  Pass:
637  *   + algorithm to be used;
638  *   + various tunables (NoIPTakeover, NoIPFailback, NoIPHostOnAllDisabled)
639  *   + list of nodes to force rebalance (internal structure, currently
640  *     no way to fetch, only used by LCP2 for nodes that have had new
641  *     IP addresses added).
642  * - Set IP flags for IP allocation based on node map
643  * - Retrieve known and available IP addresses (done separately so
644  *   values can be faked in unit testing)
645  * - Use ipalloc_set_public_ips() to set known and available IP
646  *   addresses for allocation
647  * - If cluster can't host IP addresses then jump to IPREALLOCATED
648  * - Run IP allocation algorithm
649  * - Send RELEASE_IP to all nodes for IPs they should not host
650  * - Send TAKE_IP to all nodes for IPs they should host
651  * - Send IPREALLOCATED to all nodes
652  */
653
/* State carried through the whole takeover run */
struct takeover_state {
        struct tevent_context *ev;
        struct ctdb_client_context *client;
        /* Set in takeover_nodemap_done() to 3 * takeover_timeout
         * seconds from the time the node map was received */
        struct timeval timeout;
        int num_nodes;                  /* number of nodes in the node map */
        uint32_t *pnns_connected;       /* from list_of_connected_nodes() */
        int num_connected;              /* entries in pnns_connected */
        uint32_t *pnns_active;          /* from list_of_active_nodes() */
        int num_active;                 /* entries in pnns_active */
        /* PNN of the client's own node; the tunables and node map
         * are fetched from it */
        uint32_t destnode;
        /* Passed through to ipalloc_state_init() */
        uint32_t *force_rebalance_nodes;
        struct ctdb_tunable_list *tun_list;     /* fetched tunables */
        struct ipalloc_state *ipalloc_state;
        /* Public IPs fetched with available_only == false */
        struct ctdb_public_ip_list *known_ips;
        /* NOTE(review): not assigned in the code visible here -
         * presumably the result of the IP allocation; confirm */
        struct public_ip_list *all_ips;
        /* num_nodes counters, indexed by PNN, incremented for nodes
         * that fail a control */
        uint32_t *ban_credits;
};

static void takeover_tunables_done(struct tevent_req *subreq);
static void takeover_nodemap_done(struct tevent_req *subreq);
static void takeover_known_ips_done(struct tevent_req *subreq);
static void takeover_avail_ips_done(struct tevent_req *subreq);
static void takeover_release_ip_done(struct tevent_req *subreq);
static void takeover_take_ip_done(struct tevent_req *subreq);
static void takeover_ipreallocated(struct tevent_req *req);
static void takeover_ipreallocated_done(struct tevent_req *subreq);
static void takeover_failed(struct tevent_req *subreq, int ret);
static void takeover_failed_done(struct tevent_req *subreq);
682
683 static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
684                                         struct tevent_context *ev,
685                                         struct ctdb_client_context *client,
686                                         uint32_t *force_rebalance_nodes)
687 {
688         struct tevent_req *req, *subreq;
689         struct takeover_state *state;
690         struct ctdb_req_control request;
691
692         req = tevent_req_create(mem_ctx, &state, struct takeover_state);
693         if (req == NULL) {
694                 return NULL;
695         }
696
697         state->ev = ev;
698         state->client = client;
699         state->force_rebalance_nodes = force_rebalance_nodes;
700         state->destnode = ctdb_client_pnn(client);
701
702         ctdb_req_control_get_all_tunables(&request);
703         subreq = ctdb_client_control_send(state, state->ev, state->client,
704                                           state->destnode, TIMEOUT(),
705                                           &request);
706         if (tevent_req_nomem(subreq, req)) {
707                 return tevent_req_post(req, ev);
708         }
709         tevent_req_set_callback(subreq, takeover_tunables_done, req);
710
711         return req;
712 }
713
714 static void takeover_tunables_done(struct tevent_req *subreq)
715 {
716         struct tevent_req *req = tevent_req_callback_data(
717                 subreq, struct tevent_req);
718         struct takeover_state *state = tevent_req_data(
719                 req, struct takeover_state);
720         struct ctdb_reply_control *reply;
721         struct ctdb_req_control request;
722         int ret;
723         bool status;
724
725         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
726         TALLOC_FREE(subreq);
727         if (! status) {
728                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
729                 tevent_req_error(req, ret);
730                 return;
731         }
732
733         ret = ctdb_reply_control_get_all_tunables(reply, state,
734                                                   &state->tun_list);
735         if (ret != 0) {
736                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
737                 tevent_req_error(req, ret);
738                 return;
739         }
740
741         talloc_free(reply);
742
743         takeover_timeout = state->tun_list->takeover_timeout;
744
745         ctdb_req_control_get_nodemap(&request);
746         subreq = ctdb_client_control_send(state, state->ev, state->client,
747                                           state->destnode, TIMEOUT(),
748                                           &request);
749         if (tevent_req_nomem(subreq, req)) {
750                 return;
751         }
752         tevent_req_set_callback(subreq, takeover_nodemap_done, req);
753 }
754
755 static void takeover_nodemap_done(struct tevent_req *subreq)
756 {
757         struct tevent_req *req = tevent_req_callback_data(
758                 subreq, struct tevent_req);
759         struct takeover_state *state = tevent_req_data(
760                 req, struct takeover_state);
761         struct ctdb_reply_control *reply;
762         bool status;
763         int ret;
764         struct ctdb_node_map *nodemap;
765
766         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
767         TALLOC_FREE(subreq);
768         if (! status) {
769                 D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
770                         state->destnode, ret);
771                 tevent_req_error(req, ret);
772                 return;
773         }
774
775         ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
776         if (ret != 0) {
777                 D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
778                 tevent_req_error(req, ret);
779                 return;
780         }
781
782         state->num_nodes = nodemap->num;
783
784         state->num_connected = list_of_connected_nodes(nodemap,
785                                                        CTDB_UNKNOWN_PNN, state,
786                                                        &state->pnns_connected);
787         if (state->num_connected <= 0) {
788                 tevent_req_error(req, ENOMEM);
789                 return;
790         }
791
792         state->num_active = list_of_active_nodes(nodemap,
793                                                  CTDB_UNKNOWN_PNN, state,
794                                                  &state->pnns_active);
795         if (state->num_active <= 0) {
796                 tevent_req_error(req, ENOMEM);
797                 return;
798         }
799
800         /* Default timeout for early jump to IPREALLOCATED.  See below
801          * for explanation of 3 times...
802          */
803         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
804
805         state->ban_credits = talloc_zero_array(state, uint32_t,
806                                                state->num_nodes);
807         if (tevent_req_nomem(state->ban_credits, req)) {
808                 return;
809         }
810
811         if (state->tun_list->disable_ip_failover != 0) {
812                 /* IP failover is completely disabled so just send out
813                  * ipreallocated event.
814                  */
815                 takeover_ipreallocated(req);
816                 return;
817         }
818
819         state->ipalloc_state =
820                 ipalloc_state_init(
821                         state, state->num_nodes,
822                         determine_algorithm(state->tun_list),
823                         (state->tun_list->no_ip_takeover != 0),
824                         (state->tun_list->no_ip_failback != 0),
825                         (state->tun_list->no_ip_host_on_all_disabled != 0),
826                         state->force_rebalance_nodes);
827         if (tevent_req_nomem(state->ipalloc_state, req)) {
828                 return;
829         }
830
831         ipalloc_set_node_flags(state->ipalloc_state, nodemap);
832
833         subreq = get_public_ips_send(state, state->ev, state->client,
834                                      state->pnns_active, state->num_active,
835                                      false);
836         if (tevent_req_nomem(subreq, req)) {
837                 return;
838         }
839
840         tevent_req_set_callback(subreq, takeover_known_ips_done, req);
841 }
842
843 static void takeover_known_ips_done(struct tevent_req *subreq)
844 {
845         struct tevent_req *req = tevent_req_callback_data(
846                 subreq, struct tevent_req);
847         struct takeover_state *state = tevent_req_data(
848                 req, struct takeover_state);
849         int ret;
850         bool status;
851
852         status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
853         TALLOC_FREE(subreq);
854
855         if (! status) {
856                 D_ERR("Failed to fetch known public IPs\n");
857                 tevent_req_error(req, ret);
858                 return;
859         }
860
861         subreq = get_public_ips_send(state, state->ev, state->client,
862                                      state->pnns_active, state->num_active,
863                                      true);
864         if (tevent_req_nomem(subreq, req)) {
865                 return;
866         }
867
868         tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
869 }
870
871 static void takeover_avail_ips_done(struct tevent_req *subreq)
872 {
873         struct tevent_req *req = tevent_req_callback_data(
874                 subreq, struct tevent_req);
875         struct takeover_state *state = tevent_req_data(
876                 req, struct takeover_state);
877         bool status;
878         int ret;
879         struct ctdb_public_ip_list *available_ips;
880
881         status = get_public_ips_recv(subreq, &ret, state, &available_ips);
882         TALLOC_FREE(subreq);
883
884         if (! status) {
885                 D_ERR("Failed to fetch available public IPs\n");
886                 tevent_req_error(req, ret);
887                 return;
888         }
889
890         ipalloc_set_public_ips(state->ipalloc_state,
891                                state->known_ips, available_ips);
892
893         if (! ipalloc_can_host_ips(state->ipalloc_state)) {
894                 D_NOTICE("No nodes available to host public IPs yet\n");
895                 takeover_ipreallocated(req);
896                 return;
897         }
898
899         /* Do the IP reassignment calculations */
900         state->all_ips = ipalloc(state->ipalloc_state);
901         if (tevent_req_nomem(state->all_ips, req)) {
902                 return;
903         }
904
905         /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
906          * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
907          * seconds.  However, RELEASE_IP can take longer due to TCP
908          * connection killing, so sometimes needs more time.
909          * Therefore, use a cumulative timeout of TakeoverTimeout * 3
910          * seconds across all 3 stages.  No explicit expiry checks are
911          * needed before each stage because tevent is smart enough to
912          * fire the timeouts even if they are in the past.  Initialise
913          * this here so it explicitly covers the stages we're
914          * interested in but, in particular, not the time taken by the
915          * ipalloc().
916          */
917         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
918
919         subreq = release_ip_send(state, state->ev, state->client,
920                                  state->pnns_connected, state->num_connected,
921                                  state->timeout, state->all_ips,
922                                  state->ban_credits);
923         if (tevent_req_nomem(subreq, req)) {
924                 return;
925         }
926         tevent_req_set_callback(subreq, takeover_release_ip_done, req);
927 }
928
929 static void takeover_release_ip_done(struct tevent_req *subreq)
930 {
931         struct tevent_req *req = tevent_req_callback_data(
932                 subreq, struct tevent_req);
933         struct takeover_state *state = tevent_req_data(
934                 req, struct takeover_state);
935         int ret;
936         bool status;
937
938         status = release_ip_recv(subreq, &ret);
939         TALLOC_FREE(subreq);
940
941         if (! status) {
942                 takeover_failed(req, ret);
943                 return;
944         }
945
946         /* All released, now for takeovers */
947
948         subreq = take_ip_send(state, state->ev, state->client,
949                               state->timeout, state->all_ips,
950                               state->ban_credits);
951         if (tevent_req_nomem(subreq, req)) {
952                 return;
953         }
954         tevent_req_set_callback(subreq, takeover_take_ip_done, req);
955 }
956
957 static void takeover_take_ip_done(struct tevent_req *subreq)
958 {
959         struct tevent_req *req = tevent_req_callback_data(
960                 subreq, struct tevent_req);
961         int ret = 0;
962         bool status;
963
964         status = take_ip_recv(subreq, &ret);
965         TALLOC_FREE(subreq);
966
967         if (! status) {
968                 takeover_failed(req, ret);
969                 return;
970         }
971
972         takeover_ipreallocated(req);
973 }
974
975 static void takeover_ipreallocated(struct tevent_req *req)
976 {
977         struct takeover_state *state = tevent_req_data(
978                 req, struct takeover_state);
979         struct tevent_req *subreq;
980
981         subreq = ipreallocated_send(state, state->ev, state->client,
982                                     state->pnns_connected,
983                                     state->num_connected,
984                                     state->timeout,
985                                     state->ban_credits);
986         if (tevent_req_nomem(subreq, req)) {
987                 return;
988         }
989         tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
990 }
991
992 static void takeover_ipreallocated_done(struct tevent_req *subreq)
993 {
994         struct tevent_req *req = tevent_req_callback_data(
995                 subreq, struct tevent_req);
996         int ret;
997         bool status;
998
999         status = ipreallocated_recv(subreq, &ret);
1000         TALLOC_FREE(subreq);
1001
1002         if (! status) {
1003                 takeover_failed(req, ret);
1004                 return;
1005         }
1006
1007         tevent_req_done(req);
1008 }
1009
/*
 * Context kept by takeover_failed() while its banning message is in
 * flight, so that takeover_failed_done() can finish the job.
 */
struct takeover_failed_state {
        /* Takeover request to fail once the message has completed */
        struct tevent_req *req;
        /* Error code of the failed stage, reported via req */
        int ret;
};
1014
1015 void takeover_failed(struct tevent_req *req, int ret)
1016 {
1017         struct takeover_state *state = tevent_req_data(
1018                 req, struct takeover_state);
1019         struct tevent_req *subreq;
1020         uint32_t max_pnn = CTDB_UNKNOWN_PNN;
1021         int max_credits = 0;
1022         int pnn;
1023
1024         /* Check that bans are enabled */
1025         if (state->tun_list->enable_bans == 0) {
1026                 tevent_req_error(req, ret);
1027                 return;
1028         }
1029
1030         for (pnn = 0; pnn < state->num_nodes; pnn++) {
1031                 if (state->ban_credits[pnn] > max_credits) {
1032                         max_pnn = pnn;
1033                         max_credits = state->ban_credits[pnn];
1034                 }
1035         }
1036
1037         if (max_credits > 0) {
1038                 struct ctdb_req_message message;
1039                 struct takeover_failed_state *substate;
1040
1041                 D_WARNING("Assigning banning credits to node %u\n", max_pnn);
1042
1043                 substate = talloc_zero(state, struct takeover_failed_state);
1044                 if (tevent_req_nomem(substate, req)) {
1045                         return;
1046                 }
1047                 substate->req = req;
1048                 substate->ret = ret;
1049
1050                 message.srvid = CTDB_SRVID_BANNING;
1051                 message.data.pnn = max_pnn;
1052
1053                 subreq = ctdb_client_message_send(
1054                         state, state->ev, state->client,
1055                         ctdb_client_pnn(state->client),
1056                         &message);
1057                 if (subreq == NULL) {
1058                         D_ERR("failed to assign banning credits\n");
1059                         tevent_req_error(req, ret);
1060                         return;
1061                 }
1062                 tevent_req_set_callback(subreq, takeover_failed_done, substate);
1063         } else {
1064                 tevent_req_error(req, ret);
1065         }
1066 }
1067
1068 static void takeover_failed_done(struct tevent_req *subreq)
1069 {
1070         struct takeover_failed_state *substate = tevent_req_callback_data(
1071                 subreq, struct takeover_failed_state);
1072         struct tevent_req *req = substate->req;
1073         int ret;
1074         bool status;
1075
1076         status = ctdb_client_message_recv(subreq, &ret);
1077         TALLOC_FREE(subreq);
1078         if (! status) {
1079                 D_ERR("failed to assign banning credits, ret=%d\n", ret);
1080         }
1081
1082         ret = substate->ret;
1083         talloc_free(substate);
1084         tevent_req_error(req, ret);
1085 }
1086
/*
 * Collect the outcome of a takeover request.  This is a thin wrapper
 * around generic_recv() that discards its boolean result, so the only
 * information the caller gets back is whatever generic_recv() stores
 * through perr.
 * NOTE(review): generic_recv() visibly writes *perr on error; whether
 * it also writes it on success is not visible here, so callers should
 * initialise *perr themselves - confirm.
 */
static void takeover_recv(struct tevent_req *req, int *perr)
{
        (void) generic_recv(req, perr);
}
1091
1092 static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
1093 {
1094         char *strv = NULL;
1095         int num, i, ret;
1096         char *t;
1097         uint32_t *nodes;
1098
1099         ret = strv_split(mem_ctx, &strv, s, ",");
1100         if (ret != 0) {
1101                 D_ERR("out of memory\n");
1102                 return NULL;
1103         }
1104
1105         num = strv_count(strv);
1106
1107         nodes = talloc_array(mem_ctx, uint32_t, num);
1108         if (nodes == NULL) {
1109                 D_ERR("out of memory\n");
1110                 return NULL;
1111         }
1112
1113         t = NULL;
1114         for (i = 0; i < num; i++) {
1115                 t = strv_next(strv, t);
1116                 nodes[i] = atoi(t);
1117         }
1118
1119         return nodes;
1120 }
1121
/*
 * Print the command line synopsis for this helper to stderr.
 */
static void usage(const char *progname)
{
        fprintf(stderr,
                "\nUsage: %s "
                "<output-fd> <ctdb-socket-path> [<force-rebalance-nodes>]\n",
                progname);
}
1129
1130 /*
1131  * Arguments - write fd, socket path
1132  */
1133 int main(int argc, const char *argv[])
1134 {
1135         int write_fd;
1136         const char *sockpath;
1137         TALLOC_CTX *mem_ctx;
1138         struct tevent_context *ev;
1139         struct ctdb_client_context *client;
1140         int ret;
1141         struct tevent_req *req;
1142         uint32_t *force_rebalance_nodes = NULL;
1143
1144         if (argc < 3 || argc > 4) {
1145                 usage(argv[0]);
1146                 exit(1);
1147         }
1148
1149         write_fd = atoi(argv[1]);
1150         sockpath = argv[2];
1151
1152         mem_ctx = talloc_new(NULL);
1153         if (mem_ctx == NULL) {
1154                 fprintf(stderr, "talloc_new() failed\n");
1155                 ret = ENOMEM;
1156                 goto done;
1157         }
1158
1159         if (argc == 4) {
1160                 force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
1161                 if (force_rebalance_nodes == NULL) {
1162                         usage(argv[0]);
1163                         exit(1);
1164                 }
1165         }
1166
1167         logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
1168
1169         ev = tevent_context_init(mem_ctx);
1170         if (ev == NULL) {
1171                 D_ERR("tevent_context_init() failed\n");
1172                 ret = ENOMEM;
1173                 goto done;
1174         }
1175
1176         ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
1177         if (ret != 0) {
1178                 D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
1179                 goto done;
1180         }
1181
1182         req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
1183         if (req == NULL) {
1184                 D_ERR("takeover_send() failed\n");
1185                 ret = 1;
1186                 goto done;
1187         }
1188
1189         if (! tevent_req_poll(req, ev)) {
1190                 D_ERR("tevent_req_poll() failed\n");
1191                 ret = 1;
1192                 goto done;
1193         }
1194
1195         takeover_recv(req, &ret);
1196         TALLOC_FREE(req);
1197         if (ret != 0) {
1198                 D_ERR("takeover run failed, ret=%d\n", ret);
1199         }
1200
1201 done:
1202         sys_write_v(write_fd, &ret, sizeof(ret));
1203
1204         talloc_free(mem_ctx);
1205         return ret;
1206 }