796d45cc4a742e3760b7c0d4bd532546b3f5661f
[sfrench/samba-autobuild/.git] / ctdb / server / ctdb_takeover_helper.c
1 /*
2    CTDB IP takeover helper
3
4    Copyright (C) Martin Schwenke  2016
5
6    Based on ctdb_recovery_helper.c
7    Copyright (C) Amitay Isaacs  2015
8
9    and ctdb_takeover.c
10    Copyright (C) Ronnie Sahlberg  2007
11    Copyright (C) Andrew Tridgell  2007
12    Copyright (C) Martin Schwenke  2011
13
14    This program is free software; you can redistribute it and/or modify
15    it under the terms of the GNU General Public License as published by
16    the Free Software Foundation; either version 3 of the License, or
17    (at your option) any later version.
18
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License for more details.
23
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28 #include "replace.h"
29 #include "system/network.h"
30 #include "system/filesys.h"
31
32 #include <popt.h>
33 #include <talloc.h>
34 #include <tevent.h>
35
36 #include "lib/util/debug.h"
37 #include "lib/util/strv.h"
38 #include "lib/util/strv_util.h"
39 #include "lib/util/sys_rw.h"
40 #include "lib/util/time.h"
41 #include "lib/util/tevent_unix.h"
42
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
45 #include "client/client.h"
46
47 #include "common/logging.h"
48
49 #include "server/ipalloc.h"
50
/*
 * Timeout, in seconds, applied to individual controls.  Starts out as
 * 9 and is replaced by the takeover_timeout tunable once the tunables
 * have been fetched.
 */
static int takeover_timeout = 9;

/* A timeval that lies takeover_timeout seconds after the current time */
#define TIMEOUT()	timeval_current_ofs(takeover_timeout, 0)
54
55 /*
56  * Utility functions
57  */
58
/*
 * Shared receive helper for requests that produce no result data.
 *
 * Returns true when req did not finish with a unix error.  Otherwise
 * returns false and, if perr is not NULL, stores the error in *perr.
 */
static bool generic_recv(struct tevent_req *req, int *perr)
{
	int unix_err;
	bool failed = tevent_req_is_unix_error(req, &unix_err);

	if (failed && perr != NULL) {
		*perr = unix_err;
	}

	return !failed;
}
72
73 static enum ipalloc_algorithm
74 determine_algorithm(const struct ctdb_tunable_list *tunables)
75 {
76         switch (tunables->ip_alloc_algorithm) {
77         case 0:
78                 return IPALLOC_DETERMINISTIC;
79         case 1:
80                 return IPALLOC_NONDETERMINISTIC;
81         case 2:
82                 return IPALLOC_LCP2;
83         default:
84                 return IPALLOC_LCP2;
85         };
86 }
87
88 /**********************************************************************/
89
/* Request state for fetching public IP lists from a set of nodes */
struct get_public_ips_state {
	/* PNNs being queried; only the caller's pointer is stored */
	uint32_t *pnns;
	/* Number of entries in pnns */
	int count;
	/* Result array, one slot per node, filled in at index PNN */
	struct ctdb_public_ip_list *ips;
};
95
96 static void get_public_ips_done(struct tevent_req *subreq);
97
98 static struct tevent_req *get_public_ips_send(
99                                 TALLOC_CTX *mem_ctx,
100                                 struct tevent_context *ev,
101                                 struct ctdb_client_context *client,
102                                 uint32_t *pnns,
103                                 int count, int num_nodes,
104                                 bool available_only)
105 {
106         struct tevent_req *req, *subreq;
107         struct get_public_ips_state *state;
108         struct ctdb_req_control request;
109
110         req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
111         if (req == NULL) {
112                 return NULL;
113         }
114
115         state->pnns = pnns;
116         state->count = count;
117
118         state->ips  = talloc_zero_array(state,
119                                         struct ctdb_public_ip_list,
120                                         num_nodes);
121         if (tevent_req_nomem(state->ips, req)) {
122                 return tevent_req_post(req, ev);
123         }
124
125         ctdb_req_control_get_public_ips(&request, available_only);
126         subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
127                                                 state->pnns,
128                                                 state->count,
129                                                 TIMEOUT(), &request);
130         if (tevent_req_nomem(subreq, req)) {
131                 return tevent_req_post(req, ev);
132         }
133         tevent_req_set_callback(subreq, get_public_ips_done, req);
134
135         return req;
136 }
137
138 static void get_public_ips_done(struct tevent_req *subreq)
139 {
140         struct tevent_req *req = tevent_req_callback_data(
141                 subreq, struct tevent_req);
142         struct get_public_ips_state *state = tevent_req_data(
143                 req, struct get_public_ips_state);
144         struct ctdb_reply_control **reply;
145         int *err_list;
146         int ret, i;
147         bool status;
148
149         status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
150                                                 &reply);
151         TALLOC_FREE(subreq);
152         if (! status) {
153                 int ret2;
154                 uint32_t pnn;
155
156                 ret2 = ctdb_client_control_multi_error(state->pnns,
157                                                        state->count,
158                                                        err_list, &pnn);
159                 if (ret2 != 0) {
160                         D_ERR("control GET_PUBLIC_IPS failed on "
161                               "node %u, ret=%d\n", pnn, ret2);
162                 } else {
163                         D_ERR("control GET_PUBLIC_IPS failed, "
164                               "ret=%d\n", ret);
165                 }
166                 tevent_req_error(req, ret);
167                 return;
168         }
169
170         for (i = 0; i < state->count; i++) {
171                 uint32_t pnn;
172                 struct ctdb_public_ip_list *ips;
173
174                 pnn = state->pnns[i];
175                 ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
176                                                         &ips);
177                 if (ret != 0) {
178                         D_ERR("control GET_PUBLIC_IPS failed on "
179                               "node %u\n", pnn);
180                         tevent_req_error(req, EIO);
181                         return;
182                 }
183                 state->ips[pnn] = *ips;
184         }
185
186         talloc_free(reply);
187
188         tevent_req_done(req);
189 }
190
191 static bool get_public_ips_recv(struct tevent_req *req, int *perr,
192                                 TALLOC_CTX *mem_ctx,
193                                 struct ctdb_public_ip_list **ips)
194 {
195         struct get_public_ips_state *state = tevent_req_data(
196                 req, struct get_public_ips_state);
197         int err;
198
199         if (tevent_req_is_unix_error(req, &err)) {
200                 if (perr != NULL) {
201                         *perr = err;
202                 }
203                 return false;
204         }
205
206         *ips = talloc_steal(mem_ctx, state->ips);
207
208         return true;
209 }
210
211 /**********************************************************************/
212
/* Overall state of a RELEASE_IP round covering all public IPs */
struct release_ip_state {
	/* Number of multi-controls sent (at most one per IP) */
	int num_sent;
	/* Number of those that have completed so far */
	int num_replies;
	/* Number of those that completed with a failure */
	int num_fails;
	/* Error recorded from a failure; reported when the round ends */
	int err_any;
	/* Caller's PNN-indexed counters, bumped for each failing node */
	uint32_t *ban_credits;
};
220
/* State of the RELEASE_IP multi-control for a single public IP */
struct release_ip_one_state {
	/* The overall release_ip request this control belongs to */
	struct tevent_req *req;
	/* Nodes the control was sent to */
	uint32_t *pnns;
	/* Number of entries in pnns */
	int count;
	/* Printable form of the address, used in log messages */
	const char *ip_str;
};
227
228 static void release_ip_done(struct tevent_req *subreq);
229
230 static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
231                                           struct tevent_context *ev,
232                                           struct ctdb_client_context *client,
233                                           uint32_t *pnns,
234                                           int count,
235                                           struct timeval timeout,
236                                           struct public_ip_list *all_ips,
237                                           uint32_t *ban_credits)
238 {
239         struct tevent_req *req, *subreq;
240         struct release_ip_state *state;
241         struct ctdb_req_control request;
242         struct public_ip_list *tmp_ip;
243
244         req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
245         if (req == NULL) {
246                 return NULL;
247         }
248
249         state->num_sent = 0;
250         state->num_replies = 0;
251         state->num_fails = 0;
252         state->ban_credits = ban_credits;
253
254         /* Send a RELEASE_IP to all nodes that should not be hosting
255          * each IP.  For each IP, all but one of these will be
256          * redundant.  However, the redundant ones are used to tell
257          * nodes which node should be hosting the IP so that commands
258          * like "ctdb ip" can display a particular nodes idea of who
259          * is hosting what. */
260         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
261                 struct release_ip_one_state *substate;
262                 struct ctdb_public_ip ip;
263                 int i;
264
265                 substate = talloc_zero(state, struct release_ip_one_state);
266                 if (tevent_req_nomem(substate, req)) {
267                         return tevent_req_post(req, ev);
268                 }
269
270                 substate->pnns = talloc_zero_array(substate, uint32_t, count);
271                 if (tevent_req_nomem(substate->pnns, req)) {
272                         return tevent_req_post(req, ev);
273                 }
274
275                 substate->count = 0;
276                 substate->req = req;
277
278                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
279                                                              &tmp_ip->addr);
280                 if (tevent_req_nomem(substate->ip_str, req)) {
281                         return tevent_req_post(req, ev);
282                 }
283
284                 for (i = 0; i < count; i++) {
285                         uint32_t pnn = pnns[i];
286                         /* If pnn is not the node that should be
287                          * hosting the IP then add it to the list of
288                          * nodes that need to do a release. */
289                         if (tmp_ip->pnn != pnn) {
290                                 substate->pnns[substate->count] = pnn;
291                                 substate->count++;
292                         }
293                 }
294
295                 if (substate->count == 0) {
296                         /* No releases to send for this address... */
297                         TALLOC_FREE(substate);
298                         continue;
299                 }
300
301                 ip.pnn = tmp_ip->pnn;
302                 ip.addr = tmp_ip->addr;
303                 ctdb_req_control_release_ip(&request, &ip);
304                 subreq = ctdb_client_control_multi_send(state, ev, client,
305                                                         substate->pnns,
306                                                         substate->count,
307                                                         timeout,/* cumulative */
308                                                         &request);
309                 if (tevent_req_nomem(subreq, req)) {
310                         return tevent_req_post(req, ev);
311                 }
312                 tevent_req_set_callback(subreq, release_ip_done, substate);
313
314                 state->num_sent++;
315         }
316
317         /* None sent, finished... */
318         if (state->num_sent == 0) {
319                 tevent_req_done(req);
320                 return tevent_req_post(req, ev);
321         }
322
323         return req;
324 }
325
326 static void release_ip_done(struct tevent_req *subreq)
327 {
328         struct release_ip_one_state *substate = tevent_req_callback_data(
329                 subreq, struct release_ip_one_state);
330         struct tevent_req *req = substate->req;
331         struct release_ip_state *state = tevent_req_data(
332                 req, struct release_ip_state);
333         int ret, i;
334         int *err_list;
335         bool status, found_errors;
336
337         status = ctdb_client_control_multi_recv(subreq, &ret, state,
338                                                 &err_list, NULL);
339         TALLOC_FREE(subreq);
340
341         if (status) {
342                 D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
343                        substate->ip_str, substate->count);
344                 goto done;
345         }
346
347         /* Get some clear error messages out of err_list and count
348          * banning credits
349          */
350         found_errors = false;
351         for (i = 0; i < substate->count; i++) {
352                 int err = err_list[i];
353                 if (err != 0) {
354                         uint32_t pnn = substate->pnns[i];
355
356                         D_ERR("RELEASE_IP %s failed on node %u, "
357                               "ret=%d\n", substate->ip_str, pnn, err);
358
359                         state->ban_credits[pnn]++;
360                         state->err_any = err;
361                         found_errors = true;
362                 }
363         }
364         if (! found_errors) {
365                 D_ERR("RELEASE_IP %s internal error, ret=%d\n",
366                       substate->ip_str, ret);
367                 state->err_any = EIO;
368         }
369
370         state->num_fails++;
371
372 done:
373         talloc_free(substate);
374
375         state->num_replies++;
376
377         if (state->num_replies < state->num_sent) {
378                 /* Not all replies received, don't go further */
379                 return;
380         }
381
382         if (state->num_fails > 0) {
383                 tevent_req_error(req, state->err_any);
384                 return;
385         }
386
387         tevent_req_done(req);
388 }
389
/*
 * Collect the outcome of a RELEASE_IP round.  There is no result data,
 * so this only reports success or failure; see generic_recv().
 */
static bool release_ip_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
394
395 /**********************************************************************/
396
/* Overall state of a TAKEOVER_IP round covering all public IPs */
struct take_ip_state {
	/* Number of controls sent (one per assigned IP) */
	int num_sent;
	/* Number of those that have completed so far */
	int num_replies;
	/* Number of those that completed with a failure */
	int num_fails;
	/* Error recorded from a failure; reported when the round ends */
	int err_any;
	/* Caller's PNN-indexed counters, bumped for each failing node */
	uint32_t *ban_credits;
};
404
/* State of the TAKEOVER_IP control for a single public IP */
struct take_ip_one_state {
	/* The overall take_ip request this control belongs to */
	struct tevent_req *req;
	/* Node the control was sent to */
	uint32_t pnn;
	/* Printable form of the address, used in log messages */
	const char *ip_str;
};
410
411 static void take_ip_done(struct tevent_req *subreq);
412
413 static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
414                                        struct tevent_context *ev,
415                                        struct ctdb_client_context *client,
416                                        struct timeval timeout,
417                                        struct public_ip_list *all_ips,
418                                        uint32_t *ban_credits)
419 {
420         struct tevent_req *req, *subreq;
421         struct take_ip_state *state;
422         struct ctdb_req_control request;
423         struct public_ip_list *tmp_ip;
424
425         req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
426         if (req == NULL) {
427                 return NULL;
428         }
429
430         state->num_sent = 0;
431         state->num_replies = 0;
432         state->num_fails = 0;
433         state->ban_credits = ban_credits;
434
435         /* For each IP, send a TAKOVER_IP to the node that should be
436          * hosting it.  Many of these will often be redundant (since
437          * the allocation won't have changed) but they can be useful
438          * to recover from inconsistencies. */
439         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
440                 struct take_ip_one_state *substate;
441                 struct ctdb_public_ip ip;
442
443                 if (tmp_ip->pnn == -1) {
444                         /* IP will be unassigned */
445                         continue;
446                 }
447
448                 substate = talloc_zero(state, struct take_ip_one_state);
449                 if (tevent_req_nomem(substate, req)) {
450                         return tevent_req_post(req, ev);
451                 }
452
453                 substate->req = req;
454                 substate->pnn = tmp_ip->pnn;
455
456                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
457                                                              &tmp_ip->addr);
458                 if (tevent_req_nomem(substate->ip_str, req)) {
459                         return tevent_req_post(req, ev);
460                 }
461
462                 ip.pnn = tmp_ip->pnn;
463                 ip.addr = tmp_ip->addr;
464                 ctdb_req_control_takeover_ip(&request, &ip);
465                 subreq = ctdb_client_control_send(
466                                         state, ev, client, tmp_ip->pnn,
467                                         timeout, /* cumulative */
468                                         &request);
469                 if (tevent_req_nomem(subreq, req)) {
470                         return tevent_req_post(req, ev);
471                 }
472                 tevent_req_set_callback(subreq, take_ip_done, substate);
473
474                 state->num_sent++;
475         }
476
477         /* None sent, finished... */
478         if (state->num_sent == 0) {
479                 tevent_req_done(req);
480                 return tevent_req_post(req, ev);
481         }
482
483         return req;
484 }
485
486 static void take_ip_done(struct tevent_req *subreq)
487 {
488         struct take_ip_one_state *substate = tevent_req_callback_data(
489                 subreq, struct take_ip_one_state);
490         struct tevent_req *req = substate->req;
491         struct ctdb_reply_control *reply;
492         struct take_ip_state *state = tevent_req_data(
493                 req, struct take_ip_state);
494         int ret = 0;
495         bool status;
496
497         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
498         TALLOC_FREE(subreq);
499
500         if (! status) {
501                 D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
502                       substate->ip_str, substate->pnn, ret);
503                 goto fail;
504         }
505
506         ret = ctdb_reply_control_takeover_ip(reply);
507         if (ret != 0) {
508                 D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
509                       substate->ip_str, substate->pnn, ret);
510                 goto fail;
511         }
512
513         D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
514                substate->ip_str, substate->pnn);
515         goto done;
516
517 fail:
518         state->ban_credits[substate->pnn]++;
519         state->num_fails++;
520         state->err_any = ret;
521
522 done:
523         talloc_free(substate);
524
525         state->num_replies++;
526
527         if (state->num_replies < state->num_sent) {
528                 /* Not all replies received, don't go further */
529                 return;
530         }
531
532         if (state->num_fails > 0) {
533                 tevent_req_error(req, state->err_any);
534                 return;
535         }
536
537         tevent_req_done(req);
538 }
539
/*
 * Collect the outcome of a TAKEOVER_IP round.  There is no result
 * data, so this only reports success or failure; see generic_recv().
 */
static bool take_ip_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
544
545 /**********************************************************************/
546
/* Request state for sending IPREALLOCATED to a set of nodes */
struct ipreallocated_state {
	/* Nodes the control is sent to; only the pointer is stored */
	uint32_t *pnns;
	/* Number of entries in pnns */
	int count;
	/* Caller's PNN-indexed counters, bumped for each failing node */
	uint32_t *ban_credits;
};
552
553 static void ipreallocated_done(struct tevent_req *subreq);
554
555 static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
556                                              struct tevent_context *ev,
557                                              struct ctdb_client_context *client,
558                                              uint32_t *pnns,
559                                              int count,
560                                              struct timeval timeout,
561                                              uint32_t *ban_credits)
562 {
563         struct tevent_req *req, *subreq;
564         struct ipreallocated_state *state;
565         struct ctdb_req_control request;
566
567         req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
568         if (req == NULL) {
569                 return NULL;
570         }
571
572         state->pnns = pnns;
573         state->count = count;
574         state->ban_credits = ban_credits;
575
576         ctdb_req_control_ipreallocated(&request);
577         subreq = ctdb_client_control_multi_send(state, ev, client,
578                                                 pnns, count,
579                                                 timeout, /* cumulative */
580                                                 &request);
581         if (tevent_req_nomem(subreq, req)) {
582                 return tevent_req_post(req, ev);
583         }
584         tevent_req_set_callback(subreq, ipreallocated_done, req);
585
586         return req;
587 }
588
589 static void ipreallocated_done(struct tevent_req *subreq)
590 {
591         struct tevent_req *req = tevent_req_callback_data(
592                 subreq, struct tevent_req);
593         struct ipreallocated_state *state = tevent_req_data(
594                 req, struct ipreallocated_state);
595         int *err_list = NULL;
596         int ret, i;
597         bool status, found_errors;
598
599         status = ctdb_client_control_multi_recv(subreq, &ret, state,
600                                                 &err_list, NULL);
601         TALLOC_FREE(subreq);
602
603         if (status) {
604                 D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
605                 tevent_req_done(req);
606                 return;
607         }
608
609         /* Get some clear error messages out of err_list and count
610          * banning credits
611          */
612         found_errors = false;
613         for (i = 0; i < state->count; i++) {
614                 int err = err_list[i];
615                 if (err != 0) {
616                         uint32_t pnn = state->pnns[i];
617
618                         D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
619                               pnn, err);
620
621                         state->ban_credits[pnn]++;
622                         found_errors = true;
623                 }
624         }
625
626         if (! found_errors) {
627                 D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
628         }
629
630         tevent_req_error(req, ret);
631 }
632
/*
 * Collect the outcome of an IPREALLOCATED request.  There is no result
 * data, so this only reports success or failure; see generic_recv().
 */
static bool ipreallocated_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
637
638 /**********************************************************************/
639
640 /*
641  * Recalculate the allocation of public IPs to nodes and have the
642  * nodes host their allocated addresses.
643  *
644  * - Get tunables
645  * - Get nodemap
646  * - Initialise IP allocation state.  Pass:
647  *   + algorithm to be used;
648  *   + various tunables (NoIPTakeover, NoIPFailback, NoIPHostOnAllDisabled)
649  *   + list of nodes to force rebalance (internal structure, currently
650  *     no way to fetch, only used by LCP2 for nodes that have had new
651  *     IP addresses added).
652  * - Set IP flags for IP allocation based on node map
653  * - Retrieve known and available IP addresses (done separately so
654  *   values can be faked in unit testing)
655  * - Use ipalloc_set_public_ips() to set known and available IP
656  *   addresses for allocation
657  * - If cluster can't host IP addresses then jump to IPREALLOCATED
658  * - Run IP allocation algorithm
659  * - Send RELEASE_IP to all nodes for IPs they should not host
660  * - Send TAKE_IP to all nodes for IPs they should host
661  * - Send IPREALLOCATED to all nodes
662  */
663
/* State carried through the chain of steps of a takeover run */
struct takeover_state {
	/* Event context and client connection used for all controls */
	struct tevent_context *ev;
	struct ctdb_client_context *client;
	/* Deadline shared by the controls of a phase ("cumulative") */
	struct timeval timeout;
	/* Number of entries in the node map */
	int num_nodes;
	/* Connected nodes, as derived from the node map */
	uint32_t *pnns_connected;
	int num_connected;
	/* Active nodes, as derived from the node map */
	uint32_t *pnns_active;
	int num_active;
	/* PNN of the node this client is connected to */
	uint32_t destnode;
	/* Nodes to force rebalancing to, handed to the IP allocator */
	uint32_t *force_rebalance_nodes;
	/* Tunables fetched with GET_ALL_TUNABLES */
	struct ctdb_tunable_list *tun_list;
	/* IP allocation state */
	struct ipalloc_state *ipalloc_state;
	/* Known public IPs per node, indexed by PNN */
	struct ctdb_public_ip_list *known_ips;
	/* List of public IPs with their assigned nodes -- presumably
	 * the allocator's result; it is set outside this excerpt */
	struct public_ip_list *all_ips;
	/* Per-node failure counters, one slot per node */
	uint32_t *ban_credits;
};
681
682 static void takeover_tunables_done(struct tevent_req *subreq);
683 static void takeover_nodemap_done(struct tevent_req *subreq);
684 static void takeover_known_ips_done(struct tevent_req *subreq);
685 static void takeover_avail_ips_done(struct tevent_req *subreq);
686 static void takeover_release_ip_done(struct tevent_req *subreq);
687 static void takeover_take_ip_done(struct tevent_req *subreq);
688 static void takeover_ipreallocated(struct tevent_req *req);
689 static void takeover_ipreallocated_done(struct tevent_req *subreq);
690 static void takeover_failed(struct tevent_req *subreq, int ret);
691 static void takeover_failed_done(struct tevent_req *subreq);
692
693 static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
694                                         struct tevent_context *ev,
695                                         struct ctdb_client_context *client,
696                                         uint32_t *force_rebalance_nodes)
697 {
698         struct tevent_req *req, *subreq;
699         struct takeover_state *state;
700         struct ctdb_req_control request;
701
702         req = tevent_req_create(mem_ctx, &state, struct takeover_state);
703         if (req == NULL) {
704                 return NULL;
705         }
706
707         state->ev = ev;
708         state->client = client;
709         state->force_rebalance_nodes = force_rebalance_nodes;
710         state->destnode = ctdb_client_pnn(client);
711
712         ctdb_req_control_get_all_tunables(&request);
713         subreq = ctdb_client_control_send(state, state->ev, state->client,
714                                           state->destnode, TIMEOUT(),
715                                           &request);
716         if (tevent_req_nomem(subreq, req)) {
717                 return tevent_req_post(req, ev);
718         }
719         tevent_req_set_callback(subreq, takeover_tunables_done, req);
720
721         return req;
722 }
723
724 static void takeover_tunables_done(struct tevent_req *subreq)
725 {
726         struct tevent_req *req = tevent_req_callback_data(
727                 subreq, struct tevent_req);
728         struct takeover_state *state = tevent_req_data(
729                 req, struct takeover_state);
730         struct ctdb_reply_control *reply;
731         struct ctdb_req_control request;
732         int ret;
733         bool status;
734
735         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
736         TALLOC_FREE(subreq);
737         if (! status) {
738                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
739                 tevent_req_error(req, ret);
740                 return;
741         }
742
743         ret = ctdb_reply_control_get_all_tunables(reply, state,
744                                                   &state->tun_list);
745         if (ret != 0) {
746                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
747                 tevent_req_error(req, ret);
748                 return;
749         }
750
751         talloc_free(reply);
752
753         takeover_timeout = state->tun_list->takeover_timeout;
754
755         ctdb_req_control_get_nodemap(&request);
756         subreq = ctdb_client_control_send(state, state->ev, state->client,
757                                           state->destnode, TIMEOUT(),
758                                           &request);
759         if (tevent_req_nomem(subreq, req)) {
760                 return;
761         }
762         tevent_req_set_callback(subreq, takeover_nodemap_done, req);
763 }
764
765 static void takeover_nodemap_done(struct tevent_req *subreq)
766 {
767         struct tevent_req *req = tevent_req_callback_data(
768                 subreq, struct tevent_req);
769         struct takeover_state *state = tevent_req_data(
770                 req, struct takeover_state);
771         struct ctdb_reply_control *reply;
772         bool status;
773         int ret;
774         struct ctdb_node_map *nodemap;
775
776         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
777         TALLOC_FREE(subreq);
778         if (! status) {
779                 D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
780                         state->destnode, ret);
781                 tevent_req_error(req, ret);
782                 return;
783         }
784
785         ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
786         if (ret != 0) {
787                 D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
788                 tevent_req_error(req, ret);
789                 return;
790         }
791
792         state->num_nodes = nodemap->num;
793
794         state->num_connected = list_of_connected_nodes(nodemap,
795                                                        CTDB_UNKNOWN_PNN, state,
796                                                        &state->pnns_connected);
797         if (state->num_connected <= 0) {
798                 tevent_req_error(req, ENOMEM);
799                 return;
800         }
801
802         state->num_active = list_of_active_nodes(nodemap,
803                                                  CTDB_UNKNOWN_PNN, state,
804                                                  &state->pnns_active);
805         if (state->num_active <= 0) {
806                 tevent_req_error(req, ENOMEM);
807                 return;
808         }
809
810         /* Default timeout for early jump to IPREALLOCATED.  See below
811          * for explanation of 3 times...
812          */
813         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
814
815         state->ban_credits = talloc_zero_array(state, uint32_t,
816                                                state->num_nodes);
817         if (tevent_req_nomem(state->ban_credits, req)) {
818                 return;
819         }
820
821         if (state->tun_list->disable_ip_failover != 0) {
822                 /* IP failover is completely disabled so just send out
823                  * ipreallocated event.
824                  */
825                 takeover_ipreallocated(req);
826                 return;
827         }
828
829         state->ipalloc_state =
830                 ipalloc_state_init(
831                         state, state->num_nodes,
832                         determine_algorithm(state->tun_list),
833                         (state->tun_list->no_ip_takeover != 0),
834                         (state->tun_list->no_ip_failback != 0),
835                         (state->tun_list->no_ip_host_on_all_disabled != 0),
836                         state->force_rebalance_nodes);
837         if (tevent_req_nomem(state->ipalloc_state, req)) {
838                 return;
839         }
840
841         ipalloc_set_node_flags(state->ipalloc_state, nodemap);
842
843         subreq = get_public_ips_send(state, state->ev, state->client,
844                                      state->pnns_active, state->num_active,
845                                      state->num_nodes, false);
846         if (tevent_req_nomem(subreq, req)) {
847                 return;
848         }
849
850         tevent_req_set_callback(subreq, takeover_known_ips_done, req);
851 }
852
853 static void takeover_known_ips_done(struct tevent_req *subreq)
854 {
855         struct tevent_req *req = tevent_req_callback_data(
856                 subreq, struct tevent_req);
857         struct takeover_state *state = tevent_req_data(
858                 req, struct takeover_state);
859         int ret;
860         bool status;
861
862         status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
863         TALLOC_FREE(subreq);
864
865         if (! status) {
866                 D_ERR("Failed to fetch known public IPs\n");
867                 tevent_req_error(req, ret);
868                 return;
869         }
870
871         subreq = get_public_ips_send(state, state->ev, state->client,
872                                      state->pnns_active, state->num_active,
873                                      state->num_nodes, true);
874         if (tevent_req_nomem(subreq, req)) {
875                 return;
876         }
877
878         tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
879 }
880
881 static void takeover_avail_ips_done(struct tevent_req *subreq)
882 {
883         struct tevent_req *req = tevent_req_callback_data(
884                 subreq, struct tevent_req);
885         struct takeover_state *state = tevent_req_data(
886                 req, struct takeover_state);
887         bool status;
888         int ret;
889         struct ctdb_public_ip_list *available_ips;
890
891         status = get_public_ips_recv(subreq, &ret, state, &available_ips);
892         TALLOC_FREE(subreq);
893
894         if (! status) {
895                 D_ERR("Failed to fetch available public IPs\n");
896                 tevent_req_error(req, ret);
897                 return;
898         }
899
900         ipalloc_set_public_ips(state->ipalloc_state,
901                                state->known_ips, available_ips);
902
903         if (! ipalloc_can_host_ips(state->ipalloc_state)) {
904                 D_NOTICE("No nodes available to host public IPs yet\n");
905                 takeover_ipreallocated(req);
906                 return;
907         }
908
909         /* Do the IP reassignment calculations */
910         state->all_ips = ipalloc(state->ipalloc_state);
911         if (tevent_req_nomem(state->all_ips, req)) {
912                 return;
913         }
914
915         /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
916          * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
917          * seconds.  However, RELEASE_IP can take longer due to TCP
918          * connection killing, so sometimes needs more time.
919          * Therefore, use a cumulative timeout of TakeoverTimeout * 3
920          * seconds across all 3 stages.  No explicit expiry checks are
921          * needed before each stage because tevent is smart enough to
922          * fire the timeouts even if they are in the past.  Initialise
923          * this here so it explicitly covers the stages we're
924          * interested in but, in particular, not the time taken by the
925          * ipalloc().
926          */
927         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
928
929         subreq = release_ip_send(state, state->ev, state->client,
930                                  state->pnns_connected, state->num_connected,
931                                  state->timeout, state->all_ips,
932                                  state->ban_credits);
933         if (tevent_req_nomem(subreq, req)) {
934                 return;
935         }
936         tevent_req_set_callback(subreq, takeover_release_ip_done, req);
937 }
938
939 static void takeover_release_ip_done(struct tevent_req *subreq)
940 {
941         struct tevent_req *req = tevent_req_callback_data(
942                 subreq, struct tevent_req);
943         struct takeover_state *state = tevent_req_data(
944                 req, struct takeover_state);
945         int ret;
946         bool status;
947
948         status = release_ip_recv(subreq, &ret);
949         TALLOC_FREE(subreq);
950
951         if (! status) {
952                 takeover_failed(req, ret);
953                 return;
954         }
955
956         /* All released, now for takeovers */
957
958         subreq = take_ip_send(state, state->ev, state->client,
959                               state->timeout, state->all_ips,
960                               state->ban_credits);
961         if (tevent_req_nomem(subreq, req)) {
962                 return;
963         }
964         tevent_req_set_callback(subreq, takeover_take_ip_done, req);
965 }
966
967 static void takeover_take_ip_done(struct tevent_req *subreq)
968 {
969         struct tevent_req *req = tevent_req_callback_data(
970                 subreq, struct tevent_req);
971         int ret = 0;
972         bool status;
973
974         status = take_ip_recv(subreq, &ret);
975         TALLOC_FREE(subreq);
976
977         if (! status) {
978                 takeover_failed(req, ret);
979                 return;
980         }
981
982         takeover_ipreallocated(req);
983 }
984
985 static void takeover_ipreallocated(struct tevent_req *req)
986 {
987         struct takeover_state *state = tevent_req_data(
988                 req, struct takeover_state);
989         struct tevent_req *subreq;
990
991         subreq = ipreallocated_send(state, state->ev, state->client,
992                                     state->pnns_connected,
993                                     state->num_connected,
994                                     state->timeout,
995                                     state->ban_credits);
996         if (tevent_req_nomem(subreq, req)) {
997                 return;
998         }
999         tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
1000 }
1001
1002 static void takeover_ipreallocated_done(struct tevent_req *subreq)
1003 {
1004         struct tevent_req *req = tevent_req_callback_data(
1005                 subreq, struct tevent_req);
1006         int ret;
1007         bool status;
1008
1009         status = ipreallocated_recv(subreq, &ret);
1010         TALLOC_FREE(subreq);
1011
1012         if (! status) {
1013                 takeover_failed(req, ret);
1014                 return;
1015         }
1016
1017         tevent_req_done(req);
1018 }
1019
/*
 * Context handed from takeover_failed() to takeover_failed_done()
 * while the banning message is outstanding.
 */
struct takeover_failed_state {
        /* Takeover request that is failed once the message completes */
        struct tevent_req *req;
        /* Error code that the takeover request is failed with */
        int ret;
};
1024
1025 void takeover_failed(struct tevent_req *req, int ret)
1026 {
1027         struct takeover_state *state = tevent_req_data(
1028                 req, struct takeover_state);
1029         struct tevent_req *subreq;
1030         uint32_t max_pnn = CTDB_UNKNOWN_PNN;
1031         int max_credits = 0;
1032         int pnn;
1033
1034         /* Check that bans are enabled */
1035         if (state->tun_list->enable_bans == 0) {
1036                 tevent_req_error(req, ret);
1037                 return;
1038         }
1039
1040         for (pnn = 0; pnn < state->num_nodes; pnn++) {
1041                 if (state->ban_credits[pnn] > max_credits) {
1042                         max_pnn = pnn;
1043                         max_credits = state->ban_credits[pnn];
1044                 }
1045         }
1046
1047         if (max_credits > 0) {
1048                 struct ctdb_req_message message;
1049                 struct takeover_failed_state *substate;
1050
1051                 D_WARNING("Assigning banning credits to node %u\n", max_pnn);
1052
1053                 substate = talloc_zero(state, struct takeover_failed_state);
1054                 if (tevent_req_nomem(substate, req)) {
1055                         return;
1056                 }
1057                 substate->req = req;
1058                 substate->ret = ret;
1059
1060                 message.srvid = CTDB_SRVID_BANNING;
1061                 message.data.pnn = max_pnn;
1062
1063                 subreq = ctdb_client_message_send(
1064                         state, state->ev, state->client,
1065                         ctdb_client_pnn(state->client),
1066                         &message);
1067                 if (subreq == NULL) {
1068                         D_ERR("failed to assign banning credits\n");
1069                         tevent_req_error(req, ret);
1070                         return;
1071                 }
1072                 tevent_req_set_callback(subreq, takeover_failed_done, substate);
1073         } else {
1074                 tevent_req_error(req, ret);
1075         }
1076 }
1077
1078 static void takeover_failed_done(struct tevent_req *subreq)
1079 {
1080         struct takeover_failed_state *substate = tevent_req_callback_data(
1081                 subreq, struct takeover_failed_state);
1082         struct tevent_req *req = substate->req;
1083         int ret;
1084         bool status;
1085
1086         status = ctdb_client_message_recv(subreq, &ret);
1087         TALLOC_FREE(subreq);
1088         if (! status) {
1089                 D_ERR("failed to assign banning credits, ret=%d\n", ret);
1090         }
1091
1092         ret = substate->ret;
1093         talloc_free(substate);
1094         tevent_req_error(req, ret);
1095 }
1096
/*
 * Collect the result of a takeover request.
 *
 * Delegates to generic_recv() and discards its boolean result; an
 * error code, if any, is stored through perr when perr is not NULL.
 */
static void takeover_recv(struct tevent_req *req, int *perr)
{
        (void) generic_recv(req, perr);
}
1101
1102 static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
1103 {
1104         char *strv = NULL;
1105         int num, i, ret;
1106         char *t;
1107         uint32_t *nodes;
1108
1109         ret = strv_split(mem_ctx, &strv, s, ",");
1110         if (ret != 0) {
1111                 D_ERR("out of memory\n");
1112                 return NULL;
1113         }
1114
1115         num = strv_count(strv);
1116
1117         nodes = talloc_array(mem_ctx, uint32_t, num);
1118         if (nodes == NULL) {
1119                 D_ERR("out of memory\n");
1120                 return NULL;
1121         }
1122
1123         t = NULL;
1124         for (i = 0; i < num; i++) {
1125                 t = strv_next(strv, t);
1126                 nodes[i] = atoi(t);
1127         }
1128
1129         return nodes;
1130 }
1131
/*
 * Print a one-line usage message for progname to stderr.
 */
static void usage(const char *progname)
{
        fprintf(stderr,
                "\n"
                "Usage: %s <output-fd> <ctdb-socket-path> [<force-rebalance-nodes>]"
                "\n",
                progname);
}
1139
1140 /*
1141  * Arguments - write fd, socket path
1142  */
1143 int main(int argc, const char *argv[])
1144 {
1145         int write_fd;
1146         const char *sockpath;
1147         TALLOC_CTX *mem_ctx;
1148         struct tevent_context *ev;
1149         struct ctdb_client_context *client;
1150         int ret;
1151         struct tevent_req *req;
1152         uint32_t *force_rebalance_nodes = NULL;
1153
1154         if (argc < 3 || argc > 4) {
1155                 usage(argv[0]);
1156                 exit(1);
1157         }
1158
1159         write_fd = atoi(argv[1]);
1160         sockpath = argv[2];
1161
1162         mem_ctx = talloc_new(NULL);
1163         if (mem_ctx == NULL) {
1164                 fprintf(stderr, "talloc_new() failed\n");
1165                 ret = ENOMEM;
1166                 goto done;
1167         }
1168
1169         if (argc == 4) {
1170                 force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
1171                 if (force_rebalance_nodes == NULL) {
1172                         usage(argv[0]);
1173                         ret = EINVAL;
1174                         goto done;
1175                 }
1176         }
1177
1178         ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
1179         if (ret != 0) {
1180                 fprintf(stderr,
1181                         "ctdb-takeover: Unable to initialize logging\n");
1182                 goto done;
1183         }
1184
1185         ev = tevent_context_init(mem_ctx);
1186         if (ev == NULL) {
1187                 D_ERR("tevent_context_init() failed\n");
1188                 ret = ENOMEM;
1189                 goto done;
1190         }
1191
1192         ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
1193         if (ret != 0) {
1194                 D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
1195                 goto done;
1196         }
1197
1198         req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
1199         if (req == NULL) {
1200                 D_ERR("takeover_send() failed\n");
1201                 ret = 1;
1202                 goto done;
1203         }
1204
1205         if (! tevent_req_poll(req, ev)) {
1206                 D_ERR("tevent_req_poll() failed\n");
1207                 ret = 1;
1208                 goto done;
1209         }
1210
1211         takeover_recv(req, &ret);
1212         TALLOC_FREE(req);
1213         if (ret != 0) {
1214                 D_ERR("takeover run failed, ret=%d\n", ret);
1215         }
1216
1217 done:
1218         sys_write_v(write_fd, &ret, sizeof(ret));
1219
1220         talloc_free(mem_ctx);
1221         return ret;
1222 }