eae4bfa79ba7b1ae14ce7a40a077b92ede9ad6a0
[sfrench/samba-autobuild/.git] / ctdb / server / ctdb_takeover_helper.c
1 /*
2    CTDB IP takeover helper
3
4    Copyright (C) Martin Schwenke  2016
5
6    Based on ctdb_recovery_helper.c
7    Copyright (C) Amitay Isaacs  2015
8
9    and ctdb_takeover.c
10    Copyright (C) Ronnie Sahlberg  2007
11    Copyright (C) Andrew Tridgell  2007
12    Copyright (C) Martin Schwenke  2011
13
14    This program is free software; you can redistribute it and/or modify
15    it under the terms of the GNU General Public License as published by
16    the Free Software Foundation; either version 3 of the License, or
17    (at your option) any later version.
18
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License for more details.
23
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28 #include "replace.h"
29 #include "system/network.h"
30 #include "system/filesys.h"
31
32 #include <popt.h>
33 #include <talloc.h>
34 #include <tevent.h>
35
36 #include "lib/util/debug.h"
37 #include "lib/util/strv.h"
38 #include "lib/util/strv_util.h"
39 #include "lib/util/sys_rw.h"
40 #include "lib/util/time.h"
41 #include "lib/util/tevent_unix.h"
42
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
45 #include "client/client.h"
46
47 #include "common/logging.h"
48
49 #include "server/ipalloc.h"
50
/*
 * Timeout in seconds applied to individual controls.  Starts out as 9
 * and is replaced by the takeover_timeout tunable as soon as the
 * tunables have been fetched (see takeover_tunables_done()).
 */
static int takeover_timeout = 9;

/*
 * Timeout value for a control: whatever timeval_current_ofs() yields
 * for takeover_timeout seconds and 0 microseconds.
 */
#define TIMEOUT()       timeval_current_ofs(takeover_timeout, 0)
54
55 /*
56  * Utility functions
57  */
58
/*
 * Shared receive function for requests that carry no result data.
 *
 * Returns true when the request finished without error.  Otherwise
 * returns false and, if perr is not NULL, stores the error obtained
 * from tevent_req_is_unix_error() in *perr.
 */
static bool generic_recv(struct tevent_req *req, int *perr)
{
        int unix_err;
        bool failed;

        failed = tevent_req_is_unix_error(req, &unix_err);
        if (! failed) {
                return true;
        }

        if (perr != NULL) {
                *perr = unix_err;
        }

        return false;
}
72
73 static enum ipalloc_algorithm
74 determine_algorithm(const struct ctdb_tunable_list *tunables)
75 {
76         switch (tunables->ip_alloc_algorithm) {
77         case 0:
78                 return IPALLOC_DETERMINISTIC;
79         case 1:
80                 return IPALLOC_NONDETERMINISTIC;
81         case 2:
82                 return IPALLOC_LCP2;
83         default:
84                 return IPALLOC_LCP2;
85         };
86 }
87
88 /**********************************************************************/
89
/*
 * State for fetching public IP lists from a set of nodes
 */
struct get_public_ips_state {
        /* PNNs of the nodes being queried (caller's array, not copied) */
        uint32_t *pnns;
        /* Number of entries in pnns */
        int count;
        /* Result array; each node's list is stored at index PNN */
        struct ctdb_public_ip_list *ips;
};

static void get_public_ips_done(struct tevent_req *subreq);
97
98 static struct tevent_req *get_public_ips_send(
99                                 TALLOC_CTX *mem_ctx,
100                                 struct tevent_context *ev,
101                                 struct ctdb_client_context *client,
102                                 uint32_t *pnns,
103                                 int count, int num_nodes,
104                                 bool available_only)
105 {
106         struct tevent_req *req, *subreq;
107         struct get_public_ips_state *state;
108         struct ctdb_req_control request;
109
110         req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
111         if (req == NULL) {
112                 return NULL;
113         }
114
115         state->pnns = pnns;
116         state->count = count;
117
118         state->ips  = talloc_zero_array(state,
119                                         struct ctdb_public_ip_list,
120                                         num_nodes);
121         if (tevent_req_nomem(state->ips, req)) {
122                 return tevent_req_post(req, ev);
123         }
124
125         /* Short circuit if no nodes being asked for IPs */
126         if (state->count == 0) {
127                 tevent_req_done(req);
128                 return tevent_req_post(req, ev);
129         }
130
131         ctdb_req_control_get_public_ips(&request, available_only);
132         subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
133                                                 state->pnns,
134                                                 state->count,
135                                                 TIMEOUT(), &request);
136         if (tevent_req_nomem(subreq, req)) {
137                 return tevent_req_post(req, ev);
138         }
139         tevent_req_set_callback(subreq, get_public_ips_done, req);
140
141         return req;
142 }
143
144 static void get_public_ips_done(struct tevent_req *subreq)
145 {
146         struct tevent_req *req = tevent_req_callback_data(
147                 subreq, struct tevent_req);
148         struct get_public_ips_state *state = tevent_req_data(
149                 req, struct get_public_ips_state);
150         struct ctdb_reply_control **reply;
151         int *err_list;
152         int ret, i;
153         bool status, found_errors;
154
155         status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
156                                                 &reply);
157         TALLOC_FREE(subreq);
158         if (! status) {
159                 found_errors = false;
160                 for (i = 0; i < state->count; i++) {
161                         if (err_list[i] != 0) {
162                                 uint32_t pnn = state->pnns[i];
163
164                                 D_ERR("control GET_PUBLIC_IPS failed on "
165                                       "node %u, ret=%d\n", pnn, err_list[i]);
166
167                                 found_errors = true;
168                         }
169                 }
170
171                 tevent_req_error(req, ret);
172                 return;
173         }
174
175         found_errors = false;
176         for (i = 0; i < state->count; i++) {
177                 uint32_t pnn;
178                 struct ctdb_public_ip_list *ips;
179
180                 pnn = state->pnns[i];
181                 ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
182                                                         &ips);
183                 if (ret != 0) {
184                         D_ERR("control GET_PUBLIC_IPS failed on "
185                               "node %u\n", pnn);
186                         found_errors = true;
187                         continue;
188                 }
189
190                 D_INFO("Fetched public IPs from node %u\n", pnn);
191                 state->ips[pnn] = *ips;
192         }
193
194         if (found_errors) {
195                 tevent_req_error(req, EIO);
196                 return;
197         }
198
199         talloc_free(reply);
200
201         tevent_req_done(req);
202 }
203
204 static bool get_public_ips_recv(struct tevent_req *req, int *perr,
205                                 TALLOC_CTX *mem_ctx,
206                                 struct ctdb_public_ip_list **ips)
207 {
208         struct get_public_ips_state *state = tevent_req_data(
209                 req, struct get_public_ips_state);
210         int err;
211
212         if (tevent_req_is_unix_error(req, &err)) {
213                 if (perr != NULL) {
214                         *perr = err;
215                 }
216                 return false;
217         }
218
219         *ips = talloc_steal(mem_ctx, state->ips);
220
221         return true;
222 }
223
224 /**********************************************************************/
225
/*
 * Overall state for a round of RELEASE_IP controls; one multi-node
 * control is sent per public IP address
 */
struct release_ip_state {
        /* Number of per-IP controls sent */
        int num_sent;
        /* Number of per-IP controls that have completed */
        int num_replies;
        /* Number of per-IP controls that completed with a failure */
        int num_fails;
        /* Last error recorded; reported when num_fails is non-zero */
        int err_any;
        /* Failure counters indexed by PNN (caller's array) */
        uint32_t *ban_credits;
};

/*
 * State for the RELEASE_IP control of one public IP address
 */
struct release_ip_one_state {
        /* The overall release_ip request this control belongs to */
        struct tevent_req *req;
        /* PNNs of the nodes the control was sent to */
        uint32_t *pnns;
        /* Number of entries in pnns */
        int count;
        /* Printable form of the address, used in log messages */
        const char *ip_str;
};

static void release_ip_done(struct tevent_req *subreq);
242
243 static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
244                                           struct tevent_context *ev,
245                                           struct ctdb_client_context *client,
246                                           uint32_t *pnns,
247                                           int count,
248                                           struct timeval timeout,
249                                           struct public_ip_list *all_ips,
250                                           uint32_t *ban_credits)
251 {
252         struct tevent_req *req, *subreq;
253         struct release_ip_state *state;
254         struct ctdb_req_control request;
255         struct public_ip_list *tmp_ip;
256
257         req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
258         if (req == NULL) {
259                 return NULL;
260         }
261
262         state->num_sent = 0;
263         state->num_replies = 0;
264         state->num_fails = 0;
265         state->ban_credits = ban_credits;
266
267         /* Send a RELEASE_IP to all nodes that should not be hosting
268          * each IP.  For each IP, all but one of these will be
269          * redundant.  However, the redundant ones are used to tell
270          * nodes which node should be hosting the IP so that commands
271          * like "ctdb ip" can display a particular nodes idea of who
272          * is hosting what. */
273         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
274                 struct release_ip_one_state *substate;
275                 struct ctdb_public_ip ip;
276                 int i;
277
278                 substate = talloc_zero(state, struct release_ip_one_state);
279                 if (tevent_req_nomem(substate, req)) {
280                         return tevent_req_post(req, ev);
281                 }
282
283                 substate->pnns = talloc_zero_array(substate, uint32_t, count);
284                 if (tevent_req_nomem(substate->pnns, req)) {
285                         return tevent_req_post(req, ev);
286                 }
287
288                 substate->count = 0;
289                 substate->req = req;
290
291                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
292                                                              &tmp_ip->addr);
293                 if (tevent_req_nomem(substate->ip_str, req)) {
294                         return tevent_req_post(req, ev);
295                 }
296
297                 for (i = 0; i < count; i++) {
298                         uint32_t pnn = pnns[i];
299                         /* If pnn is not the node that should be
300                          * hosting the IP then add it to the list of
301                          * nodes that need to do a release. */
302                         if (tmp_ip->pnn != pnn) {
303                                 substate->pnns[substate->count] = pnn;
304                                 substate->count++;
305                         }
306                 }
307
308                 if (substate->count == 0) {
309                         /* No releases to send for this address... */
310                         TALLOC_FREE(substate);
311                         continue;
312                 }
313
314                 ip.pnn = tmp_ip->pnn;
315                 ip.addr = tmp_ip->addr;
316                 ctdb_req_control_release_ip(&request, &ip);
317                 subreq = ctdb_client_control_multi_send(state, ev, client,
318                                                         substate->pnns,
319                                                         substate->count,
320                                                         timeout,/* cumulative */
321                                                         &request);
322                 if (tevent_req_nomem(subreq, req)) {
323                         return tevent_req_post(req, ev);
324                 }
325                 tevent_req_set_callback(subreq, release_ip_done, substate);
326
327                 state->num_sent++;
328         }
329
330         /* None sent, finished... */
331         if (state->num_sent == 0) {
332                 tevent_req_done(req);
333                 return tevent_req_post(req, ev);
334         }
335
336         return req;
337 }
338
339 static void release_ip_done(struct tevent_req *subreq)
340 {
341         struct release_ip_one_state *substate = tevent_req_callback_data(
342                 subreq, struct release_ip_one_state);
343         struct tevent_req *req = substate->req;
344         struct release_ip_state *state = tevent_req_data(
345                 req, struct release_ip_state);
346         int ret, i;
347         int *err_list;
348         bool status, found_errors;
349
350         status = ctdb_client_control_multi_recv(subreq, &ret, state,
351                                                 &err_list, NULL);
352         TALLOC_FREE(subreq);
353
354         if (status) {
355                 D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
356                        substate->ip_str, substate->count);
357                 goto done;
358         }
359
360         /* Get some clear error messages out of err_list and count
361          * banning credits
362          */
363         found_errors = false;
364         for (i = 0; i < substate->count; i++) {
365                 int err = err_list[i];
366                 if (err != 0) {
367                         uint32_t pnn = substate->pnns[i];
368
369                         D_ERR("RELEASE_IP %s failed on node %u, "
370                               "ret=%d\n", substate->ip_str, pnn, err);
371
372                         state->ban_credits[pnn]++;
373                         state->err_any = err;
374                         found_errors = true;
375                 }
376         }
377         if (! found_errors) {
378                 D_ERR("RELEASE_IP %s internal error, ret=%d\n",
379                       substate->ip_str, ret);
380                 state->err_any = EIO;
381         }
382
383         state->num_fails++;
384
385 done:
386         talloc_free(substate);
387
388         state->num_replies++;
389
390         if (state->num_replies < state->num_sent) {
391                 /* Not all replies received, don't go further */
392                 return;
393         }
394
395         if (state->num_fails > 0) {
396                 tevent_req_error(req, state->err_any);
397                 return;
398         }
399
400         tevent_req_done(req);
401 }
402
/*
 * Collect the outcome of a release_ip request.  There is no result
 * data, so this is the generic receive: true on success, false with
 * the error in *perr (when perr is not NULL) on failure.
 */
static bool release_ip_recv(struct tevent_req *req, int *perr)
{
        bool ok;

        ok = generic_recv(req, perr);

        return ok;
}
407
408 /**********************************************************************/
409
/*
 * Overall state for a round of TAKEOVER_IP controls; one control is
 * sent per assigned public IP address
 */
struct take_ip_state {
        /* Number of controls sent */
        int num_sent;
        /* Number of controls that have completed */
        int num_replies;
        /* Number of controls that completed with a failure */
        int num_fails;
        /* Last error recorded; reported when num_fails is non-zero */
        int err_any;
        /* Failure counters indexed by PNN (caller's array) */
        uint32_t *ban_credits;
};

/*
 * State for the TAKEOVER_IP control of one public IP address
 */
struct take_ip_one_state {
        /* The overall take_ip request this control belongs to */
        struct tevent_req *req;
        /* PNN of the node the control was sent to */
        uint32_t pnn;
        /* Printable form of the address, used in log messages */
        const char *ip_str;
};

static void take_ip_done(struct tevent_req *subreq);
425
426 static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
427                                        struct tevent_context *ev,
428                                        struct ctdb_client_context *client,
429                                        struct timeval timeout,
430                                        struct public_ip_list *all_ips,
431                                        uint32_t *ban_credits)
432 {
433         struct tevent_req *req, *subreq;
434         struct take_ip_state *state;
435         struct ctdb_req_control request;
436         struct public_ip_list *tmp_ip;
437
438         req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
439         if (req == NULL) {
440                 return NULL;
441         }
442
443         state->num_sent = 0;
444         state->num_replies = 0;
445         state->num_fails = 0;
446         state->ban_credits = ban_credits;
447
448         /* For each IP, send a TAKOVER_IP to the node that should be
449          * hosting it.  Many of these will often be redundant (since
450          * the allocation won't have changed) but they can be useful
451          * to recover from inconsistencies. */
452         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
453                 struct take_ip_one_state *substate;
454                 struct ctdb_public_ip ip;
455
456                 if (tmp_ip->pnn == -1) {
457                         /* IP will be unassigned */
458                         continue;
459                 }
460
461                 substate = talloc_zero(state, struct take_ip_one_state);
462                 if (tevent_req_nomem(substate, req)) {
463                         return tevent_req_post(req, ev);
464                 }
465
466                 substate->req = req;
467                 substate->pnn = tmp_ip->pnn;
468
469                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
470                                                              &tmp_ip->addr);
471                 if (tevent_req_nomem(substate->ip_str, req)) {
472                         return tevent_req_post(req, ev);
473                 }
474
475                 ip.pnn = tmp_ip->pnn;
476                 ip.addr = tmp_ip->addr;
477                 ctdb_req_control_takeover_ip(&request, &ip);
478                 subreq = ctdb_client_control_send(
479                                         state, ev, client, tmp_ip->pnn,
480                                         timeout, /* cumulative */
481                                         &request);
482                 if (tevent_req_nomem(subreq, req)) {
483                         return tevent_req_post(req, ev);
484                 }
485                 tevent_req_set_callback(subreq, take_ip_done, substate);
486
487                 state->num_sent++;
488         }
489
490         /* None sent, finished... */
491         if (state->num_sent == 0) {
492                 tevent_req_done(req);
493                 return tevent_req_post(req, ev);
494         }
495
496         return req;
497 }
498
499 static void take_ip_done(struct tevent_req *subreq)
500 {
501         struct take_ip_one_state *substate = tevent_req_callback_data(
502                 subreq, struct take_ip_one_state);
503         struct tevent_req *req = substate->req;
504         struct ctdb_reply_control *reply;
505         struct take_ip_state *state = tevent_req_data(
506                 req, struct take_ip_state);
507         int ret = 0;
508         bool status;
509
510         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
511         TALLOC_FREE(subreq);
512
513         if (! status) {
514                 D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
515                       substate->ip_str, substate->pnn, ret);
516                 goto fail;
517         }
518
519         ret = ctdb_reply_control_takeover_ip(reply);
520         if (ret != 0) {
521                 D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
522                       substate->ip_str, substate->pnn, ret);
523                 goto fail;
524         }
525
526         D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
527                substate->ip_str, substate->pnn);
528         goto done;
529
530 fail:
531         state->ban_credits[substate->pnn]++;
532         state->num_fails++;
533         state->err_any = ret;
534
535 done:
536         talloc_free(substate);
537
538         state->num_replies++;
539
540         if (state->num_replies < state->num_sent) {
541                 /* Not all replies received, don't go further */
542                 return;
543         }
544
545         if (state->num_fails > 0) {
546                 tevent_req_error(req, state->err_any);
547                 return;
548         }
549
550         tevent_req_done(req);
551 }
552
/*
 * Collect the outcome of a take_ip request.  There is no result data,
 * so this is the generic receive: true on success, false with the
 * error in *perr (when perr is not NULL) on failure.
 */
static bool take_ip_recv(struct tevent_req *req, int *perr)
{
        bool ok;

        ok = generic_recv(req, perr);

        return ok;
}
557
558 /**********************************************************************/
559
/*
 * State for sending IPREALLOCATED to a set of nodes
 */
struct ipreallocated_state {
        /* PNNs of the nodes the control is sent to (caller's array) */
        uint32_t *pnns;
        /* Number of entries in pnns */
        int count;
        /* Failure counters indexed by PNN (caller's array) */
        uint32_t *ban_credits;
};

static void ipreallocated_done(struct tevent_req *subreq);
567
568 static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
569                                              struct tevent_context *ev,
570                                              struct ctdb_client_context *client,
571                                              uint32_t *pnns,
572                                              int count,
573                                              struct timeval timeout,
574                                              uint32_t *ban_credits)
575 {
576         struct tevent_req *req, *subreq;
577         struct ipreallocated_state *state;
578         struct ctdb_req_control request;
579
580         req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
581         if (req == NULL) {
582                 return NULL;
583         }
584
585         state->pnns = pnns;
586         state->count = count;
587         state->ban_credits = ban_credits;
588
589         ctdb_req_control_ipreallocated(&request);
590         subreq = ctdb_client_control_multi_send(state, ev, client,
591                                                 pnns, count,
592                                                 timeout, /* cumulative */
593                                                 &request);
594         if (tevent_req_nomem(subreq, req)) {
595                 return tevent_req_post(req, ev);
596         }
597         tevent_req_set_callback(subreq, ipreallocated_done, req);
598
599         return req;
600 }
601
602 static void ipreallocated_done(struct tevent_req *subreq)
603 {
604         struct tevent_req *req = tevent_req_callback_data(
605                 subreq, struct tevent_req);
606         struct ipreallocated_state *state = tevent_req_data(
607                 req, struct ipreallocated_state);
608         int *err_list = NULL;
609         int ret, i;
610         bool status, found_errors;
611
612         status = ctdb_client_control_multi_recv(subreq, &ret, state,
613                                                 &err_list, NULL);
614         TALLOC_FREE(subreq);
615
616         if (status) {
617                 D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
618                 tevent_req_done(req);
619                 return;
620         }
621
622         /* Get some clear error messages out of err_list and count
623          * banning credits
624          */
625         found_errors = false;
626         for (i = 0; i < state->count; i++) {
627                 int err = err_list[i];
628                 if (err != 0) {
629                         uint32_t pnn = state->pnns[i];
630
631                         D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
632                               pnn, err);
633
634                         state->ban_credits[pnn]++;
635                         found_errors = true;
636                 }
637         }
638
639         if (! found_errors) {
640                 D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
641         }
642
643         tevent_req_error(req, ret);
644 }
645
/*
 * Collect the outcome of an ipreallocated request.  There is no result
 * data, so this is the generic receive: true on success, false with
 * the error in *perr (when perr is not NULL) on failure.
 */
static bool ipreallocated_recv(struct tevent_req *req, int *perr)
{
        bool ok;

        ok = generic_recv(req, perr);

        return ok;
}
650
651 /**********************************************************************/
652
653 /*
654  * Recalculate the allocation of public IPs to nodes and have the
655  * nodes host their allocated addresses.
656  *
657  * - Get tunables
658  * - Get nodemap
659  * - Initialise IP allocation state.  Pass:
660  *   + algorithm to be used;
661  *   + various tunables (NoIPTakeover, NoIPFailback, NoIPHostOnAllDisabled)
662  *   + list of nodes to force rebalance (internal structure, currently
663  *     no way to fetch, only used by LCP2 for nodes that have had new
664  *     IP addresses added).
665  * - Set IP flags for IP allocation based on node map
666  * - Retrieve known and available IP addresses (done separately so
667  *   values can be faked in unit testing)
668  * - Use ipalloc_set_public_ips() to set known and available IP
669  *   addresses for allocation
670  * - If cluster can't host IP addresses then jump to IPREALLOCATED
671  * - Run IP allocation algorithm
672  * - Send RELEASE_IP to all nodes for IPs they should not host
673  * - Send TAKE_IP to all nodes for IPs they should host
674  * - Send IPREALLOCATED to all nodes
675  */
676
/*
 * State carried through all steps of one takeover run
 */
struct takeover_state {
        /* Event context and client connection used for every control */
        struct tevent_context *ev;
        struct ctdb_client_context *client;
        /* Timeout for the later steps; set to 3 * takeover_timeout
         * from the time the nodemap is processed */
        struct timeval timeout;
        /* Number of entries in the nodemap */
        int num_nodes;
        /* Connected nodes, as produced by list_of_connected_nodes() */
        uint32_t *pnns_connected;
        int num_connected;
        /* Active nodes, as produced by list_of_active_nodes() */
        uint32_t *pnns_active;
        int num_active;
        /* PNN returned by ctdb_client_pnn(); target of the
         * GET_ALL_TUNABLES and GET_NODEMAP controls */
        uint32_t destnode;
        /* Caller-supplied list handed to ipalloc_state_init() */
        uint32_t *force_rebalance_nodes;
        /* Tunables parsed from the GET_ALL_TUNABLES reply */
        struct ctdb_tunable_list *tun_list;
        /* IP allocation state; stays unset when IP failover is
         * disabled */
        struct ipalloc_state *ipalloc_state;
        /* Known public IPs of the active nodes, indexed by PNN */
        struct ctdb_public_ip_list *known_ips;
        /* NOTE(review): presumably the list of all public IPs with
         * their assigned nodes after allocation - confirm */
        struct public_ip_list *all_ips;
        /* Zero-initialised array of num_nodes failure counters,
         * indexed by PNN */
        uint32_t *ban_credits;
};

static void takeover_tunables_done(struct tevent_req *subreq);
static void takeover_nodemap_done(struct tevent_req *subreq);
static void takeover_known_ips_done(struct tevent_req *subreq);
static void takeover_avail_ips_done(struct tevent_req *subreq);
static void takeover_release_ip_done(struct tevent_req *subreq);
static void takeover_take_ip_done(struct tevent_req *subreq);
static void takeover_ipreallocated(struct tevent_req *req);
static void takeover_ipreallocated_done(struct tevent_req *subreq);
static void takeover_failed(struct tevent_req *subreq, int ret);
static void takeover_failed_done(struct tevent_req *subreq);
705
706 static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
707                                         struct tevent_context *ev,
708                                         struct ctdb_client_context *client,
709                                         uint32_t *force_rebalance_nodes)
710 {
711         struct tevent_req *req, *subreq;
712         struct takeover_state *state;
713         struct ctdb_req_control request;
714
715         req = tevent_req_create(mem_ctx, &state, struct takeover_state);
716         if (req == NULL) {
717                 return NULL;
718         }
719
720         state->ev = ev;
721         state->client = client;
722         state->force_rebalance_nodes = force_rebalance_nodes;
723         state->destnode = ctdb_client_pnn(client);
724
725         ctdb_req_control_get_all_tunables(&request);
726         subreq = ctdb_client_control_send(state, state->ev, state->client,
727                                           state->destnode, TIMEOUT(),
728                                           &request);
729         if (tevent_req_nomem(subreq, req)) {
730                 return tevent_req_post(req, ev);
731         }
732         tevent_req_set_callback(subreq, takeover_tunables_done, req);
733
734         return req;
735 }
736
737 static void takeover_tunables_done(struct tevent_req *subreq)
738 {
739         struct tevent_req *req = tevent_req_callback_data(
740                 subreq, struct tevent_req);
741         struct takeover_state *state = tevent_req_data(
742                 req, struct takeover_state);
743         struct ctdb_reply_control *reply;
744         struct ctdb_req_control request;
745         int ret;
746         bool status;
747
748         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
749         TALLOC_FREE(subreq);
750         if (! status) {
751                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
752                 tevent_req_error(req, ret);
753                 return;
754         }
755
756         ret = ctdb_reply_control_get_all_tunables(reply, state,
757                                                   &state->tun_list);
758         if (ret != 0) {
759                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
760                 tevent_req_error(req, ret);
761                 return;
762         }
763
764         talloc_free(reply);
765
766         takeover_timeout = state->tun_list->takeover_timeout;
767
768         ctdb_req_control_get_nodemap(&request);
769         subreq = ctdb_client_control_send(state, state->ev, state->client,
770                                           state->destnode, TIMEOUT(),
771                                           &request);
772         if (tevent_req_nomem(subreq, req)) {
773                 return;
774         }
775         tevent_req_set_callback(subreq, takeover_nodemap_done, req);
776 }
777
778 static void takeover_nodemap_done(struct tevent_req *subreq)
779 {
780         struct tevent_req *req = tevent_req_callback_data(
781                 subreq, struct tevent_req);
782         struct takeover_state *state = tevent_req_data(
783                 req, struct takeover_state);
784         struct ctdb_reply_control *reply;
785         bool status;
786         int ret;
787         struct ctdb_node_map *nodemap;
788
789         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
790         TALLOC_FREE(subreq);
791         if (! status) {
792                 D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
793                         state->destnode, ret);
794                 tevent_req_error(req, ret);
795                 return;
796         }
797
798         ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
799         if (ret != 0) {
800                 D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
801                 tevent_req_error(req, ret);
802                 return;
803         }
804
805         state->num_nodes = nodemap->num;
806
807         state->num_connected = list_of_connected_nodes(nodemap,
808                                                        CTDB_UNKNOWN_PNN, state,
809                                                        &state->pnns_connected);
810         if (state->num_connected <= 0) {
811                 tevent_req_error(req, ENOMEM);
812                 return;
813         }
814
815         state->num_active = list_of_active_nodes(nodemap,
816                                                  CTDB_UNKNOWN_PNN, state,
817                                                  &state->pnns_active);
818         if (state->num_active <= 0) {
819                 tevent_req_error(req, ENOMEM);
820                 return;
821         }
822
823         /* Default timeout for early jump to IPREALLOCATED.  See below
824          * for explanation of 3 times...
825          */
826         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
827
828         state->ban_credits = talloc_zero_array(state, uint32_t,
829                                                state->num_nodes);
830         if (tevent_req_nomem(state->ban_credits, req)) {
831                 return;
832         }
833
834         if (state->tun_list->disable_ip_failover != 0) {
835                 /* IP failover is completely disabled so just send out
836                  * ipreallocated event.
837                  */
838                 takeover_ipreallocated(req);
839                 return;
840         }
841
842         state->ipalloc_state =
843                 ipalloc_state_init(
844                         state, state->num_nodes,
845                         determine_algorithm(state->tun_list),
846                         (state->tun_list->no_ip_takeover != 0),
847                         (state->tun_list->no_ip_failback != 0),
848                         (state->tun_list->no_ip_host_on_all_disabled != 0),
849                         state->force_rebalance_nodes);
850         if (tevent_req_nomem(state->ipalloc_state, req)) {
851                 return;
852         }
853
854         ipalloc_set_node_flags(state->ipalloc_state, nodemap);
855
856         subreq = get_public_ips_send(state, state->ev, state->client,
857                                      state->pnns_active, state->num_active,
858                                      state->num_nodes, false);
859         if (tevent_req_nomem(subreq, req)) {
860                 return;
861         }
862
863         tevent_req_set_callback(subreq, takeover_known_ips_done, req);
864 }
865
866 static void takeover_known_ips_done(struct tevent_req *subreq)
867 {
868         struct tevent_req *req = tevent_req_callback_data(
869                 subreq, struct tevent_req);
870         struct takeover_state *state = tevent_req_data(
871                 req, struct takeover_state);
872         int ret;
873         bool status;
874         uint32_t *pnns = NULL;
875         int count, i;
876
877         status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
878         TALLOC_FREE(subreq);
879
880         if (! status) {
881                 D_ERR("Failed to fetch known public IPs\n");
882                 takeover_failed(req, ret);
883                 return;
884         }
885
886         /* Get available IPs from active nodes that actually have known IPs */
887
888         pnns = talloc_zero_array(state, uint32_t, state->num_active);
889         if (tevent_req_nomem(pnns, req)) {
890                 return;
891         }
892
893         count = 0;
894         for (i = 0; i < state->num_active; i++) {
895                 uint32_t pnn = state->pnns_active[i];
896
897                 /* If pnn has IPs then fetch available IPs from it */
898                 if (state->known_ips[pnn].num > 0) {
899                         pnns[count] = pnn;
900                         count++;
901                 }
902         }
903
904         subreq = get_public_ips_send(state, state->ev, state->client,
905                                      pnns, count,
906                                      state->num_nodes, true);
907         if (tevent_req_nomem(subreq, req)) {
908                 return;
909         }
910
911         tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
912 }
913
914 static void takeover_avail_ips_done(struct tevent_req *subreq)
915 {
916         struct tevent_req *req = tevent_req_callback_data(
917                 subreq, struct tevent_req);
918         struct takeover_state *state = tevent_req_data(
919                 req, struct takeover_state);
920         bool status;
921         int ret;
922         struct ctdb_public_ip_list *available_ips;
923
924         status = get_public_ips_recv(subreq, &ret, state, &available_ips);
925         TALLOC_FREE(subreq);
926
927         if (! status) {
928                 D_ERR("Failed to fetch available public IPs\n");
929                 takeover_failed(req, ret);
930                 return;
931         }
932
933         ipalloc_set_public_ips(state->ipalloc_state,
934                                state->known_ips, available_ips);
935
936         if (! ipalloc_can_host_ips(state->ipalloc_state)) {
937                 D_NOTICE("No nodes available to host public IPs yet\n");
938                 takeover_ipreallocated(req);
939                 return;
940         }
941
942         /* Do the IP reassignment calculations */
943         state->all_ips = ipalloc(state->ipalloc_state);
944         if (tevent_req_nomem(state->all_ips, req)) {
945                 return;
946         }
947
948         /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
949          * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
950          * seconds.  However, RELEASE_IP can take longer due to TCP
951          * connection killing, so sometimes needs more time.
952          * Therefore, use a cumulative timeout of TakeoverTimeout * 3
953          * seconds across all 3 stages.  No explicit expiry checks are
954          * needed before each stage because tevent is smart enough to
955          * fire the timeouts even if they are in the past.  Initialise
956          * this here so it explicitly covers the stages we're
957          * interested in but, in particular, not the time taken by the
958          * ipalloc().
959          */
960         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
961
962         subreq = release_ip_send(state, state->ev, state->client,
963                                  state->pnns_connected, state->num_connected,
964                                  state->timeout, state->all_ips,
965                                  state->ban_credits);
966         if (tevent_req_nomem(subreq, req)) {
967                 return;
968         }
969         tevent_req_set_callback(subreq, takeover_release_ip_done, req);
970 }
971
972 static void takeover_release_ip_done(struct tevent_req *subreq)
973 {
974         struct tevent_req *req = tevent_req_callback_data(
975                 subreq, struct tevent_req);
976         struct takeover_state *state = tevent_req_data(
977                 req, struct takeover_state);
978         int ret;
979         bool status;
980
981         status = release_ip_recv(subreq, &ret);
982         TALLOC_FREE(subreq);
983
984         if (! status) {
985                 takeover_failed(req, ret);
986                 return;
987         }
988
989         /* All released, now for takeovers */
990
991         subreq = take_ip_send(state, state->ev, state->client,
992                               state->timeout, state->all_ips,
993                               state->ban_credits);
994         if (tevent_req_nomem(subreq, req)) {
995                 return;
996         }
997         tevent_req_set_callback(subreq, takeover_take_ip_done, req);
998 }
999
1000 static void takeover_take_ip_done(struct tevent_req *subreq)
1001 {
1002         struct tevent_req *req = tevent_req_callback_data(
1003                 subreq, struct tevent_req);
1004         int ret = 0;
1005         bool status;
1006
1007         status = take_ip_recv(subreq, &ret);
1008         TALLOC_FREE(subreq);
1009
1010         if (! status) {
1011                 takeover_failed(req, ret);
1012                 return;
1013         }
1014
1015         takeover_ipreallocated(req);
1016 }
1017
1018 static void takeover_ipreallocated(struct tevent_req *req)
1019 {
1020         struct takeover_state *state = tevent_req_data(
1021                 req, struct takeover_state);
1022         struct tevent_req *subreq;
1023
1024         subreq = ipreallocated_send(state, state->ev, state->client,
1025                                     state->pnns_connected,
1026                                     state->num_connected,
1027                                     state->timeout,
1028                                     state->ban_credits);
1029         if (tevent_req_nomem(subreq, req)) {
1030                 return;
1031         }
1032         tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
1033 }
1034
1035 static void takeover_ipreallocated_done(struct tevent_req *subreq)
1036 {
1037         struct tevent_req *req = tevent_req_callback_data(
1038                 subreq, struct tevent_req);
1039         int ret;
1040         bool status;
1041
1042         status = ipreallocated_recv(subreq, &ret);
1043         TALLOC_FREE(subreq);
1044
1045         if (! status) {
1046                 takeover_failed(req, ret);
1047                 return;
1048         }
1049
1050         tevent_req_done(req);
1051 }
1052
/*
 * Context carried from takeover_failed() to takeover_failed_done()
 * while the banning message is in flight.
 */
struct takeover_failed_state {
	/* Takeover request to be failed once the message send completes */
	struct tevent_req *req;
	/* Error code the takeover request is failed with */
	int ret;
};
1057
1058 void takeover_failed(struct tevent_req *req, int ret)
1059 {
1060         struct takeover_state *state = tevent_req_data(
1061                 req, struct takeover_state);
1062         struct tevent_req *subreq;
1063         uint32_t max_pnn = CTDB_UNKNOWN_PNN;
1064         int max_credits = 0;
1065         int pnn;
1066
1067         /* Check that bans are enabled */
1068         if (state->tun_list->enable_bans == 0) {
1069                 tevent_req_error(req, ret);
1070                 return;
1071         }
1072
1073         for (pnn = 0; pnn < state->num_nodes; pnn++) {
1074                 if (state->ban_credits[pnn] > max_credits) {
1075                         max_pnn = pnn;
1076                         max_credits = state->ban_credits[pnn];
1077                 }
1078         }
1079
1080         if (max_credits > 0) {
1081                 struct ctdb_req_message message;
1082                 struct takeover_failed_state *substate;
1083
1084                 D_WARNING("Assigning banning credits to node %u\n", max_pnn);
1085
1086                 substate = talloc_zero(state, struct takeover_failed_state);
1087                 if (tevent_req_nomem(substate, req)) {
1088                         return;
1089                 }
1090                 substate->req = req;
1091                 substate->ret = ret;
1092
1093                 message.srvid = CTDB_SRVID_BANNING;
1094                 message.data.pnn = max_pnn;
1095
1096                 subreq = ctdb_client_message_send(
1097                         state, state->ev, state->client,
1098                         ctdb_client_pnn(state->client),
1099                         &message);
1100                 if (subreq == NULL) {
1101                         D_ERR("failed to assign banning credits\n");
1102                         tevent_req_error(req, ret);
1103                         return;
1104                 }
1105                 tevent_req_set_callback(subreq, takeover_failed_done, substate);
1106         } else {
1107                 tevent_req_error(req, ret);
1108         }
1109 }
1110
1111 static void takeover_failed_done(struct tevent_req *subreq)
1112 {
1113         struct takeover_failed_state *substate = tevent_req_callback_data(
1114                 subreq, struct takeover_failed_state);
1115         struct tevent_req *req = substate->req;
1116         int ret;
1117         bool status;
1118
1119         status = ctdb_client_message_recv(subreq, &ret);
1120         TALLOC_FREE(subreq);
1121         if (! status) {
1122                 D_ERR("failed to assign banning credits, ret=%d\n", ret);
1123         }
1124
1125         ret = substate->ret;
1126         talloc_free(substate);
1127         tevent_req_error(req, ret);
1128 }
1129
/*
 * Collect the result of a takeover request.
 *
 * Thin wrapper around generic_recv(); its boolean result is discarded,
 * so callers see the outcome only through *perr.
 */
static void takeover_recv(struct tevent_req *req, int *perr)
{
	(void) generic_recv(req, perr);
}
1134
1135 static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
1136 {
1137         char *strv = NULL;
1138         int num, i, ret;
1139         char *t;
1140         uint32_t *nodes;
1141
1142         ret = strv_split(mem_ctx, &strv, s, ",");
1143         if (ret != 0) {
1144                 D_ERR("out of memory\n");
1145                 return NULL;
1146         }
1147
1148         num = strv_count(strv);
1149
1150         nodes = talloc_array(mem_ctx, uint32_t, num);
1151         if (nodes == NULL) {
1152                 D_ERR("out of memory\n");
1153                 return NULL;
1154         }
1155
1156         t = NULL;
1157         for (i = 0; i < num; i++) {
1158                 t = strv_next(strv, t);
1159                 nodes[i] = atoi(t);
1160         }
1161
1162         return nodes;
1163 }
1164
/*
 * Print the command line synopsis to stderr.
 *
 * progname is the program name shown in the message.
 */
static void usage(const char *progname)
{
	fprintf(stderr,
		"\n"
		"Usage: %s <output-fd> <ctdb-socket-path> "
		"[<force-rebalance-nodes>]"
		"\n",
		progname);
}
1172
/*
 * Arguments - write fd, socket path and, optionally, a comma-separated
 * list of nodes to force rebalance
 */
1176 int main(int argc, const char *argv[])
1177 {
1178         int write_fd;
1179         const char *sockpath;
1180         TALLOC_CTX *mem_ctx;
1181         struct tevent_context *ev;
1182         struct ctdb_client_context *client;
1183         int ret;
1184         struct tevent_req *req;
1185         uint32_t *force_rebalance_nodes = NULL;
1186
1187         if (argc < 3 || argc > 4) {
1188                 usage(argv[0]);
1189                 exit(1);
1190         }
1191
1192         write_fd = atoi(argv[1]);
1193         sockpath = argv[2];
1194
1195         mem_ctx = talloc_new(NULL);
1196         if (mem_ctx == NULL) {
1197                 fprintf(stderr, "talloc_new() failed\n");
1198                 ret = ENOMEM;
1199                 goto done;
1200         }
1201
1202         if (argc == 4) {
1203                 force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
1204                 if (force_rebalance_nodes == NULL) {
1205                         usage(argv[0]);
1206                         ret = EINVAL;
1207                         goto done;
1208                 }
1209         }
1210
1211         ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
1212         if (ret != 0) {
1213                 fprintf(stderr,
1214                         "ctdb-takeover: Unable to initialize logging\n");
1215                 goto done;
1216         }
1217
1218         ev = tevent_context_init(mem_ctx);
1219         if (ev == NULL) {
1220                 D_ERR("tevent_context_init() failed\n");
1221                 ret = ENOMEM;
1222                 goto done;
1223         }
1224
1225         ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
1226         if (ret != 0) {
1227                 D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
1228                 goto done;
1229         }
1230
1231         req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
1232         if (req == NULL) {
1233                 D_ERR("takeover_send() failed\n");
1234                 ret = 1;
1235                 goto done;
1236         }
1237
1238         if (! tevent_req_poll(req, ev)) {
1239                 D_ERR("tevent_req_poll() failed\n");
1240                 ret = 1;
1241                 goto done;
1242         }
1243
1244         takeover_recv(req, &ret);
1245         TALLOC_FREE(req);
1246         if (ret != 0) {
1247                 D_ERR("takeover run failed, ret=%d\n", ret);
1248         }
1249
1250 done:
1251         sys_write_v(write_fd, &ret, sizeof(ret));
1252
1253         talloc_free(mem_ctx);
1254         return ret;
1255 }