ctdb-takeover: Assign banning credits on failure to fetch public IPs
[sfrench/samba-autobuild/.git] / ctdb / server / ctdb_takeover_helper.c
1 /*
2    CTDB IP takeover helper
3
4    Copyright (C) Martin Schwenke  2016
5
6    Based on ctdb_recovery_helper.c
7    Copyright (C) Amitay Isaacs  2015
8
9    and ctdb_takeover.c
10    Copyright (C) Ronnie Sahlberg  2007
11    Copyright (C) Andrew Tridgell  2007
12    Copyright (C) Martin Schwenke  2011
13
14    This program is free software; you can redistribute it and/or modify
15    it under the terms of the GNU General Public License as published by
16    the Free Software Foundation; either version 3 of the License, or
17    (at your option) any later version.
18
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License for more details.
23
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28 #include "replace.h"
29 #include "system/network.h"
30 #include "system/filesys.h"
31
32 #include <popt.h>
33 #include <talloc.h>
34 #include <tevent.h>
35
36 #include "lib/util/debug.h"
37 #include "lib/util/strv.h"
38 #include "lib/util/strv_util.h"
39 #include "lib/util/sys_rw.h"
40 #include "lib/util/time.h"
41 #include "lib/util/tevent_unix.h"
42
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
45 #include "client/client.h"
46
47 #include "common/logging.h"
48
49 #include "server/ipalloc.h"
50
/* Timeout value used for individual controls.  9 is only the initial
 * default: takeover_tunables_done() overwrites it with the
 * takeover_timeout tunable once GET_ALL_TUNABLES has been answered. */
static int takeover_timeout = 9;

/* struct timeval obtained by passing takeover_timeout as the first
 * argument of timeval_current_ofs() (presumably whole seconds from
 * now - confirm against lib/util/time.h) */
#define TIMEOUT()       timeval_current_ofs(takeover_timeout, 0)
54
55 /*
56  * Utility functions
57  */
58
/*
 * Shared receive helper for requests that carry no result data.
 *
 * Returns true when the request did not finish with a Unix error.
 * Otherwise returns false and, if perr is non-NULL, stores the error
 * code in *perr.
 */
static bool generic_recv(struct tevent_req *req, int *perr)
{
        int unix_err;

        if (! tevent_req_is_unix_error(req, &unix_err)) {
                return true;
        }

        if (perr != NULL) {
                *perr = unix_err;
        }

        return false;
}
72
73 static enum ipalloc_algorithm
74 determine_algorithm(const struct ctdb_tunable_list *tunables)
75 {
76         switch (tunables->ip_alloc_algorithm) {
77         case 0:
78                 return IPALLOC_DETERMINISTIC;
79         case 1:
80                 return IPALLOC_NONDETERMINISTIC;
81         case 2:
82                 return IPALLOC_LCP2;
83         default:
84                 return IPALLOC_LCP2;
85         };
86 }
87
88 /**********************************************************************/
89
/* State for the request that fetches public IPs from a set of nodes */
struct get_public_ips_state {
        /* PNNs of the nodes being queried (pointer supplied by caller) */
        uint32_t *pnns;
        /* Number of entries in pnns */
        int count;
        /* Per-node results, indexed by PNN; allocated with num_nodes
         * zeroed entries in get_public_ips_send() */
        struct ctdb_public_ip_list *ips;
        /* Caller-supplied counters, indexed by PNN, incremented for
         * each node whose GET_PUBLIC_IPS control fails */
        uint32_t *ban_credits;
};

static void get_public_ips_done(struct tevent_req *subreq);
98
99 static struct tevent_req *get_public_ips_send(
100                                 TALLOC_CTX *mem_ctx,
101                                 struct tevent_context *ev,
102                                 struct ctdb_client_context *client,
103                                 uint32_t *pnns,
104                                 int count, int num_nodes,
105                                 uint32_t *ban_credits,
106                                 bool available_only)
107 {
108         struct tevent_req *req, *subreq;
109         struct get_public_ips_state *state;
110         struct ctdb_req_control request;
111
112         req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
113         if (req == NULL) {
114                 return NULL;
115         }
116
117         state->pnns = pnns;
118         state->count = count;
119         state->ban_credits = ban_credits;
120
121         state->ips  = talloc_zero_array(state,
122                                         struct ctdb_public_ip_list,
123                                         num_nodes);
124         if (tevent_req_nomem(state->ips, req)) {
125                 return tevent_req_post(req, ev);
126         }
127
128         /* Short circuit if no nodes being asked for IPs */
129         if (state->count == 0) {
130                 tevent_req_done(req);
131                 return tevent_req_post(req, ev);
132         }
133
134         ctdb_req_control_get_public_ips(&request, available_only);
135         subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
136                                                 state->pnns,
137                                                 state->count,
138                                                 TIMEOUT(), &request);
139         if (tevent_req_nomem(subreq, req)) {
140                 return tevent_req_post(req, ev);
141         }
142         tevent_req_set_callback(subreq, get_public_ips_done, req);
143
144         return req;
145 }
146
147 static void get_public_ips_done(struct tevent_req *subreq)
148 {
149         struct tevent_req *req = tevent_req_callback_data(
150                 subreq, struct tevent_req);
151         struct get_public_ips_state *state = tevent_req_data(
152                 req, struct get_public_ips_state);
153         struct ctdb_reply_control **reply;
154         int *err_list;
155         int ret, i;
156         bool status, found_errors;
157
158         status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
159                                                 &reply);
160         TALLOC_FREE(subreq);
161         if (! status) {
162                 found_errors = false;
163                 for (i = 0; i < state->count; i++) {
164                         if (err_list[i] != 0) {
165                                 uint32_t pnn = state->pnns[i];
166
167                                 D_ERR("control GET_PUBLIC_IPS failed on "
168                                       "node %u, ret=%d\n", pnn, err_list[i]);
169
170                                 state->ban_credits[pnn]++;
171                                 found_errors = true;
172                         }
173                 }
174
175                 tevent_req_error(req, ret);
176                 return;
177         }
178
179         found_errors = false;
180         for (i = 0; i < state->count; i++) {
181                 uint32_t pnn;
182                 struct ctdb_public_ip_list *ips;
183
184                 pnn = state->pnns[i];
185                 ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
186                                                         &ips);
187                 if (ret != 0) {
188                         D_ERR("control GET_PUBLIC_IPS failed on "
189                               "node %u\n", pnn);
190                         state->ban_credits[pnn]++;
191                         found_errors = true;
192                         continue;
193                 }
194
195                 D_INFO("Fetched public IPs from node %u\n", pnn);
196                 state->ips[pnn] = *ips;
197         }
198
199         if (found_errors) {
200                 tevent_req_error(req, EIO);
201                 return;
202         }
203
204         talloc_free(reply);
205
206         tevent_req_done(req);
207 }
208
209 static bool get_public_ips_recv(struct tevent_req *req, int *perr,
210                                 TALLOC_CTX *mem_ctx,
211                                 struct ctdb_public_ip_list **ips)
212 {
213         struct get_public_ips_state *state = tevent_req_data(
214                 req, struct get_public_ips_state);
215         int err;
216
217         if (tevent_req_is_unix_error(req, &err)) {
218                 if (perr != NULL) {
219                         *perr = err;
220                 }
221                 return false;
222         }
223
224         *ips = talloc_steal(mem_ctx, state->ips);
225
226         return true;
227 }
228
229 /**********************************************************************/
230
/* Overall state for the request that sends RELEASE_IP for every IP */
struct release_ip_state {
        /* Number of per-IP multi-controls sent */
        int num_sent;
        /* Number of per-IP multi-controls that have completed */
        int num_replies;
        /* Number of per-IP multi-controls that failed */
        int num_fails;
        /* Most recently recorded error; used as the request's error
         * when num_fails is non-zero */
        int err_any;
        /* Caller-supplied counters, indexed by PNN, incremented for
         * each node on which a RELEASE_IP fails */
        uint32_t *ban_credits;
};
238
/* Per-IP state, used as callback data for one RELEASE_IP multi-control */
struct release_ip_one_state {
        /* The overall release_ip request this belongs to */
        struct tevent_req *req;
        /* Nodes the control was sent to, i.e. those that should not
         * host this IP */
        uint32_t *pnns;
        /* Number of valid entries in pnns */
        int count;
        /* Printable form of the IP address, for log messages */
        const char *ip_str;
};

static void release_ip_done(struct tevent_req *subreq);
247
248 static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
249                                           struct tevent_context *ev,
250                                           struct ctdb_client_context *client,
251                                           uint32_t *pnns,
252                                           int count,
253                                           struct timeval timeout,
254                                           struct public_ip_list *all_ips,
255                                           uint32_t *ban_credits)
256 {
257         struct tevent_req *req, *subreq;
258         struct release_ip_state *state;
259         struct ctdb_req_control request;
260         struct public_ip_list *tmp_ip;
261
262         req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
263         if (req == NULL) {
264                 return NULL;
265         }
266
267         state->num_sent = 0;
268         state->num_replies = 0;
269         state->num_fails = 0;
270         state->ban_credits = ban_credits;
271
272         /* Send a RELEASE_IP to all nodes that should not be hosting
273          * each IP.  For each IP, all but one of these will be
274          * redundant.  However, the redundant ones are used to tell
275          * nodes which node should be hosting the IP so that commands
276          * like "ctdb ip" can display a particular nodes idea of who
277          * is hosting what. */
278         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
279                 struct release_ip_one_state *substate;
280                 struct ctdb_public_ip ip;
281                 int i;
282
283                 substate = talloc_zero(state, struct release_ip_one_state);
284                 if (tevent_req_nomem(substate, req)) {
285                         return tevent_req_post(req, ev);
286                 }
287
288                 substate->pnns = talloc_zero_array(substate, uint32_t, count);
289                 if (tevent_req_nomem(substate->pnns, req)) {
290                         return tevent_req_post(req, ev);
291                 }
292
293                 substate->count = 0;
294                 substate->req = req;
295
296                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
297                                                              &tmp_ip->addr);
298                 if (tevent_req_nomem(substate->ip_str, req)) {
299                         return tevent_req_post(req, ev);
300                 }
301
302                 for (i = 0; i < count; i++) {
303                         uint32_t pnn = pnns[i];
304                         /* If pnn is not the node that should be
305                          * hosting the IP then add it to the list of
306                          * nodes that need to do a release. */
307                         if (tmp_ip->pnn != pnn) {
308                                 substate->pnns[substate->count] = pnn;
309                                 substate->count++;
310                         }
311                 }
312
313                 if (substate->count == 0) {
314                         /* No releases to send for this address... */
315                         TALLOC_FREE(substate);
316                         continue;
317                 }
318
319                 ip.pnn = tmp_ip->pnn;
320                 ip.addr = tmp_ip->addr;
321                 ctdb_req_control_release_ip(&request, &ip);
322                 subreq = ctdb_client_control_multi_send(state, ev, client,
323                                                         substate->pnns,
324                                                         substate->count,
325                                                         timeout,/* cumulative */
326                                                         &request);
327                 if (tevent_req_nomem(subreq, req)) {
328                         return tevent_req_post(req, ev);
329                 }
330                 tevent_req_set_callback(subreq, release_ip_done, substate);
331
332                 state->num_sent++;
333         }
334
335         /* None sent, finished... */
336         if (state->num_sent == 0) {
337                 tevent_req_done(req);
338                 return tevent_req_post(req, ev);
339         }
340
341         return req;
342 }
343
344 static void release_ip_done(struct tevent_req *subreq)
345 {
346         struct release_ip_one_state *substate = tevent_req_callback_data(
347                 subreq, struct release_ip_one_state);
348         struct tevent_req *req = substate->req;
349         struct release_ip_state *state = tevent_req_data(
350                 req, struct release_ip_state);
351         int ret, i;
352         int *err_list;
353         bool status, found_errors;
354
355         status = ctdb_client_control_multi_recv(subreq, &ret, state,
356                                                 &err_list, NULL);
357         TALLOC_FREE(subreq);
358
359         if (status) {
360                 D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
361                        substate->ip_str, substate->count);
362                 goto done;
363         }
364
365         /* Get some clear error messages out of err_list and count
366          * banning credits
367          */
368         found_errors = false;
369         for (i = 0; i < substate->count; i++) {
370                 int err = err_list[i];
371                 if (err != 0) {
372                         uint32_t pnn = substate->pnns[i];
373
374                         D_ERR("RELEASE_IP %s failed on node %u, "
375                               "ret=%d\n", substate->ip_str, pnn, err);
376
377                         state->ban_credits[pnn]++;
378                         state->err_any = err;
379                         found_errors = true;
380                 }
381         }
382         if (! found_errors) {
383                 D_ERR("RELEASE_IP %s internal error, ret=%d\n",
384                       substate->ip_str, ret);
385                 state->err_any = EIO;
386         }
387
388         state->num_fails++;
389
390 done:
391         talloc_free(substate);
392
393         state->num_replies++;
394
395         if (state->num_replies < state->num_sent) {
396                 /* Not all replies received, don't go further */
397                 return;
398         }
399
400         if (state->num_fails > 0) {
401                 tevent_req_error(req, state->err_any);
402                 return;
403         }
404
405         tevent_req_done(req);
406 }
407
/* Collect the outcome of a release_ip request; see generic_recv() */
static bool release_ip_recv(struct tevent_req *req, int *perr)
{
        bool ok = generic_recv(req, perr);

        return ok;
}
412
413 /**********************************************************************/
414
/* Overall state for the request that sends TAKEOVER_IP for every IP */
struct take_ip_state {
        /* Number of TAKEOVER_IP controls sent */
        int num_sent;
        /* Number of TAKEOVER_IP controls that have completed */
        int num_replies;
        /* Number of TAKEOVER_IP controls that failed */
        int num_fails;
        /* Most recently recorded error; used as the request's error
         * when num_fails is non-zero */
        int err_any;
        /* Caller-supplied counters, indexed by PNN, incremented for
         * each failed TAKEOVER_IP */
        uint32_t *ban_credits;
};
422
/* Per-IP state, used as callback data for one TAKEOVER_IP control */
struct take_ip_one_state {
        /* The overall take_ip request this belongs to */
        struct tevent_req *req;
        /* Node the control was sent to, i.e. the one that should host
         * this IP */
        uint32_t pnn;
        /* Printable form of the IP address, for log messages */
        const char *ip_str;
};

static void take_ip_done(struct tevent_req *subreq);
430
431 static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
432                                        struct tevent_context *ev,
433                                        struct ctdb_client_context *client,
434                                        struct timeval timeout,
435                                        struct public_ip_list *all_ips,
436                                        uint32_t *ban_credits)
437 {
438         struct tevent_req *req, *subreq;
439         struct take_ip_state *state;
440         struct ctdb_req_control request;
441         struct public_ip_list *tmp_ip;
442
443         req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
444         if (req == NULL) {
445                 return NULL;
446         }
447
448         state->num_sent = 0;
449         state->num_replies = 0;
450         state->num_fails = 0;
451         state->ban_credits = ban_credits;
452
453         /* For each IP, send a TAKOVER_IP to the node that should be
454          * hosting it.  Many of these will often be redundant (since
455          * the allocation won't have changed) but they can be useful
456          * to recover from inconsistencies. */
457         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
458                 struct take_ip_one_state *substate;
459                 struct ctdb_public_ip ip;
460
461                 if (tmp_ip->pnn == -1) {
462                         /* IP will be unassigned */
463                         continue;
464                 }
465
466                 substate = talloc_zero(state, struct take_ip_one_state);
467                 if (tevent_req_nomem(substate, req)) {
468                         return tevent_req_post(req, ev);
469                 }
470
471                 substate->req = req;
472                 substate->pnn = tmp_ip->pnn;
473
474                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
475                                                              &tmp_ip->addr);
476                 if (tevent_req_nomem(substate->ip_str, req)) {
477                         return tevent_req_post(req, ev);
478                 }
479
480                 ip.pnn = tmp_ip->pnn;
481                 ip.addr = tmp_ip->addr;
482                 ctdb_req_control_takeover_ip(&request, &ip);
483                 subreq = ctdb_client_control_send(
484                                         state, ev, client, tmp_ip->pnn,
485                                         timeout, /* cumulative */
486                                         &request);
487                 if (tevent_req_nomem(subreq, req)) {
488                         return tevent_req_post(req, ev);
489                 }
490                 tevent_req_set_callback(subreq, take_ip_done, substate);
491
492                 state->num_sent++;
493         }
494
495         /* None sent, finished... */
496         if (state->num_sent == 0) {
497                 tevent_req_done(req);
498                 return tevent_req_post(req, ev);
499         }
500
501         return req;
502 }
503
504 static void take_ip_done(struct tevent_req *subreq)
505 {
506         struct take_ip_one_state *substate = tevent_req_callback_data(
507                 subreq, struct take_ip_one_state);
508         struct tevent_req *req = substate->req;
509         struct ctdb_reply_control *reply;
510         struct take_ip_state *state = tevent_req_data(
511                 req, struct take_ip_state);
512         int ret = 0;
513         bool status;
514
515         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
516         TALLOC_FREE(subreq);
517
518         if (! status) {
519                 D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
520                       substate->ip_str, substate->pnn, ret);
521                 goto fail;
522         }
523
524         ret = ctdb_reply_control_takeover_ip(reply);
525         if (ret != 0) {
526                 D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
527                       substate->ip_str, substate->pnn, ret);
528                 goto fail;
529         }
530
531         D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
532                substate->ip_str, substate->pnn);
533         goto done;
534
535 fail:
536         state->ban_credits[substate->pnn]++;
537         state->num_fails++;
538         state->err_any = ret;
539
540 done:
541         talloc_free(substate);
542
543         state->num_replies++;
544
545         if (state->num_replies < state->num_sent) {
546                 /* Not all replies received, don't go further */
547                 return;
548         }
549
550         if (state->num_fails > 0) {
551                 tevent_req_error(req, state->err_any);
552                 return;
553         }
554
555         tevent_req_done(req);
556 }
557
/* Collect the outcome of a take_ip request; see generic_recv() */
static bool take_ip_recv(struct tevent_req *req, int *perr)
{
        bool ok = generic_recv(req, perr);

        return ok;
}
562
563 /**********************************************************************/
564
/* State for the request that sends IPREALLOCATED to a set of nodes */
struct ipreallocated_state {
        /* PNNs of the nodes the control is sent to (pointer supplied
         * by caller) */
        uint32_t *pnns;
        /* Number of entries in pnns */
        int count;
        /* Caller-supplied counters, indexed by PNN, incremented for
         * each node on which IPREALLOCATED fails */
        uint32_t *ban_credits;
};

static void ipreallocated_done(struct tevent_req *subreq);
572
573 static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
574                                              struct tevent_context *ev,
575                                              struct ctdb_client_context *client,
576                                              uint32_t *pnns,
577                                              int count,
578                                              struct timeval timeout,
579                                              uint32_t *ban_credits)
580 {
581         struct tevent_req *req, *subreq;
582         struct ipreallocated_state *state;
583         struct ctdb_req_control request;
584
585         req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
586         if (req == NULL) {
587                 return NULL;
588         }
589
590         state->pnns = pnns;
591         state->count = count;
592         state->ban_credits = ban_credits;
593
594         ctdb_req_control_ipreallocated(&request);
595         subreq = ctdb_client_control_multi_send(state, ev, client,
596                                                 pnns, count,
597                                                 timeout, /* cumulative */
598                                                 &request);
599         if (tevent_req_nomem(subreq, req)) {
600                 return tevent_req_post(req, ev);
601         }
602         tevent_req_set_callback(subreq, ipreallocated_done, req);
603
604         return req;
605 }
606
607 static void ipreallocated_done(struct tevent_req *subreq)
608 {
609         struct tevent_req *req = tevent_req_callback_data(
610                 subreq, struct tevent_req);
611         struct ipreallocated_state *state = tevent_req_data(
612                 req, struct ipreallocated_state);
613         int *err_list = NULL;
614         int ret, i;
615         bool status, found_errors;
616
617         status = ctdb_client_control_multi_recv(subreq, &ret, state,
618                                                 &err_list, NULL);
619         TALLOC_FREE(subreq);
620
621         if (status) {
622                 D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
623                 tevent_req_done(req);
624                 return;
625         }
626
627         /* Get some clear error messages out of err_list and count
628          * banning credits
629          */
630         found_errors = false;
631         for (i = 0; i < state->count; i++) {
632                 int err = err_list[i];
633                 if (err != 0) {
634                         uint32_t pnn = state->pnns[i];
635
636                         D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
637                               pnn, err);
638
639                         state->ban_credits[pnn]++;
640                         found_errors = true;
641                 }
642         }
643
644         if (! found_errors) {
645                 D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
646         }
647
648         tevent_req_error(req, ret);
649 }
650
/* Collect the outcome of an ipreallocated request; see generic_recv() */
static bool ipreallocated_recv(struct tevent_req *req, int *perr)
{
        bool ok = generic_recv(req, perr);

        return ok;
}
655
656 /**********************************************************************/
657
658 /*
659  * Recalculate the allocation of public IPs to nodes and have the
660  * nodes host their allocated addresses.
661  *
662  * - Get tunables
663  * - Get nodemap
664  * - Initialise IP allocation state.  Pass:
665  *   + algorithm to be used;
666  *   + various tunables (NoIPTakeover, NoIPFailback, NoIPHostOnAllDisabled)
667  *   + list of nodes to force rebalance (internal structure, currently
668  *     no way to fetch, only used by LCP2 for nodes that have had new
669  *     IP addresses added).
670  * - Set IP flags for IP allocation based on node map
671  * - Retrieve known and available IP addresses (done separately so
672  *   values can be faked in unit testing)
673  * - Use ipalloc_set_public_ips() to set known and available IP
674  *   addresses for allocation
675  * - If cluster can't host IP addresses then jump to IPREALLOCATED
676  * - Run IP allocation algorithm
677  * - Send RELEASE_IP to all nodes for IPs they should not host
 * - Send TAKEOVER_IP to all nodes for IPs they should host
679  * - Send IPREALLOCATED to all nodes
680  */
681
/* State carried through the whole takeover run */
struct takeover_state {
        /* Event context and client connection used for every control */
        struct tevent_context *ev;
        struct ctdb_client_context *client;
        /* Timeout handed to the later phases; initialised in
         * takeover_nodemap_done() to 3 * takeover_timeout from then */
        struct timeval timeout;
        /* Number of entries in the node map */
        int num_nodes;
        /* Connected nodes, as returned by list_of_connected_nodes() */
        uint32_t *pnns_connected;
        int num_connected;
        /* Active nodes, as returned by list_of_active_nodes() */
        uint32_t *pnns_active;
        int num_active;
        /* Node the tunables and node map are fetched from; set from
         * ctdb_client_pnn() */
        uint32_t destnode;
        /* Nodes to force rebalancing for; passed unchanged to
         * ipalloc_state_init() */
        uint32_t *force_rebalance_nodes;
        /* Tunables parsed from the GET_ALL_TUNABLES reply */
        struct ctdb_tunable_list *tun_list;
        /* IP allocation state created by ipalloc_state_init() */
        struct ipalloc_state *ipalloc_state;
        /* Result of the get_public_ips request issued with
         * available_only == false */
        struct ctdb_public_ip_list *known_ips;
        /* NOTE(review): not assigned in the part of the file visible
         * here; presumably the allocation result that is handed to
         * release_ip_send()/take_ip_send() - confirm */
        struct public_ip_list *all_ips;
        /* Per-node banning credits, num_nodes entries, zero-initialised */
        uint32_t *ban_credits;
};

/* Steps of the takeover request, defined below */
static void takeover_tunables_done(struct tevent_req *subreq);
static void takeover_nodemap_done(struct tevent_req *subreq);
static void takeover_known_ips_done(struct tevent_req *subreq);
static void takeover_avail_ips_done(struct tevent_req *subreq);
static void takeover_release_ip_done(struct tevent_req *subreq);
static void takeover_take_ip_done(struct tevent_req *subreq);
static void takeover_ipreallocated(struct tevent_req *req);
static void takeover_ipreallocated_done(struct tevent_req *subreq);
static void takeover_failed(struct tevent_req *subreq, int ret);
static void takeover_failed_done(struct tevent_req *subreq);
710
711 static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
712                                         struct tevent_context *ev,
713                                         struct ctdb_client_context *client,
714                                         uint32_t *force_rebalance_nodes)
715 {
716         struct tevent_req *req, *subreq;
717         struct takeover_state *state;
718         struct ctdb_req_control request;
719
720         req = tevent_req_create(mem_ctx, &state, struct takeover_state);
721         if (req == NULL) {
722                 return NULL;
723         }
724
725         state->ev = ev;
726         state->client = client;
727         state->force_rebalance_nodes = force_rebalance_nodes;
728         state->destnode = ctdb_client_pnn(client);
729
730         ctdb_req_control_get_all_tunables(&request);
731         subreq = ctdb_client_control_send(state, state->ev, state->client,
732                                           state->destnode, TIMEOUT(),
733                                           &request);
734         if (tevent_req_nomem(subreq, req)) {
735                 return tevent_req_post(req, ev);
736         }
737         tevent_req_set_callback(subreq, takeover_tunables_done, req);
738
739         return req;
740 }
741
742 static void takeover_tunables_done(struct tevent_req *subreq)
743 {
744         struct tevent_req *req = tevent_req_callback_data(
745                 subreq, struct tevent_req);
746         struct takeover_state *state = tevent_req_data(
747                 req, struct takeover_state);
748         struct ctdb_reply_control *reply;
749         struct ctdb_req_control request;
750         int ret;
751         bool status;
752
753         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
754         TALLOC_FREE(subreq);
755         if (! status) {
756                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
757                 tevent_req_error(req, ret);
758                 return;
759         }
760
761         ret = ctdb_reply_control_get_all_tunables(reply, state,
762                                                   &state->tun_list);
763         if (ret != 0) {
764                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
765                 tevent_req_error(req, ret);
766                 return;
767         }
768
769         talloc_free(reply);
770
771         takeover_timeout = state->tun_list->takeover_timeout;
772
773         ctdb_req_control_get_nodemap(&request);
774         subreq = ctdb_client_control_send(state, state->ev, state->client,
775                                           state->destnode, TIMEOUT(),
776                                           &request);
777         if (tevent_req_nomem(subreq, req)) {
778                 return;
779         }
780         tevent_req_set_callback(subreq, takeover_nodemap_done, req);
781 }
782
783 static void takeover_nodemap_done(struct tevent_req *subreq)
784 {
785         struct tevent_req *req = tevent_req_callback_data(
786                 subreq, struct tevent_req);
787         struct takeover_state *state = tevent_req_data(
788                 req, struct takeover_state);
789         struct ctdb_reply_control *reply;
790         bool status;
791         int ret;
792         struct ctdb_node_map *nodemap;
793
794         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
795         TALLOC_FREE(subreq);
796         if (! status) {
797                 D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
798                         state->destnode, ret);
799                 tevent_req_error(req, ret);
800                 return;
801         }
802
803         ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
804         if (ret != 0) {
805                 D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
806                 tevent_req_error(req, ret);
807                 return;
808         }
809
810         state->num_nodes = nodemap->num;
811
812         state->num_connected = list_of_connected_nodes(nodemap,
813                                                        CTDB_UNKNOWN_PNN, state,
814                                                        &state->pnns_connected);
815         if (state->num_connected <= 0) {
816                 tevent_req_error(req, ENOMEM);
817                 return;
818         }
819
820         state->num_active = list_of_active_nodes(nodemap,
821                                                  CTDB_UNKNOWN_PNN, state,
822                                                  &state->pnns_active);
823         if (state->num_active <= 0) {
824                 tevent_req_error(req, ENOMEM);
825                 return;
826         }
827
828         /* Default timeout for early jump to IPREALLOCATED.  See below
829          * for explanation of 3 times...
830          */
831         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
832
833         state->ban_credits = talloc_zero_array(state, uint32_t,
834                                                state->num_nodes);
835         if (tevent_req_nomem(state->ban_credits, req)) {
836                 return;
837         }
838
839         if (state->tun_list->disable_ip_failover != 0) {
840                 /* IP failover is completely disabled so just send out
841                  * ipreallocated event.
842                  */
843                 takeover_ipreallocated(req);
844                 return;
845         }
846
847         state->ipalloc_state =
848                 ipalloc_state_init(
849                         state, state->num_nodes,
850                         determine_algorithm(state->tun_list),
851                         (state->tun_list->no_ip_takeover != 0),
852                         (state->tun_list->no_ip_failback != 0),
853                         (state->tun_list->no_ip_host_on_all_disabled != 0),
854                         state->force_rebalance_nodes);
855         if (tevent_req_nomem(state->ipalloc_state, req)) {
856                 return;
857         }
858
859         ipalloc_set_node_flags(state->ipalloc_state, nodemap);
860
861         subreq = get_public_ips_send(state, state->ev, state->client,
862                                      state->pnns_active, state->num_active,
863                                      state->num_nodes, state->ban_credits,
864                                      false);
865         if (tevent_req_nomem(subreq, req)) {
866                 return;
867         }
868
869         tevent_req_set_callback(subreq, takeover_known_ips_done, req);
870 }
871
872 static void takeover_known_ips_done(struct tevent_req *subreq)
873 {
874         struct tevent_req *req = tevent_req_callback_data(
875                 subreq, struct tevent_req);
876         struct takeover_state *state = tevent_req_data(
877                 req, struct takeover_state);
878         int ret;
879         bool status;
880         uint32_t *pnns = NULL;
881         int count, i;
882
883         status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
884         TALLOC_FREE(subreq);
885
886         if (! status) {
887                 D_ERR("Failed to fetch known public IPs\n");
888                 takeover_failed(req, ret);
889                 return;
890         }
891
892         /* Get available IPs from active nodes that actually have known IPs */
893
894         pnns = talloc_zero_array(state, uint32_t, state->num_active);
895         if (tevent_req_nomem(pnns, req)) {
896                 return;
897         }
898
899         count = 0;
900         for (i = 0; i < state->num_active; i++) {
901                 uint32_t pnn = state->pnns_active[i];
902
903                 /* If pnn has IPs then fetch available IPs from it */
904                 if (state->known_ips[pnn].num > 0) {
905                         pnns[count] = pnn;
906                         count++;
907                 }
908         }
909
910         subreq = get_public_ips_send(state, state->ev, state->client,
911                                      pnns, count,
912                                      state->num_nodes, state->ban_credits,
913                                      true);
914         if (tevent_req_nomem(subreq, req)) {
915                 return;
916         }
917
918         tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
919 }
920
921 static void takeover_avail_ips_done(struct tevent_req *subreq)
922 {
923         struct tevent_req *req = tevent_req_callback_data(
924                 subreq, struct tevent_req);
925         struct takeover_state *state = tevent_req_data(
926                 req, struct takeover_state);
927         bool status;
928         int ret;
929         struct ctdb_public_ip_list *available_ips;
930
931         status = get_public_ips_recv(subreq, &ret, state, &available_ips);
932         TALLOC_FREE(subreq);
933
934         if (! status) {
935                 D_ERR("Failed to fetch available public IPs\n");
936                 takeover_failed(req, ret);
937                 return;
938         }
939
940         ipalloc_set_public_ips(state->ipalloc_state,
941                                state->known_ips, available_ips);
942
943         if (! ipalloc_can_host_ips(state->ipalloc_state)) {
944                 D_NOTICE("No nodes available to host public IPs yet\n");
945                 takeover_ipreallocated(req);
946                 return;
947         }
948
949         /* Do the IP reassignment calculations */
950         state->all_ips = ipalloc(state->ipalloc_state);
951         if (tevent_req_nomem(state->all_ips, req)) {
952                 return;
953         }
954
955         /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
956          * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
957          * seconds.  However, RELEASE_IP can take longer due to TCP
958          * connection killing, so sometimes needs more time.
959          * Therefore, use a cumulative timeout of TakeoverTimeout * 3
960          * seconds across all 3 stages.  No explicit expiry checks are
961          * needed before each stage because tevent is smart enough to
962          * fire the timeouts even if they are in the past.  Initialise
963          * this here so it explicitly covers the stages we're
964          * interested in but, in particular, not the time taken by the
965          * ipalloc().
966          */
967         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
968
969         subreq = release_ip_send(state, state->ev, state->client,
970                                  state->pnns_connected, state->num_connected,
971                                  state->timeout, state->all_ips,
972                                  state->ban_credits);
973         if (tevent_req_nomem(subreq, req)) {
974                 return;
975         }
976         tevent_req_set_callback(subreq, takeover_release_ip_done, req);
977 }
978
979 static void takeover_release_ip_done(struct tevent_req *subreq)
980 {
981         struct tevent_req *req = tevent_req_callback_data(
982                 subreq, struct tevent_req);
983         struct takeover_state *state = tevent_req_data(
984                 req, struct takeover_state);
985         int ret;
986         bool status;
987
988         status = release_ip_recv(subreq, &ret);
989         TALLOC_FREE(subreq);
990
991         if (! status) {
992                 takeover_failed(req, ret);
993                 return;
994         }
995
996         /* All released, now for takeovers */
997
998         subreq = take_ip_send(state, state->ev, state->client,
999                               state->timeout, state->all_ips,
1000                               state->ban_credits);
1001         if (tevent_req_nomem(subreq, req)) {
1002                 return;
1003         }
1004         tevent_req_set_callback(subreq, takeover_take_ip_done, req);
1005 }
1006
1007 static void takeover_take_ip_done(struct tevent_req *subreq)
1008 {
1009         struct tevent_req *req = tevent_req_callback_data(
1010                 subreq, struct tevent_req);
1011         int ret = 0;
1012         bool status;
1013
1014         status = take_ip_recv(subreq, &ret);
1015         TALLOC_FREE(subreq);
1016
1017         if (! status) {
1018                 takeover_failed(req, ret);
1019                 return;
1020         }
1021
1022         takeover_ipreallocated(req);
1023 }
1024
1025 static void takeover_ipreallocated(struct tevent_req *req)
1026 {
1027         struct takeover_state *state = tevent_req_data(
1028                 req, struct takeover_state);
1029         struct tevent_req *subreq;
1030
1031         subreq = ipreallocated_send(state, state->ev, state->client,
1032                                     state->pnns_connected,
1033                                     state->num_connected,
1034                                     state->timeout,
1035                                     state->ban_credits);
1036         if (tevent_req_nomem(subreq, req)) {
1037                 return;
1038         }
1039         tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
1040 }
1041
1042 static void takeover_ipreallocated_done(struct tevent_req *subreq)
1043 {
1044         struct tevent_req *req = tevent_req_callback_data(
1045                 subreq, struct tevent_req);
1046         int ret;
1047         bool status;
1048
1049         status = ipreallocated_recv(subreq, &ret);
1050         TALLOC_FREE(subreq);
1051
1052         if (! status) {
1053                 takeover_failed(req, ret);
1054                 return;
1055         }
1056
1057         tevent_req_done(req);
1058 }
1059
/*
 * Context carried across the banning message sent by takeover_failed()
 * so that takeover_failed_done() can fail the original request.
 */
struct takeover_failed_state {
	/* The takeover request that is being failed */
	struct tevent_req *req;
	/* Error code the takeover request will be failed with */
	int ret;
};
1064
1065 void takeover_failed(struct tevent_req *req, int ret)
1066 {
1067         struct takeover_state *state = tevent_req_data(
1068                 req, struct takeover_state);
1069         struct tevent_req *subreq;
1070         uint32_t max_pnn = CTDB_UNKNOWN_PNN;
1071         int max_credits = 0;
1072         int pnn;
1073
1074         /* Check that bans are enabled */
1075         if (state->tun_list->enable_bans == 0) {
1076                 tevent_req_error(req, ret);
1077                 return;
1078         }
1079
1080         for (pnn = 0; pnn < state->num_nodes; pnn++) {
1081                 if (state->ban_credits[pnn] > max_credits) {
1082                         max_pnn = pnn;
1083                         max_credits = state->ban_credits[pnn];
1084                 }
1085         }
1086
1087         if (max_credits > 0) {
1088                 struct ctdb_req_message message;
1089                 struct takeover_failed_state *substate;
1090
1091                 D_WARNING("Assigning banning credits to node %u\n", max_pnn);
1092
1093                 substate = talloc_zero(state, struct takeover_failed_state);
1094                 if (tevent_req_nomem(substate, req)) {
1095                         return;
1096                 }
1097                 substate->req = req;
1098                 substate->ret = ret;
1099
1100                 message.srvid = CTDB_SRVID_BANNING;
1101                 message.data.pnn = max_pnn;
1102
1103                 subreq = ctdb_client_message_send(
1104                         state, state->ev, state->client,
1105                         ctdb_client_pnn(state->client),
1106                         &message);
1107                 if (subreq == NULL) {
1108                         D_ERR("failed to assign banning credits\n");
1109                         tevent_req_error(req, ret);
1110                         return;
1111                 }
1112                 tevent_req_set_callback(subreq, takeover_failed_done, substate);
1113         } else {
1114                 tevent_req_error(req, ret);
1115         }
1116 }
1117
1118 static void takeover_failed_done(struct tevent_req *subreq)
1119 {
1120         struct takeover_failed_state *substate = tevent_req_callback_data(
1121                 subreq, struct takeover_failed_state);
1122         struct tevent_req *req = substate->req;
1123         int ret;
1124         bool status;
1125
1126         status = ctdb_client_message_recv(subreq, &ret);
1127         TALLOC_FREE(subreq);
1128         if (! status) {
1129                 D_ERR("failed to assign banning credits, ret=%d\n", ret);
1130         }
1131
1132         ret = substate->ret;
1133         talloc_free(substate);
1134         tevent_req_error(req, ret);
1135 }
1136
/*
 * Collect the result of a takeover request.
 *
 * Thin wrapper around generic_recv().  The boolean result is
 * discarded, so the outcome is only reported through perr.
 */
static void takeover_recv(struct tevent_req *req, int *perr)
{
	(void) generic_recv(req, perr);
}
1141
1142 static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
1143 {
1144         char *strv = NULL;
1145         int num, i, ret;
1146         char *t;
1147         uint32_t *nodes;
1148
1149         ret = strv_split(mem_ctx, &strv, s, ",");
1150         if (ret != 0) {
1151                 D_ERR("out of memory\n");
1152                 return NULL;
1153         }
1154
1155         num = strv_count(strv);
1156
1157         nodes = talloc_array(mem_ctx, uint32_t, num);
1158         if (nodes == NULL) {
1159                 D_ERR("out of memory\n");
1160                 return NULL;
1161         }
1162
1163         t = NULL;
1164         for (i = 0; i < num; i++) {
1165                 t = strv_next(strv, t);
1166                 nodes[i] = atoi(t);
1167         }
1168
1169         return nodes;
1170 }
1171
/*
 * Print a one-line command synopsis for progname to stderr.
 */
static void usage(const char *progname)
{
	fprintf(stderr,
		"\n"
		"Usage: %s <output-fd> <ctdb-socket-path> "
		"[<force-rebalance-nodes>]"
		"\n",
		progname);
}
1179
1180 /*
1181  * Arguments - write fd, socket path
1182  */
1183 int main(int argc, const char *argv[])
1184 {
1185         int write_fd;
1186         const char *sockpath;
1187         TALLOC_CTX *mem_ctx;
1188         struct tevent_context *ev;
1189         struct ctdb_client_context *client;
1190         int ret;
1191         struct tevent_req *req;
1192         uint32_t *force_rebalance_nodes = NULL;
1193
1194         if (argc < 3 || argc > 4) {
1195                 usage(argv[0]);
1196                 exit(1);
1197         }
1198
1199         write_fd = atoi(argv[1]);
1200         sockpath = argv[2];
1201
1202         mem_ctx = talloc_new(NULL);
1203         if (mem_ctx == NULL) {
1204                 fprintf(stderr, "talloc_new() failed\n");
1205                 ret = ENOMEM;
1206                 goto done;
1207         }
1208
1209         if (argc == 4) {
1210                 force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
1211                 if (force_rebalance_nodes == NULL) {
1212                         usage(argv[0]);
1213                         ret = EINVAL;
1214                         goto done;
1215                 }
1216         }
1217
1218         ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
1219         if (ret != 0) {
1220                 fprintf(stderr,
1221                         "ctdb-takeover: Unable to initialize logging\n");
1222                 goto done;
1223         }
1224
1225         ev = tevent_context_init(mem_ctx);
1226         if (ev == NULL) {
1227                 D_ERR("tevent_context_init() failed\n");
1228                 ret = ENOMEM;
1229                 goto done;
1230         }
1231
1232         ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
1233         if (ret != 0) {
1234                 D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
1235                 goto done;
1236         }
1237
1238         req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
1239         if (req == NULL) {
1240                 D_ERR("takeover_send() failed\n");
1241                 ret = 1;
1242                 goto done;
1243         }
1244
1245         if (! tevent_req_poll(req, ev)) {
1246                 D_ERR("tevent_req_poll() failed\n");
1247                 ret = 1;
1248                 goto done;
1249         }
1250
1251         takeover_recv(req, &ret);
1252         TALLOC_FREE(req);
1253         if (ret != 0) {
1254                 D_ERR("takeover run failed, ret=%d\n", ret);
1255         }
1256
1257 done:
1258         sys_write_v(write_fd, &ret, sizeof(ret));
1259
1260         talloc_free(mem_ctx);
1261         return ret;
1262 }