ctdb-daemon: Mark NoIPHostOnAllDisabled tunable as obsolete
[samba.git] / ctdb / server / ctdb_takeover_helper.c
1 /*
2    CTDB IP takeover helper
3
4    Copyright (C) Martin Schwenke  2016
5
6    Based on ctdb_recovery_helper.c
7    Copyright (C) Amitay Isaacs  2015
8
9    and ctdb_takeover.c
10    Copyright (C) Ronnie Sahlberg  2007
11    Copyright (C) Andrew Tridgell  2007
12    Copyright (C) Martin Schwenke  2011
13
14    This program is free software; you can redistribute it and/or modify
15    it under the terms of the GNU General Public License as published by
16    the Free Software Foundation; either version 3 of the License, or
17    (at your option) any later version.
18
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License for more details.
23
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28 #include "replace.h"
29 #include "system/network.h"
30 #include "system/filesys.h"
31
32 #include <popt.h>
33 #include <talloc.h>
34 #include <tevent.h>
35
36 #include "lib/util/debug.h"
37 #include "lib/util/strv.h"
38 #include "lib/util/strv_util.h"
39 #include "lib/util/sys_rw.h"
40 #include "lib/util/time.h"
41 #include "lib/util/tevent_unix.h"
42
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
45 #include "protocol/protocol_util.h"
46 #include "client/client.h"
47
48 #include "common/logging.h"
49
50 #include "server/ipalloc.h"
51
/*
 * Timeout applied to controls sent by this helper.  The built-in value
 * is only used until the tunables have been fetched; after that it is
 * replaced by the takeover_timeout tunable.
 */
static int takeover_timeout = 9;

/* Deadline that lies takeover_timeout in the future, measured from now */
#define TIMEOUT()	timeval_current_ofs(takeover_timeout, 0)
55
56 /*
57  * Utility functions
58  */
59
/*
 * Shared receive function for requests that return no data.
 *
 * Returns true when req finished without error.  Otherwise returns
 * false and, if perr is not NULL, stores the unix error code in *perr.
 */
static bool generic_recv(struct tevent_req *req, int *perr)
{
	int unix_err;

	if (! tevent_req_is_unix_error(req, &unix_err)) {
		return true;
	}

	if (perr != NULL) {
		*perr = unix_err;
	}

	return false;
}
73
74 static enum ipalloc_algorithm
75 determine_algorithm(const struct ctdb_tunable_list *tunables)
76 {
77         switch (tunables->ip_alloc_algorithm) {
78         case 0:
79                 return IPALLOC_DETERMINISTIC;
80         case 1:
81                 return IPALLOC_NONDETERMINISTIC;
82         case 2:
83                 return IPALLOC_LCP2;
84         default:
85                 return IPALLOC_LCP2;
86         };
87 }
88
89 /**********************************************************************/
90
/* State for fetching public IP lists from a set of nodes */
struct get_public_ips_state {
	/* PNNs of the nodes that are queried (owned by the caller) */
	uint32_t *pnns;
	/* Number of entries in pnns */
	int count;
	/* Results, one slot per node, indexed by PNN */
	struct ctdb_public_ip_list *ips;
	/* Per-node failure counters, indexed by PNN (owned by the caller) */
	uint32_t *ban_credits;
};
97
98 static void get_public_ips_done(struct tevent_req *subreq);
99
100 static struct tevent_req *get_public_ips_send(
101                                 TALLOC_CTX *mem_ctx,
102                                 struct tevent_context *ev,
103                                 struct ctdb_client_context *client,
104                                 uint32_t *pnns,
105                                 int count, int num_nodes,
106                                 uint32_t *ban_credits,
107                                 bool available_only)
108 {
109         struct tevent_req *req, *subreq;
110         struct get_public_ips_state *state;
111         struct ctdb_req_control request;
112
113         req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
114         if (req == NULL) {
115                 return NULL;
116         }
117
118         state->pnns = pnns;
119         state->count = count;
120         state->ban_credits = ban_credits;
121
122         state->ips  = talloc_zero_array(state,
123                                         struct ctdb_public_ip_list,
124                                         num_nodes);
125         if (tevent_req_nomem(state->ips, req)) {
126                 return tevent_req_post(req, ev);
127         }
128
129         /* Short circuit if no nodes being asked for IPs */
130         if (state->count == 0) {
131                 tevent_req_done(req);
132                 return tevent_req_post(req, ev);
133         }
134
135         ctdb_req_control_get_public_ips(&request, available_only);
136         subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
137                                                 state->pnns,
138                                                 state->count,
139                                                 TIMEOUT(), &request);
140         if (tevent_req_nomem(subreq, req)) {
141                 return tevent_req_post(req, ev);
142         }
143         tevent_req_set_callback(subreq, get_public_ips_done, req);
144
145         return req;
146 }
147
148 static void get_public_ips_done(struct tevent_req *subreq)
149 {
150         struct tevent_req *req = tevent_req_callback_data(
151                 subreq, struct tevent_req);
152         struct get_public_ips_state *state = tevent_req_data(
153                 req, struct get_public_ips_state);
154         struct ctdb_reply_control **reply;
155         int *err_list;
156         int ret, i;
157         bool status, found_errors;
158
159         status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
160                                                 &reply);
161         TALLOC_FREE(subreq);
162         if (! status) {
163                 found_errors = false;
164                 for (i = 0; i < state->count; i++) {
165                         if (err_list[i] != 0) {
166                                 uint32_t pnn = state->pnns[i];
167
168                                 D_ERR("control GET_PUBLIC_IPS failed on "
169                                       "node %u, ret=%d\n", pnn, err_list[i]);
170
171                                 state->ban_credits[pnn]++;
172                                 found_errors = true;
173                         }
174                 }
175
176                 tevent_req_error(req, ret);
177                 return;
178         }
179
180         found_errors = false;
181         for (i = 0; i < state->count; i++) {
182                 uint32_t pnn;
183                 struct ctdb_public_ip_list *ips;
184
185                 pnn = state->pnns[i];
186                 ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
187                                                         &ips);
188                 if (ret != 0) {
189                         D_ERR("control GET_PUBLIC_IPS failed on "
190                               "node %u\n", pnn);
191                         state->ban_credits[pnn]++;
192                         found_errors = true;
193                         continue;
194                 }
195
196                 D_INFO("Fetched public IPs from node %u\n", pnn);
197                 state->ips[pnn] = *ips;
198         }
199
200         if (found_errors) {
201                 tevent_req_error(req, EIO);
202                 return;
203         }
204
205         talloc_free(reply);
206
207         tevent_req_done(req);
208 }
209
210 static bool get_public_ips_recv(struct tevent_req *req, int *perr,
211                                 TALLOC_CTX *mem_ctx,
212                                 struct ctdb_public_ip_list **ips)
213 {
214         struct get_public_ips_state *state = tevent_req_data(
215                 req, struct get_public_ips_state);
216         int err;
217
218         if (tevent_req_is_unix_error(req, &err)) {
219                 if (perr != NULL) {
220                         *perr = err;
221                 }
222                 return false;
223         }
224
225         *ips = talloc_steal(mem_ctx, state->ips);
226
227         return true;
228 }
229
230 /**********************************************************************/
231
/* Overall state for releasing IPs across the cluster */
struct release_ip_state {
	/* Number of per-IP multi-controls that were sent */
	int num_sent;
	/* Number of per-IP multi-controls that have completed */
	int num_replies;
	/* Number of per-IP multi-controls that failed */
	int num_fails;
	/* Most recently recorded error, reported if num_fails > 0 */
	int err_any;
	/* Per-node failure counters, indexed by PNN (owned by the caller) */
	uint32_t *ban_credits;
};
239
/* State for the RELEASE_IP multi-control of a single address */
struct release_ip_one_state {
	/* Parent request that tracks all addresses */
	struct tevent_req *req;
	/* Nodes the control for this address was sent to */
	uint32_t *pnns;
	/* Number of valid entries in pnns */
	int count;
	/* Printable form of the address, used in log messages */
	const char *ip_str;
};
246
247 static void release_ip_done(struct tevent_req *subreq);
248
249 static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
250                                           struct tevent_context *ev,
251                                           struct ctdb_client_context *client,
252                                           uint32_t *pnns,
253                                           int count,
254                                           struct timeval timeout,
255                                           struct public_ip_list *all_ips,
256                                           uint32_t *ban_credits)
257 {
258         struct tevent_req *req, *subreq;
259         struct release_ip_state *state;
260         struct ctdb_req_control request;
261         struct public_ip_list *tmp_ip;
262
263         req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
264         if (req == NULL) {
265                 return NULL;
266         }
267
268         state->num_sent = 0;
269         state->num_replies = 0;
270         state->num_fails = 0;
271         state->ban_credits = ban_credits;
272
273         /* Send a RELEASE_IP to all nodes that should not be hosting
274          * each IP.  For each IP, all but one of these will be
275          * redundant.  However, the redundant ones are used to tell
276          * nodes which node should be hosting the IP so that commands
277          * like "ctdb ip" can display a particular nodes idea of who
278          * is hosting what. */
279         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
280                 struct release_ip_one_state *substate;
281                 struct ctdb_public_ip ip;
282                 int i;
283
284                 substate = talloc_zero(state, struct release_ip_one_state);
285                 if (tevent_req_nomem(substate, req)) {
286                         return tevent_req_post(req, ev);
287                 }
288
289                 substate->pnns = talloc_zero_array(substate, uint32_t, count);
290                 if (tevent_req_nomem(substate->pnns, req)) {
291                         return tevent_req_post(req, ev);
292                 }
293
294                 substate->count = 0;
295                 substate->req = req;
296
297                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
298                                                              &tmp_ip->addr,
299                                                              false);
300                 if (tevent_req_nomem(substate->ip_str, req)) {
301                         return tevent_req_post(req, ev);
302                 }
303
304                 for (i = 0; i < count; i++) {
305                         uint32_t pnn = pnns[i];
306
307                         /* Skip this node if IP is not known */
308                         if (! bitmap_query(tmp_ip->known_on, pnn)) {
309                                 continue;
310                         }
311
312                         /* If pnn is not the node that should be
313                          * hosting the IP then add it to the list of
314                          * nodes that need to do a release. */
315                         if (tmp_ip->pnn != pnn) {
316                                 substate->pnns[substate->count] = pnn;
317                                 substate->count++;
318                         }
319                 }
320
321                 if (substate->count == 0) {
322                         /* No releases to send for this address... */
323                         TALLOC_FREE(substate);
324                         continue;
325                 }
326
327                 ip.pnn = tmp_ip->pnn;
328                 ip.addr = tmp_ip->addr;
329                 ctdb_req_control_release_ip(&request, &ip);
330                 subreq = ctdb_client_control_multi_send(state, ev, client,
331                                                         substate->pnns,
332                                                         substate->count,
333                                                         timeout,/* cumulative */
334                                                         &request);
335                 if (tevent_req_nomem(subreq, req)) {
336                         return tevent_req_post(req, ev);
337                 }
338                 tevent_req_set_callback(subreq, release_ip_done, substate);
339
340                 state->num_sent++;
341         }
342
343         /* None sent, finished... */
344         if (state->num_sent == 0) {
345                 tevent_req_done(req);
346                 return tevent_req_post(req, ev);
347         }
348
349         return req;
350 }
351
352 static void release_ip_done(struct tevent_req *subreq)
353 {
354         struct release_ip_one_state *substate = tevent_req_callback_data(
355                 subreq, struct release_ip_one_state);
356         struct tevent_req *req = substate->req;
357         struct release_ip_state *state = tevent_req_data(
358                 req, struct release_ip_state);
359         int ret, i;
360         int *err_list;
361         bool status, found_errors;
362
363         status = ctdb_client_control_multi_recv(subreq, &ret, state,
364                                                 &err_list, NULL);
365         TALLOC_FREE(subreq);
366
367         if (status) {
368                 D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
369                        substate->ip_str, substate->count);
370                 goto done;
371         }
372
373         /* Get some clear error messages out of err_list and count
374          * banning credits
375          */
376         found_errors = false;
377         for (i = 0; i < substate->count; i++) {
378                 int err = err_list[i];
379                 if (err != 0) {
380                         uint32_t pnn = substate->pnns[i];
381
382                         D_ERR("RELEASE_IP %s failed on node %u, "
383                               "ret=%d\n", substate->ip_str, pnn, err);
384
385                         state->ban_credits[pnn]++;
386                         state->err_any = err;
387                         found_errors = true;
388                 }
389         }
390         if (! found_errors) {
391                 D_ERR("RELEASE_IP %s internal error, ret=%d\n",
392                       substate->ip_str, ret);
393                 state->err_any = EIO;
394         }
395
396         state->num_fails++;
397
398 done:
399         talloc_free(substate);
400
401         state->num_replies++;
402
403         if (state->num_replies < state->num_sent) {
404                 /* Not all replies received, don't go further */
405                 return;
406         }
407
408         if (state->num_fails > 0) {
409                 tevent_req_error(req, state->err_any);
410                 return;
411         }
412
413         tevent_req_done(req);
414 }
415
/*
 * Collect the result of release_ip_send().  Returns true on success;
 * on failure returns false and reports the error through perr.
 */
static bool release_ip_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
420
421 /**********************************************************************/
422
/* Overall state for having nodes take over their assigned IPs */
struct take_ip_state {
	/* Number of TAKEOVER_IP controls that were sent */
	int num_sent;
	/* Number of TAKEOVER_IP controls that have completed */
	int num_replies;
	/* Number of TAKEOVER_IP controls that failed */
	int num_fails;
	/* Most recently recorded error, reported if num_fails > 0 */
	int err_any;
	/* Per-node failure counters, indexed by PNN (owned by the caller) */
	uint32_t *ban_credits;
};
430
/* State for the TAKEOVER_IP control of a single address */
struct take_ip_one_state {
	/* Parent request that tracks all addresses */
	struct tevent_req *req;
	/* Node the control was sent to */
	uint32_t pnn;
	/* Printable form of the address, used in log messages */
	const char *ip_str;
};
436
437 static void take_ip_done(struct tevent_req *subreq);
438
439 static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
440                                        struct tevent_context *ev,
441                                        struct ctdb_client_context *client,
442                                        struct timeval timeout,
443                                        struct public_ip_list *all_ips,
444                                        uint32_t *ban_credits)
445 {
446         struct tevent_req *req, *subreq;
447         struct take_ip_state *state;
448         struct ctdb_req_control request;
449         struct public_ip_list *tmp_ip;
450
451         req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
452         if (req == NULL) {
453                 return NULL;
454         }
455
456         state->num_sent = 0;
457         state->num_replies = 0;
458         state->num_fails = 0;
459         state->ban_credits = ban_credits;
460
461         /* For each IP, send a TAKOVER_IP to the node that should be
462          * hosting it.  Many of these will often be redundant (since
463          * the allocation won't have changed) but they can be useful
464          * to recover from inconsistencies. */
465         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
466                 struct take_ip_one_state *substate;
467                 struct ctdb_public_ip ip;
468
469                 if (tmp_ip->pnn == -1) {
470                         /* IP will be unassigned */
471                         continue;
472                 }
473
474                 substate = talloc_zero(state, struct take_ip_one_state);
475                 if (tevent_req_nomem(substate, req)) {
476                         return tevent_req_post(req, ev);
477                 }
478
479                 substate->req = req;
480                 substate->pnn = tmp_ip->pnn;
481
482                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
483                                                              &tmp_ip->addr,
484                                                              false);
485                 if (tevent_req_nomem(substate->ip_str, req)) {
486                         return tevent_req_post(req, ev);
487                 }
488
489                 ip.pnn = tmp_ip->pnn;
490                 ip.addr = tmp_ip->addr;
491                 ctdb_req_control_takeover_ip(&request, &ip);
492                 subreq = ctdb_client_control_send(
493                                         state, ev, client, tmp_ip->pnn,
494                                         timeout, /* cumulative */
495                                         &request);
496                 if (tevent_req_nomem(subreq, req)) {
497                         return tevent_req_post(req, ev);
498                 }
499                 tevent_req_set_callback(subreq, take_ip_done, substate);
500
501                 state->num_sent++;
502         }
503
504         /* None sent, finished... */
505         if (state->num_sent == 0) {
506                 tevent_req_done(req);
507                 return tevent_req_post(req, ev);
508         }
509
510         return req;
511 }
512
513 static void take_ip_done(struct tevent_req *subreq)
514 {
515         struct take_ip_one_state *substate = tevent_req_callback_data(
516                 subreq, struct take_ip_one_state);
517         struct tevent_req *req = substate->req;
518         struct ctdb_reply_control *reply;
519         struct take_ip_state *state = tevent_req_data(
520                 req, struct take_ip_state);
521         int ret = 0;
522         bool status;
523
524         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
525         TALLOC_FREE(subreq);
526
527         if (! status) {
528                 D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
529                       substate->ip_str, substate->pnn, ret);
530                 goto fail;
531         }
532
533         ret = ctdb_reply_control_takeover_ip(reply);
534         if (ret != 0) {
535                 D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
536                       substate->ip_str, substate->pnn, ret);
537                 goto fail;
538         }
539
540         D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
541                substate->ip_str, substate->pnn);
542         goto done;
543
544 fail:
545         state->ban_credits[substate->pnn]++;
546         state->num_fails++;
547         state->err_any = ret;
548
549 done:
550         talloc_free(substate);
551
552         state->num_replies++;
553
554         if (state->num_replies < state->num_sent) {
555                 /* Not all replies received, don't go further */
556                 return;
557         }
558
559         if (state->num_fails > 0) {
560                 tevent_req_error(req, state->err_any);
561                 return;
562         }
563
564         tevent_req_done(req);
565 }
566
/*
 * Collect the result of take_ip_send().  Returns true on success; on
 * failure returns false and reports the error through perr.
 */
static bool take_ip_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
571
572 /**********************************************************************/
573
/* State for sending IPREALLOCATED to a set of nodes */
struct ipreallocated_state {
	/* PNNs of the nodes the control is sent to (owned by the caller) */
	uint32_t *pnns;
	/* Number of entries in pnns */
	int count;
	/* Per-node failure counters, indexed by PNN (owned by the caller) */
	uint32_t *ban_credits;
};
579
580 static void ipreallocated_done(struct tevent_req *subreq);
581
582 static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
583                                              struct tevent_context *ev,
584                                              struct ctdb_client_context *client,
585                                              uint32_t *pnns,
586                                              int count,
587                                              struct timeval timeout,
588                                              uint32_t *ban_credits)
589 {
590         struct tevent_req *req, *subreq;
591         struct ipreallocated_state *state;
592         struct ctdb_req_control request;
593
594         req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
595         if (req == NULL) {
596                 return NULL;
597         }
598
599         state->pnns = pnns;
600         state->count = count;
601         state->ban_credits = ban_credits;
602
603         ctdb_req_control_ipreallocated(&request);
604         subreq = ctdb_client_control_multi_send(state, ev, client,
605                                                 pnns, count,
606                                                 timeout, /* cumulative */
607                                                 &request);
608         if (tevent_req_nomem(subreq, req)) {
609                 return tevent_req_post(req, ev);
610         }
611         tevent_req_set_callback(subreq, ipreallocated_done, req);
612
613         return req;
614 }
615
616 static void ipreallocated_done(struct tevent_req *subreq)
617 {
618         struct tevent_req *req = tevent_req_callback_data(
619                 subreq, struct tevent_req);
620         struct ipreallocated_state *state = tevent_req_data(
621                 req, struct ipreallocated_state);
622         int *err_list = NULL;
623         int ret, i;
624         bool status, found_errors;
625
626         status = ctdb_client_control_multi_recv(subreq, &ret, state,
627                                                 &err_list, NULL);
628         TALLOC_FREE(subreq);
629
630         if (status) {
631                 D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
632                 tevent_req_done(req);
633                 return;
634         }
635
636         /* Get some clear error messages out of err_list and count
637          * banning credits
638          */
639         found_errors = false;
640         for (i = 0; i < state->count; i++) {
641                 int err = err_list[i];
642                 if (err != 0) {
643                         uint32_t pnn = state->pnns[i];
644
645                         D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
646                               pnn, err);
647
648                         state->ban_credits[pnn]++;
649                         found_errors = true;
650                 }
651         }
652
653         if (! found_errors) {
654                 D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
655         }
656
657         tevent_req_error(req, ret);
658 }
659
/*
 * Collect the result of ipreallocated_send().  Returns true on
 * success; on failure returns false and reports the error through perr.
 */
static bool ipreallocated_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
664
665 /**********************************************************************/
666
667 /*
668  * Recalculate the allocation of public IPs to nodes and have the
669  * nodes host their allocated addresses.
670  *
671  * - Get tunables
672  * - Get nodemap
673  * - Initialise IP allocation state.  Pass:
674  *   + algorithm to be used;
675  *   + various tunables (NoIPTakeover, NoIPFailback)
676  *   + list of nodes to force rebalance (internal structure, currently
677  *     no way to fetch, only used by LCP2 for nodes that have had new
678  *     IP addresses added).
679  * - Set IP flags for IP allocation based on node map
680  * - Retrieve known and available IP addresses (done separately so
681  *   values can be faked in unit testing)
682  * - Use ipalloc_set_public_ips() to set known and available IP
683  *   addresses for allocation
684  * - If cluster can't host IP addresses then jump to IPREALLOCATED
685  * - Run IP allocation algorithm
686  * - Send RELEASE_IP to all nodes for IPs they should not host
687  * - Send TAKEOVER_IP to all nodes for IPs they should host
688  * - Send IPREALLOCATED to all nodes
689  */
690
/* State carried through the whole takeover run */
struct takeover_state {
	/* Event context and client connection used for all controls */
	struct tevent_context *ev;
	struct ctdb_client_context *client;
	/* Deadline handed to the later stages of the run */
	struct timeval timeout;
	/* Number of entries in the node map */
	int num_nodes;
	/* Connected nodes, as derived from the node map */
	uint32_t *pnns_connected;
	int num_connected;
	/* Active nodes, as derived from the node map */
	uint32_t *pnns_active;
	int num_active;
	/* PNN of the node this client is connected to; tunables and
	 * node map are fetched from it */
	uint32_t destnode;
	/* Nodes to force a rebalance on, passed to IP allocation */
	uint32_t *force_rebalance_nodes;
	/* Tunables fetched via GET_ALL_TUNABLES */
	struct ctdb_tunable_list *tun_list;
	/* IP allocation state, not set up when IP failover is disabled */
	struct ipalloc_state *ipalloc_state;
	/* Known public IPs per node, as fetched from connected nodes */
	struct ctdb_public_ip_list *known_ips;
	/* Presumably the IP layout produced by the allocation step;
	 * it is filled in outside this part of the file - TODO confirm */
	struct public_ip_list *all_ips;
	/* Per-node failure counters, num_nodes entries, indexed by PNN */
	uint32_t *ban_credits;
};
708
709 static void takeover_tunables_done(struct tevent_req *subreq);
710 static void takeover_nodemap_done(struct tevent_req *subreq);
711 static void takeover_known_ips_done(struct tevent_req *subreq);
712 static void takeover_avail_ips_done(struct tevent_req *subreq);
713 static void takeover_release_ip_done(struct tevent_req *subreq);
714 static void takeover_take_ip_done(struct tevent_req *subreq);
715 static void takeover_ipreallocated(struct tevent_req *req);
716 static void takeover_ipreallocated_done(struct tevent_req *subreq);
717 static void takeover_failed(struct tevent_req *subreq, int ret);
718 static void takeover_failed_done(struct tevent_req *subreq);
719
720 static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
721                                         struct tevent_context *ev,
722                                         struct ctdb_client_context *client,
723                                         uint32_t *force_rebalance_nodes)
724 {
725         struct tevent_req *req, *subreq;
726         struct takeover_state *state;
727         struct ctdb_req_control request;
728
729         req = tevent_req_create(mem_ctx, &state, struct takeover_state);
730         if (req == NULL) {
731                 return NULL;
732         }
733
734         state->ev = ev;
735         state->client = client;
736         state->force_rebalance_nodes = force_rebalance_nodes;
737         state->destnode = ctdb_client_pnn(client);
738
739         ctdb_req_control_get_all_tunables(&request);
740         subreq = ctdb_client_control_send(state, state->ev, state->client,
741                                           state->destnode, TIMEOUT(),
742                                           &request);
743         if (tevent_req_nomem(subreq, req)) {
744                 return tevent_req_post(req, ev);
745         }
746         tevent_req_set_callback(subreq, takeover_tunables_done, req);
747
748         return req;
749 }
750
751 static void takeover_tunables_done(struct tevent_req *subreq)
752 {
753         struct tevent_req *req = tevent_req_callback_data(
754                 subreq, struct tevent_req);
755         struct takeover_state *state = tevent_req_data(
756                 req, struct takeover_state);
757         struct ctdb_reply_control *reply;
758         struct ctdb_req_control request;
759         int ret;
760         bool status;
761
762         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
763         TALLOC_FREE(subreq);
764         if (! status) {
765                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
766                 tevent_req_error(req, ret);
767                 return;
768         }
769
770         ret = ctdb_reply_control_get_all_tunables(reply, state,
771                                                   &state->tun_list);
772         if (ret != 0) {
773                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
774                 tevent_req_error(req, ret);
775                 return;
776         }
777
778         talloc_free(reply);
779
780         takeover_timeout = state->tun_list->takeover_timeout;
781
782         ctdb_req_control_get_nodemap(&request);
783         subreq = ctdb_client_control_send(state, state->ev, state->client,
784                                           state->destnode, TIMEOUT(),
785                                           &request);
786         if (tevent_req_nomem(subreq, req)) {
787                 return;
788         }
789         tevent_req_set_callback(subreq, takeover_nodemap_done, req);
790 }
791
792 static void takeover_nodemap_done(struct tevent_req *subreq)
793 {
794         struct tevent_req *req = tevent_req_callback_data(
795                 subreq, struct tevent_req);
796         struct takeover_state *state = tevent_req_data(
797                 req, struct takeover_state);
798         struct ctdb_reply_control *reply;
799         bool status;
800         int ret;
801         struct ctdb_node_map *nodemap;
802
803         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
804         TALLOC_FREE(subreq);
805         if (! status) {
806                 D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
807                         state->destnode, ret);
808                 tevent_req_error(req, ret);
809                 return;
810         }
811
812         ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
813         if (ret != 0) {
814                 D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
815                 tevent_req_error(req, ret);
816                 return;
817         }
818
819         state->num_nodes = nodemap->num;
820
821         state->num_connected = list_of_connected_nodes(nodemap,
822                                                        CTDB_UNKNOWN_PNN, state,
823                                                        &state->pnns_connected);
824         if (state->num_connected <= 0) {
825                 tevent_req_error(req, ENOMEM);
826                 return;
827         }
828
829         state->num_active = list_of_active_nodes(nodemap,
830                                                  CTDB_UNKNOWN_PNN, state,
831                                                  &state->pnns_active);
832         if (state->num_active <= 0) {
833                 tevent_req_error(req, ENOMEM);
834                 return;
835         }
836
837         /* Default timeout for early jump to IPREALLOCATED.  See below
838          * for explanation of 3 times...
839          */
840         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
841
842         state->ban_credits = talloc_zero_array(state, uint32_t,
843                                                state->num_nodes);
844         if (tevent_req_nomem(state->ban_credits, req)) {
845                 return;
846         }
847
848         if (state->tun_list->disable_ip_failover != 0) {
849                 /* IP failover is completely disabled so just send out
850                  * ipreallocated event.
851                  */
852                 takeover_ipreallocated(req);
853                 return;
854         }
855
856         state->ipalloc_state =
857                 ipalloc_state_init(
858                         state, state->num_nodes,
859                         determine_algorithm(state->tun_list),
860                         (state->tun_list->no_ip_takeover != 0),
861                         (state->tun_list->no_ip_failback != 0),
862                         (state->tun_list->no_ip_host_on_all_disabled != 0),
863                         state->force_rebalance_nodes);
864         if (tevent_req_nomem(state->ipalloc_state, req)) {
865                 return;
866         }
867
868         ipalloc_set_node_flags(state->ipalloc_state, nodemap);
869
870         subreq = get_public_ips_send(state, state->ev, state->client,
871                                      state->pnns_connected, state->num_connected,
872                                      state->num_nodes, state->ban_credits,
873                                      false);
874         if (tevent_req_nomem(subreq, req)) {
875                 return;
876         }
877
878         tevent_req_set_callback(subreq, takeover_known_ips_done, req);
879 }
880
881 static void takeover_known_ips_done(struct tevent_req *subreq)
882 {
883         struct tevent_req *req = tevent_req_callback_data(
884                 subreq, struct tevent_req);
885         struct takeover_state *state = tevent_req_data(
886                 req, struct takeover_state);
887         int ret;
888         bool status;
889         uint32_t *pnns = NULL;
890         int count, i;
891
892         status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
893         TALLOC_FREE(subreq);
894
895         if (! status) {
896                 D_ERR("Failed to fetch known public IPs\n");
897                 takeover_failed(req, ret);
898                 return;
899         }
900
901         /* Get available IPs from active nodes that actually have known IPs */
902
903         pnns = talloc_zero_array(state, uint32_t, state->num_active);
904         if (tevent_req_nomem(pnns, req)) {
905                 return;
906         }
907
908         count = 0;
909         for (i = 0; i < state->num_active; i++) {
910                 uint32_t pnn = state->pnns_active[i];
911
912                 /* If pnn has IPs then fetch available IPs from it */
913                 if (state->known_ips[pnn].num > 0) {
914                         pnns[count] = pnn;
915                         count++;
916                 }
917         }
918
919         subreq = get_public_ips_send(state, state->ev, state->client,
920                                      pnns, count,
921                                      state->num_nodes, state->ban_credits,
922                                      true);
923         if (tevent_req_nomem(subreq, req)) {
924                 return;
925         }
926
927         tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
928 }
929
930 static void takeover_avail_ips_done(struct tevent_req *subreq)
931 {
932         struct tevent_req *req = tevent_req_callback_data(
933                 subreq, struct tevent_req);
934         struct takeover_state *state = tevent_req_data(
935                 req, struct takeover_state);
936         bool status;
937         int ret;
938         struct ctdb_public_ip_list *available_ips;
939
940         status = get_public_ips_recv(subreq, &ret, state, &available_ips);
941         TALLOC_FREE(subreq);
942
943         if (! status) {
944                 D_ERR("Failed to fetch available public IPs\n");
945                 takeover_failed(req, ret);
946                 return;
947         }
948
949         ipalloc_set_public_ips(state->ipalloc_state,
950                                state->known_ips, available_ips);
951
952         if (! ipalloc_can_host_ips(state->ipalloc_state)) {
953                 D_NOTICE("No nodes available to host public IPs yet\n");
954                 takeover_ipreallocated(req);
955                 return;
956         }
957
958         /* Do the IP reassignment calculations */
959         state->all_ips = ipalloc(state->ipalloc_state);
960         if (tevent_req_nomem(state->all_ips, req)) {
961                 return;
962         }
963
964         /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
965          * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
966          * seconds.  However, RELEASE_IP can take longer due to TCP
967          * connection killing, so sometimes needs more time.
968          * Therefore, use a cumulative timeout of TakeoverTimeout * 3
969          * seconds across all 3 stages.  No explicit expiry checks are
970          * needed before each stage because tevent is smart enough to
971          * fire the timeouts even if they are in the past.  Initialise
972          * this here so it explicitly covers the stages we're
973          * interested in but, in particular, not the time taken by the
974          * ipalloc().
975          */
976         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
977
978         subreq = release_ip_send(state, state->ev, state->client,
979                                  state->pnns_connected, state->num_connected,
980                                  state->timeout, state->all_ips,
981                                  state->ban_credits);
982         if (tevent_req_nomem(subreq, req)) {
983                 return;
984         }
985         tevent_req_set_callback(subreq, takeover_release_ip_done, req);
986 }
987
988 static void takeover_release_ip_done(struct tevent_req *subreq)
989 {
990         struct tevent_req *req = tevent_req_callback_data(
991                 subreq, struct tevent_req);
992         struct takeover_state *state = tevent_req_data(
993                 req, struct takeover_state);
994         int ret;
995         bool status;
996
997         status = release_ip_recv(subreq, &ret);
998         TALLOC_FREE(subreq);
999
1000         if (! status) {
1001                 takeover_failed(req, ret);
1002                 return;
1003         }
1004
1005         /* All released, now for takeovers */
1006
1007         subreq = take_ip_send(state, state->ev, state->client,
1008                               state->timeout, state->all_ips,
1009                               state->ban_credits);
1010         if (tevent_req_nomem(subreq, req)) {
1011                 return;
1012         }
1013         tevent_req_set_callback(subreq, takeover_take_ip_done, req);
1014 }
1015
1016 static void takeover_take_ip_done(struct tevent_req *subreq)
1017 {
1018         struct tevent_req *req = tevent_req_callback_data(
1019                 subreq, struct tevent_req);
1020         int ret = 0;
1021         bool status;
1022
1023         status = take_ip_recv(subreq, &ret);
1024         TALLOC_FREE(subreq);
1025
1026         if (! status) {
1027                 takeover_failed(req, ret);
1028                 return;
1029         }
1030
1031         takeover_ipreallocated(req);
1032 }
1033
1034 static void takeover_ipreallocated(struct tevent_req *req)
1035 {
1036         struct takeover_state *state = tevent_req_data(
1037                 req, struct takeover_state);
1038         struct tevent_req *subreq;
1039
1040         subreq = ipreallocated_send(state, state->ev, state->client,
1041                                     state->pnns_connected,
1042                                     state->num_connected,
1043                                     state->timeout,
1044                                     state->ban_credits);
1045         if (tevent_req_nomem(subreq, req)) {
1046                 return;
1047         }
1048         tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
1049 }
1050
1051 static void takeover_ipreallocated_done(struct tevent_req *subreq)
1052 {
1053         struct tevent_req *req = tevent_req_callback_data(
1054                 subreq, struct tevent_req);
1055         int ret;
1056         bool status;
1057
1058         status = ipreallocated_recv(subreq, &ret);
1059         TALLOC_FREE(subreq);
1060
1061         if (! status) {
1062                 takeover_failed(req, ret);
1063                 return;
1064         }
1065
1066         tevent_req_done(req);
1067 }
1068
/*
 * Context carried across the banning-credits message sent by
 * takeover_failed(), so that takeover_failed_done() can fail the
 * original request with the original error.
 */
struct takeover_failed_state {
        /* The takeover request that is being failed */
        struct tevent_req *req;
        /* Error code to report once the message has been handled */
        int ret;
};
1073
1074 void takeover_failed(struct tevent_req *req, int ret)
1075 {
1076         struct takeover_state *state = tevent_req_data(
1077                 req, struct takeover_state);
1078         struct tevent_req *subreq;
1079         uint32_t max_pnn = CTDB_UNKNOWN_PNN;
1080         int max_credits = 0;
1081         int pnn;
1082
1083         /* Check that bans are enabled */
1084         if (state->tun_list->enable_bans == 0) {
1085                 tevent_req_error(req, ret);
1086                 return;
1087         }
1088
1089         for (pnn = 0; pnn < state->num_nodes; pnn++) {
1090                 if (state->ban_credits[pnn] > max_credits) {
1091                         max_pnn = pnn;
1092                         max_credits = state->ban_credits[pnn];
1093                 }
1094         }
1095
1096         if (max_credits > 0) {
1097                 struct ctdb_req_message message;
1098                 struct takeover_failed_state *substate;
1099
1100                 D_WARNING("Assigning banning credits to node %u\n", max_pnn);
1101
1102                 substate = talloc_zero(state, struct takeover_failed_state);
1103                 if (tevent_req_nomem(substate, req)) {
1104                         return;
1105                 }
1106                 substate->req = req;
1107                 substate->ret = ret;
1108
1109                 message.srvid = CTDB_SRVID_BANNING;
1110                 message.data.pnn = max_pnn;
1111
1112                 subreq = ctdb_client_message_send(
1113                         state, state->ev, state->client,
1114                         ctdb_client_pnn(state->client),
1115                         &message);
1116                 if (subreq == NULL) {
1117                         D_ERR("failed to assign banning credits\n");
1118                         tevent_req_error(req, ret);
1119                         return;
1120                 }
1121                 tevent_req_set_callback(subreq, takeover_failed_done, substate);
1122         } else {
1123                 tevent_req_error(req, ret);
1124         }
1125 }
1126
1127 static void takeover_failed_done(struct tevent_req *subreq)
1128 {
1129         struct takeover_failed_state *substate = tevent_req_callback_data(
1130                 subreq, struct takeover_failed_state);
1131         struct tevent_req *req = substate->req;
1132         int ret;
1133         bool status;
1134
1135         status = ctdb_client_message_recv(subreq, &ret);
1136         TALLOC_FREE(subreq);
1137         if (! status) {
1138                 D_ERR("failed to assign banning credits, ret=%d\n", ret);
1139         }
1140
1141         ret = substate->ret;
1142         talloc_free(substate);
1143         tevent_req_error(req, ret);
1144 }
1145
/*
 * Collect the result of a takeover request.
 *
 * Thin wrapper around generic_recv(); the boolean result of that
 * helper is deliberately discarded, so callers must inspect *perr.
 */
static void takeover_recv(struct tevent_req *req, int *perr)
{
        (void) generic_recv(req, perr);
}
1150
1151 static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
1152 {
1153         char *strv = NULL;
1154         int num, i, ret;
1155         char *t;
1156         uint32_t *nodes;
1157
1158         ret = strv_split(mem_ctx, &strv, s, ",");
1159         if (ret != 0) {
1160                 D_ERR("out of memory\n");
1161                 return NULL;
1162         }
1163
1164         num = strv_count(strv);
1165
1166         nodes = talloc_array(mem_ctx, uint32_t, num);
1167         if (nodes == NULL) {
1168                 D_ERR("out of memory\n");
1169                 return NULL;
1170         }
1171
1172         t = NULL;
1173         for (i = 0; i < num; i++) {
1174                 t = strv_next(strv, t);
1175                 nodes[i] = atoi(t);
1176         }
1177
1178         return nodes;
1179 }
1180
/*
 * Print a one-line usage summary for the helper to stderr.
 */
static void usage(const char *progname)
{
        fprintf(stderr,
                "\nUsage: %s "
                "<output-fd> <ctdb-socket-path> [<force-rebalance-nodes>]\n",
                progname);
}
1188
/*
 * Arguments - write fd, socket path and, optionally, a comma-separated
 * list of nodes to force rebalancing to
 */
1192 int main(int argc, const char *argv[])
1193 {
1194         int write_fd;
1195         const char *sockpath;
1196         TALLOC_CTX *mem_ctx;
1197         struct tevent_context *ev;
1198         struct ctdb_client_context *client;
1199         int ret;
1200         struct tevent_req *req;
1201         uint32_t *force_rebalance_nodes = NULL;
1202
1203         if (argc < 3 || argc > 4) {
1204                 usage(argv[0]);
1205                 exit(1);
1206         }
1207
1208         write_fd = atoi(argv[1]);
1209         sockpath = argv[2];
1210
1211         mem_ctx = talloc_new(NULL);
1212         if (mem_ctx == NULL) {
1213                 fprintf(stderr, "talloc_new() failed\n");
1214                 ret = ENOMEM;
1215                 goto done;
1216         }
1217
1218         if (argc == 4) {
1219                 force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
1220                 if (force_rebalance_nodes == NULL) {
1221                         usage(argv[0]);
1222                         ret = EINVAL;
1223                         goto done;
1224                 }
1225         }
1226
1227         ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
1228         if (ret != 0) {
1229                 fprintf(stderr,
1230                         "ctdb-takeover: Unable to initialize logging\n");
1231                 goto done;
1232         }
1233
1234         ev = tevent_context_init(mem_ctx);
1235         if (ev == NULL) {
1236                 D_ERR("tevent_context_init() failed\n");
1237                 ret = ENOMEM;
1238                 goto done;
1239         }
1240
1241         ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
1242         if (ret != 0) {
1243                 D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
1244                 goto done;
1245         }
1246
1247         req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
1248         if (req == NULL) {
1249                 D_ERR("takeover_send() failed\n");
1250                 ret = 1;
1251                 goto done;
1252         }
1253
1254         if (! tevent_req_poll(req, ev)) {
1255                 D_ERR("tevent_req_poll() failed\n");
1256                 ret = 1;
1257                 goto done;
1258         }
1259
1260         takeover_recv(req, &ret);
1261         TALLOC_FREE(req);
1262         if (ret != 0) {
1263                 D_ERR("takeover run failed, ret=%d\n", ret);
1264         }
1265
1266 done:
1267         sys_write_v(write_fd, &ret, sizeof(ret));
1268
1269         talloc_free(mem_ctx);
1270         return ret;
1271 }