/*
   ctdb main protocol code

   Copyright (C) Andrew Tridgell  2006

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
21 #include "lib/tdb/include/tdb.h"
22 #include "lib/events/events.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "../include/ctdb_private.h"
29 choose the transport we will use
31 int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport)
33 ctdb->transport = talloc_strdup(ctdb, transport);
34 CTDB_NO_MEMORY(ctdb, ctdb->transport);
40 Check whether an ip is a valid node ip
41 Returns the node id for this ip address or -1
43 int ctdb_ip_to_nodeid(struct ctdb_context *ctdb, const char *nodeip)
47 for (nodeid=0;nodeid<ctdb->num_nodes;nodeid++) {
48 if (ctdb->nodes[nodeid]->flags & NODE_FLAGS_DELETED) {
51 if (!strcmp(ctdb->nodes[nodeid]->address.address, nodeip)) {
/*
  Install `file` as the recovery lock path: any previously stored path is
  freed, then a talloc copy of `file` (owned by ctdb) is stored in
  ctdb->recovery_lock_file.

  One branch logs an alert and clears tunable.verify_recovery_lock instead.
  NOTE(review): this listing has lost its brace/return lines and the
  condition guarding that branch, so whether it triggers on a NULL or an
  empty `file` cannot be confirmed from here - check the full source.
*/
60 choose the recovery lock file
62 int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file)
/* release the old path so installing a new one does not leak it */
64 if (ctdb->recovery_lock_file != NULL) {
65 talloc_free(ctdb->recovery_lock_file);
66 ctdb->recovery_lock_file = NULL;
/* no usable lock file: turn recovery lock verification off */
70 DEBUG(DEBUG_ALERT,("Recovery lock file set to \"\". Disabling recovery lock checking\n"));
71 ctdb->tunable.verify_recovery_lock = 0;
/* keep a private copy; CTDB_NO_MEMORY deals with allocation failure */
75 ctdb->recovery_lock_file = talloc_strdup(ctdb, file);
76 CTDB_NO_MEMORY(ctdb, ctdb->recovery_lock_file);
82 set the directory for the local databases
84 int ctdb_set_tdb_dir(struct ctdb_context *ctdb, const char *dir)
86 ctdb->db_directory = talloc_strdup(ctdb, dir);
87 if (ctdb->db_directory == NULL) {
94 set the directory for the persistent databases
96 int ctdb_set_tdb_dir_persistent(struct ctdb_context *ctdb, const char *dir)
98 ctdb->db_directory_persistent = talloc_strdup(ctdb, dir);
99 if (ctdb->db_directory_persistent == NULL) {
106 set the directory for internal state databases
108 int ctdb_set_tdb_dir_state(struct ctdb_context *ctdb, const char *dir)
110 ctdb->db_directory_state = talloc_strdup(ctdb, dir);
111 if (ctdb->db_directory_state == NULL) {
118 add a node to the list of nodes
120 static int ctdb_add_node(struct ctdb_context *ctdb, char *nstr)
122 struct ctdb_node *node, **nodep;
124 nodep = talloc_realloc(ctdb, ctdb->nodes, struct ctdb_node *, ctdb->num_nodes+1);
125 CTDB_NO_MEMORY(ctdb, nodep);
128 nodep = &ctdb->nodes[ctdb->num_nodes];
129 (*nodep) = talloc_zero(ctdb->nodes, struct ctdb_node);
130 CTDB_NO_MEMORY(ctdb, *nodep);
133 if (ctdb_parse_address(ctdb, node, nstr, &node->address) != 0) {
137 node->name = talloc_asprintf(node, "%s:%u",
138 node->address.address,
140 /* this assumes that the nodes are kept in sorted order, and no gaps */
141 node->pnn = ctdb->num_nodes;
143 /* nodes start out disconnected and unhealthy */
144 node->flags = (NODE_FLAGS_DISCONNECTED | NODE_FLAGS_UNHEALTHY);
146 if (ctdb->address.address &&
147 ctdb_same_address(&ctdb->address, &node->address)) {
148 /* for automatic binding to interfaces, see tcp_connect.c */
149 ctdb->pnn = node->pnn;
150 node->flags &= ~NODE_FLAGS_DISCONNECTED;
152 /* do we start out in DISABLED mode? */
153 if (ctdb->start_as_disabled != 0) {
154 DEBUG(DEBUG_INFO, ("This node is configured to start in DISABLED state\n"));
155 node->flags |= NODE_FLAGS_DISABLED;
157 /* do we start out in STOPPED mode? */
158 if (ctdb->start_as_stopped != 0) {
159 DEBUG(DEBUG_INFO, ("This node is configured to start in STOPPED state\n"));
160 node->flags |= NODE_FLAGS_STOPPED;
165 node->dead_count = 0;
171 add an entry for a "deleted" node to the list of nodes.
172 a "deleted" node is a node that is commented out from the nodes file.
173 this is used to prevent that subsequent nodes in the nodes list
174 change their pnn value if a node is "delete" by commenting it out and then
175 using "ctdb reloadnodes" at runtime.
177 static int ctdb_add_deleted_node(struct ctdb_context *ctdb)
179 struct ctdb_node *node, **nodep;
181 nodep = talloc_realloc(ctdb, ctdb->nodes, struct ctdb_node *, ctdb->num_nodes+1);
182 CTDB_NO_MEMORY(ctdb, nodep);
185 nodep = &ctdb->nodes[ctdb->num_nodes];
186 (*nodep) = talloc_zero(ctdb->nodes, struct ctdb_node);
187 CTDB_NO_MEMORY(ctdb, *nodep);
190 if (ctdb_parse_address(ctdb, node, "0.0.0.0", &node->address) != 0) {
191 DEBUG(DEBUG_ERR,("Failed to setup deleted node %d\n", ctdb->num_nodes));
195 node->name = talloc_strdup(node, "0.0.0.0:0");
197 /* this assumes that the nodes are kept in sorted order, and no gaps */
198 node->pnn = ctdb->num_nodes;
200 /* this node is permanently deleted/disconnected */
201 node->flags = NODE_FLAGS_DELETED|NODE_FLAGS_DISCONNECTED;
204 node->dead_count = 0;
211 setup the node list from a file
213 int ctdb_set_nlist(struct ctdb_context *ctdb, const char *nlist)
217 int i, j, num_present;
219 talloc_free(ctdb->nodes);
223 lines = file_lines_load(nlist, &nlines, ctdb);
225 ctdb_set_error(ctdb, "Failed to load nlist '%s'\n", nlist);
228 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
233 for (i=0; i < nlines; i++) {
237 /* strip leading spaces */
238 while((*node == ' ') || (*node == '\t')) {
242 if (ctdb_add_deleted_node(ctdb) != 0) {
248 if (strcmp(node, "") == 0) {
251 if (ctdb_add_node(ctdb, node) != 0) {
258 /* initialize the vnn mapping table now that we have the nodes list,
259 skipping any deleted nodes
261 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
262 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
264 ctdb->vnn_map->generation = INVALID_GENERATION;
265 ctdb->vnn_map->size = num_present;
266 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
267 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
269 for(i=0, j=0; i < ctdb->vnn_map->size; i++) {
270 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
273 ctdb->vnn_map->map[j] = i;
283 setup the local node address
285 int ctdb_set_address(struct ctdb_context *ctdb, const char *address)
287 if (ctdb_parse_address(ctdb, ctdb, address, &ctdb->address) != 0) {
291 ctdb->name = talloc_asprintf(ctdb, "%s:%u",
292 ctdb->address.address,
299 return the number of active nodes
301 uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb)
305 for (i=0; i < ctdb->num_nodes; i++) {
306 if (!(ctdb->nodes[i]->flags & NODE_FLAGS_INACTIVE)) {
/*
  Entry point for every packet this node has to process.

  The header is re-parented onto a temporary talloc context which is freed
  at the end of the function; a handler that wants to keep the packet has
  to steal it elsewhere.

  Processing happens in two steps:
    1. For the call/dmaster operations the packet is checked against the
       BANNED flag of our own node and against our vnn_map generation; both
       checks only log at debug level in the visible code.
    2. The packet is dispatched on hdr->operation: the matching statistics
       counter is incremented and the matching ctdb_request_xxx/ctdb_reply_xxx
       handler is called.  Keepalives only bump a counter and unknown
       operations are logged at DEBUG_CRIT.

  NOTE(review): this listing has lost its brace, case-label, break/goto
  lines and parts of the log format strings, so what exactly happens to a
  packet that fails the checks in step 1 cannot be confirmed from here.
*/
315 called when we need to process a packet. This can be a requeued packet
316 after a lockwait, or a real packet from another node
318 void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
322 /* place the packet as a child of the tmp_ctx. We then use
323 talloc_free() below to free it. If any of the calls want
324 to keep it, then they will steal it somewhere else, and the
325 talloc_free() will only free the tmp_ctx */
326 tmp_ctx = talloc_new(ctdb);
327 talloc_steal(tmp_ctx, hdr);
/* trace every packet at debug level before any filtering */
329 DEBUG(DEBUG_DEBUG,(__location__ " ctdb request %u of type %u length %u from "
330 "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
331 hdr->srcnode, hdr->destnode));
/* first pass: sanity checks that only apply to call/dmaster traffic */
333 switch (hdr->operation) {
335 case CTDB_REPLY_CALL:
336 case CTDB_REQ_DMASTER:
337 case CTDB_REPLY_DMASTER:
338 /* we don't allow these calls when banned */
339 if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_BANNED) {
340 DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
342 " length %u from node %u to %u while node"
344 hdr->operation, hdr->reqid,
346 hdr->srcnode, hdr->destnode));
350 /* for ctdb_call inter-node operations verify that the
351 remote node that sent us the call is running in the
352 same generation instance as this node
354 if (ctdb->vnn_map->generation != hdr->generation) {
355 DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
357 " length %u from node %u to %u had an"
358 " invalid generation id:%u while our"
359 " generation id is:%u\n",
360 hdr->operation, hdr->reqid,
362 hdr->srcnode, hdr->destnode,
363 hdr->generation, ctdb->vnn_map->generation));
368 switch (hdr->operation) {
370 ctdb->statistics.node.req_call++;
371 ctdb_request_call(ctdb, hdr);
374 case CTDB_REPLY_CALL:
375 ctdb->statistics.node.reply_call++;
376 ctdb_reply_call(ctdb, hdr);
379 case CTDB_REPLY_ERROR:
380 ctdb->statistics.node.reply_error++;
381 ctdb_reply_error(ctdb, hdr);
384 case CTDB_REQ_DMASTER:
385 ctdb->statistics.node.req_dmaster++;
386 ctdb_request_dmaster(ctdb, hdr);
389 case CTDB_REPLY_DMASTER:
390 ctdb->statistics.node.reply_dmaster++;
391 ctdb_reply_dmaster(ctdb, hdr);
394 case CTDB_REQ_MESSAGE:
395 ctdb->statistics.node.req_message++;
396 ctdb_request_message(ctdb, hdr);
399 case CTDB_REQ_CONTROL:
400 ctdb->statistics.node.req_control++;
401 ctdb_request_control(ctdb, hdr);
404 case CTDB_REPLY_CONTROL:
405 ctdb->statistics.node.reply_control++;
406 ctdb_reply_control(ctdb, hdr);
409 case CTDB_REQ_KEEPALIVE:
410 ctdb->statistics.keepalive_packets_recv++;
414 DEBUG(DEBUG_CRIT,("%s: Packet with unknown operation %u\n",
415 __location__, hdr->operation));
420 talloc_free(tmp_ctx);
/*
  Transport callback: `node` is no longer reachable.

  A node that is already flagged DISCONNECTED is only logged.  Otherwise
  the connected-node counter is decremented, the node is flagged
  DISCONNECTED|UNHEALTHY, its dead_count is reset, pending controls for it
  are cancelled via ctdb_daemon_cancel_controls() and the transport's
  restart method is invoked for it.

  NOTE(review): this listing has lost its brace/return lines and at least
  one statement between the flags update and the dead_count reset; check
  the full source before changing the order of operations.
*/
425 called by the transport layer when a node is dead
427 void ctdb_node_dead(struct ctdb_node *node)
/* duplicate notification: nothing to account for */
429 if (node->flags & NODE_FLAGS_DISCONNECTED) {
430 DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n",
431 node->ctdb->name, node->name,
432 node->ctdb->num_connected));
435 node->ctdb->num_connected--;
436 node->flags |= NODE_FLAGS_DISCONNECTED | NODE_FLAGS_UNHEALTHY;
438 node->dead_count = 0;
440 DEBUG(DEBUG_NOTICE,("%s: node %s is dead: %u connected\n",
441 node->ctdb->name, node->name, node->ctdb->num_connected));
/* controls still waiting on this node are cancelled here */
442 ctdb_daemon_cancel_controls(node->ctdb, node);
/* methods is NULL while the daemon shuts down, so there is nothing to restart */
444 if (node->ctdb->methods == NULL) {
445 DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n"));
449 node->ctdb->methods->restart(node);
453 called by the transport layer when a node is connected
455 void ctdb_node_connected(struct ctdb_node *node)
457 if (!(node->flags & NODE_FLAGS_DISCONNECTED)) {
458 DEBUG(DEBUG_INFO,("%s: node %s is already marked connected: %u connected\n",
459 node->ctdb->name, node->name,
460 node->ctdb->num_connected));
463 node->ctdb->num_connected++;
464 node->dead_count = 0;
465 node->flags &= ~NODE_FLAGS_DISCONNECTED;
466 node->flags |= NODE_FLAGS_UNHEALTHY;
467 DEBUG(DEBUG_INFO,("%s: connected to %s - %u connected\n",
468 node->ctdb->name, node->name, node->ctdb->num_connected));
/*
  state for a packet whose processing is deferred to a timed event
*/
struct queue_next {
	struct ctdb_context *ctdb;	/* daemon context the packet belongs to */
	struct ctdb_req_header *hdr;	/* the packet to hand to ctdb_input_pkt() */
};
478 triggered when a deferred packet is due
480 static void queue_next_trigger(struct event_context *ev, struct timed_event *te,
481 struct timeval t, void *private_data)
483 struct queue_next *q = talloc_get_type(private_data, struct queue_next);
484 ctdb_input_pkt(q->ctdb, q->hdr);
489 defer a packet, so it is processed on the next event loop
490 this is used for sending packets to ourselves
492 static void ctdb_defer_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
494 struct queue_next *q;
495 q = talloc(ctdb, struct queue_next);
497 DEBUG(DEBUG_ERR,(__location__ " Failed to allocate deferred packet\n"));
501 q->hdr = talloc_memdup(ctdb, hdr, hdr->length);
502 if (q->hdr == NULL) {
503 DEBUG(DEBUG_ERR,("Error copying deferred packet to self\n"));
507 /* use this to put packets directly into our recv function */
508 ctdb_input_pkt(q->ctdb, q->hdr);
510 event_add_timed(ctdb->ev, q, timeval_zero(), queue_next_trigger, q);
516 broadcast a packet to all nodes
518 static void ctdb_broadcast_packet_all(struct ctdb_context *ctdb,
519 struct ctdb_req_header *hdr)
522 for (i=0; i < ctdb->num_nodes; i++) {
523 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
526 hdr->destnode = ctdb->nodes[i]->pnn;
527 ctdb_queue_packet(ctdb, hdr);
532 broadcast a packet to all nodes in the current vnnmap
534 static void ctdb_broadcast_packet_vnnmap(struct ctdb_context *ctdb,
535 struct ctdb_req_header *hdr)
538 for (i=0;i<ctdb->vnn_map->size;i++) {
539 hdr->destnode = ctdb->vnn_map->map[i];
540 ctdb_queue_packet(ctdb, hdr);
545 broadcast a packet to all connected nodes
547 static void ctdb_broadcast_packet_connected(struct ctdb_context *ctdb,
548 struct ctdb_req_header *hdr)
551 for (i=0; i < ctdb->num_nodes; i++) {
552 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
555 if (!(ctdb->nodes[i]->flags & NODE_FLAGS_DISCONNECTED)) {
556 hdr->destnode = ctdb->nodes[i]->pnn;
557 ctdb_queue_packet(ctdb, hdr);
/*
  Send `hdr` to the node(s) named by hdr->destnode.

  The three CTDB_BROADCAST_xxx pseudo destinations are fanned out by the
  ctdb_broadcast_packet_xxx() helpers, which call back into this function
  once per recipient.  For a single destination the packet is
    - refused (logged) if the pnn fails ctdb_validate_pnn() or the node is
      flagged DELETED,
    - handed to ctdb_defer_packet() if the destination is this node,
    - refused (logged) if no transport methods are installed,
    - otherwise passed to the transport's queue_pkt method; a failure there
      ends in ctdb_fatal().

  NOTE(review): this listing has lost its brace/return lines, one log
  argument and at least one line just before the queue_pkt call; check the
  full source before editing.
*/
563 queue a packet or die
565 void ctdb_queue_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
567 struct ctdb_node *node;
/* pseudo destinations: expand into one packet per recipient */
569 switch (hdr->destnode) {
570 case CTDB_BROADCAST_ALL:
571 ctdb_broadcast_packet_all(ctdb, hdr);
573 case CTDB_BROADCAST_VNNMAP:
574 ctdb_broadcast_packet_vnnmap(ctdb, hdr);
576 case CTDB_BROADCAST_CONNECTED:
577 ctdb_broadcast_packet_connected(ctdb, hdr);
581 ctdb->statistics.node_packets_sent++;
/* destnode is used as an index into ctdb->nodes below, so validate it first */
583 if (!ctdb_validate_pnn(ctdb, hdr->destnode)) {
584 DEBUG(DEBUG_CRIT,(__location__ " cant send to node %u that does not exist\n",
589 node = ctdb->nodes[hdr->destnode];
591 if (node->flags & NODE_FLAGS_DELETED) {
592 DEBUG(DEBUG_ERR, (__location__ " Can not queue packet to DELETED node %d\n", hdr->destnode));
/* packets addressed to ourselves never touch the transport */
596 if (node->pnn == ctdb->pnn) {
597 ctdb_defer_packet(ctdb, hdr);
601 if (ctdb->methods == NULL) {
602 DEBUG(DEBUG_ALERT, (__location__ " Can not queue packet. "
603 "Transport is DOWN\n"));
/* a transport that cannot queue the packet is treated as fatal */
608 if (ctdb->methods->queue_pkt(node, (uint8_t *)hdr, hdr->length) != 0) {
609 ctdb_fatal(ctdb, "Unable to queue packet\n");
/*
  Wrapper around ctdb_queue_packet() that goes through the DO_OP() macro,
  each use of which expands to a `case` label calling ctdb_queue_packet()
  with the same arguments.

  NOTE(review): roughly a hundred lines of this function (presumably the
  switch on `opcode` and its DO_OP() cases) are missing from this listing;
  only the signature, the macro and the final call are visible.
*/
617 a valgrind hack to allow us to get opcode specific backtraces
618 very ugly, and relies on no compiler optimisation!
620 void ctdb_queue_packet_opcode(struct ctdb_context *ctdb, struct ctdb_req_header *hdr, unsigned opcode)
623 #define DO_OP(x) case x: ctdb_queue_packet(ctdb, hdr); break
725 ctdb_queue_packet(ctdb, hdr);