rec->culprit_counter++;
}
+/*
+  remember the trouble maker, charging it with 'count' failures at once
+
+  rec->culprit_counter keeps accumulating while the culprit is the same
+  node as rec->last_culprit and no more than tunable.recovery_grace_period
+  has elapsed since rec->first_recover_time.  Otherwise the culprit and
+  the start time are replaced and the counter restarts at 'count'.
+ */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	int same_culprit = (rec->last_culprit == culprit);
+
+	/* same node, still inside the grace period: just add to the tally */
+	if (same_culprit &&
+	    !(timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period)) {
+		rec->culprit_counter += count;
+		return;
+	}
+
+	/* either a new node is the culprit, or we've decided to forgive them */
+	DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
+	rec->last_culprit = culprit;
+	rec->first_recover_time = timeval_current();
+	rec->culprit_counter = count;
+}
/* this callback is called for every node that failed to execute the
start recovery event
DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
return;
}
- ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
+ if (node_pnn < ctdb->num_nodes) {
+ ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
+ }
}
/*
/*
pull all the remote database contents into the recdb
*/
-static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+static int pull_remote_database(struct ctdb_context *ctdb,
+ struct ctdb_recoverd *rec,
+ struct ctdb_node_map *nodemap,
struct tdb_wrap *recdb, uint32_t dbid)
{
int j;
if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
nodemap->nodes[j].pnn));
+ ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
return -1;
}
}
/*
update flags on all active nodes
+
+ Issues one ctdb_ctrl_modflags() control for node 'pnn', passing 'flags'
+ and '~flags' as its last two arguments (presumably the set and clear
+ masks - confirm against ctdb_ctrl_modflags).
+ Returns 0 on success; logs an error and returns -1 when the control
+ returns non-zero.
+ NOTE(review): 'nodemap' is not referenced by this version of the body.
*/
-static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
{
- int i;
- for (i=0;i<nodemap->num;i++) {
- struct ctdb_node_flag_change c;
- TDB_DATA data;
-
- c.pnn = nodemap->nodes[i].pnn;
- c.old_flags = nodemap->nodes[i].flags;
- c.new_flags = nodemap->nodes[i].flags;
-
- data.dptr = (uint8_t *)&c;
- data.dsize = sizeof(c);
-
- ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
- CTDB_SRVID_NODE_FLAGS_CHANGED, data);
+ int ret;
+ ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+ return -1;
}
- return 0;
-}
-
-static int update_our_flags_on_all_nodes(struct ctdb_context *ctdb, uint32_t pnn, struct ctdb_node_map *nodemap)
-{
- struct ctdb_node_flag_change c;
- TDB_DATA data;
-
- c.pnn = nodemap->nodes[pnn].pnn;
- c.old_flags = nodemap->nodes[pnn].flags;
- c.new_flags = nodemap->nodes[pnn].flags;
-
- data.dptr = (uint8_t *)&c;
- data.dsize = sizeof(c);
-
- ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
- CTDB_SRVID_NODE_FLAGS_CHANGED, data);
return 0;
}
r = (struct ctdb_rec_data *)&recs->data[0];
if (recs->count == 0) {
+ talloc_free(tmp_ctx);
return;
}
for (v=rec->vacuum_info;v;v=v->next) {
if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
/* we're already working on records from this node */
+ talloc_free(tmp_ctx);
return;
}
}
v = talloc_zero(rec, struct vacuum_info);
if (v == NULL) {
DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
+ talloc_free(tmp_ctx);
return;
}
if (v->recs == NULL) {
DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
talloc_free(v);
+ talloc_free(tmp_ctx);
return;
}
v->r = (struct ctdb_rec_data *)&v->recs->data[0];
talloc_set_destructor(v, vacuum_info_destructor);
vacuum_fetch_next(v);
+ talloc_free(tmp_ctx);
}
return MONITOR_FAILED;
}
if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
- struct ctdb_node_flag_change c;
- TDB_DATA data;
+ int ban_changed = (nodemap->nodes[j].flags ^ remote_nodemap->nodes[j].flags) & NODE_FLAGS_BANNED;
+
+ if (ban_changed) {
+ DEBUG(DEBUG_NOTICE,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
+ nodemap->nodes[j].pnn,
+ remote_nodemap->nodes[j].flags,
+ nodemap->nodes[j].flags));
+ }
/* We should tell our daemon about this so it
updates its flags or else we will log the same
Since we are the recovery master we can just as
well update the flags on all nodes.
*/
- c.pnn = nodemap->nodes[j].pnn;
- c.old_flags = nodemap->nodes[j].flags;
- c.new_flags = remote_nodemap->nodes[j].flags;
-
- data.dptr = (uint8_t *)&c;
- data.dsize = sizeof(c);
-
- ctdb_send_message(ctdb, ctdb->pnn,
- CTDB_SRVID_NODE_FLAGS_CHANGED,
- data);
+ ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+ return -1;
+ }
/* Update our local copy of the flags in the recovery
daemon.
/* If the BANNED flag has changed for the node
this is a good reason to do a new election.
*/
- if ((c.old_flags ^ c.new_flags) & NODE_FLAGS_BANNED) {
- DEBUG(DEBUG_NOTICE,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
- nodemap->nodes[j].pnn, c.new_flags,
- c.old_flags));
+ if (ban_changed) {
talloc_free(mem_ctx);
return MONITOR_ELECTION_NEEDED;
}
}
/* pull all remote databases onto the recdb */
- ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
+ ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
return -1;
return 0;
}
-
+/*
+ reload the nodes file
+
+ Sets ctdb->nodes to NULL and then calls ctdb_load_nodes_file(), which
+ presumably repopulates the node list from disk.
+ NOTE(review): the previous ctdb->nodes pointer is overwritten without
+ being freed here - confirm whether it is released elsewhere (for
+ example through its talloc parent) or whether every reload leaks it.
+*/
+static void reload_nodes_file(struct ctdb_context *ctdb)
+{
+ ctdb->nodes = NULL;
+ ctdb_load_nodes_file(ctdb);
+}
+
+
/*
we are the recmaster, and recovery is needed - start a recovery run
*/
DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
+ if (ctdb->num_nodes != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
+ reload_nodes_file(ctdb);
+ return -1;
+ }
+
/* if recovery fails, force it again */
rec->need_recovery = true;
if (rec->culprit_counter > 2*nodemap->num) {
DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
- culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
+ rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
ctdb->tunable.recovery_ban_period));
- ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
+ ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
}
if (!ctdb_recovery_lock(ctdb, true)) {
/*
update all nodes to have the same flags that we have
*/
- ret = update_flags_on_all_nodes(ctdb, nodemap);
- if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes\n"));
- return -1;
+ for (i=0;i<nodemap->num;i++) {
+ if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+ continue;
+ }
+
+ ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+ return -1;
+ }
}
-
+
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
/* disable recovery mode */
if (data.dsize != sizeof(struct rd_memdump_reply)) {
DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+ talloc_free(tmp_ctx);
return;
}
rd = (struct rd_memdump_reply *)data.dptr;
ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
+ talloc_free(tmp_ctx);
return;
}
changed_flags = c->old_flags ^ c->new_flags;
- /* Dont let messages from remote nodes change the DISCONNECTED flag.
- This flag is handled locally based on whether the local node
- can communicate with the node or not.
- */
- c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
- if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
- c->new_flags |= NODE_FLAGS_DISCONNECTED;
- }
-
if (nodemap->nodes[i].flags != c->new_flags) {
DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
}
talloc_free(tmp_ctx);
}
+/*
+  handler for when we need to push out flag changes to all other nodes
+
+  The message payload is expected to hold a struct ctdb_node_flag_change.
+  Its new_flags value for node c->pnn is handed to ctdb_ctrl_modflags()
+  as 'new_flags' / '~new_flags'.  A failure of that control is only
+  logged.  srvid and private_data are not used.
+*/
+static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
+			       TDB_DATA data, void *private_data)
+{
+	int ret;
+	struct ctdb_node_flag_change *c;
+
+	/* never read c->pnn / c->new_flags from a missing or truncated payload */
+	if (data.dptr == NULL || data.dsize < sizeof(*c)) {
+		DEBUG(DEBUG_ERR, (__location__ " Invalid data in push node flags message\n"));
+		return;
+	}
+	c = (struct ctdb_node_flag_change *)data.dptr;
+
+	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+	}
+}
struct verify_recmode_normal_data {
if (timeval_compare(&uptime1->last_recovery_started,
&uptime2->last_recovery_started) != 0) {
DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
+ talloc_free(mem_ctx);
return 0;
}
if (timeval_compare(&uptime1->last_recovery_finished,
&uptime2->last_recovery_finished) != 0) {
DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
+ talloc_free(mem_ctx);
return 0;
}
if (timeval_compare(&uptime1->last_recovery_finished,
&uptime1->last_recovery_started) != 1) {
DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
+ talloc_free(mem_ctx);
return 0;
}
return 0;
}
+
+/*
+  per-node callback for the async GET_NODEMAP control
+
+  callback_data is the array of nodemap pointers indexed by pnn.  The
+  reply buffer is re-parented onto that array with talloc_steal() and
+  stored in the slot for node_pnn.  Replies whose pnn is not below
+  ctdb->num_nodes are logged and dropped.  'res' is not inspected.
+*/
+static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+	struct ctdb_node_map **maps = callback_data;
+
+	if (node_pnn < ctdb->num_nodes) {
+		maps[node_pnn] = (struct ctdb_node_map *)talloc_steal(maps, outdata.dptr);
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
+}
+
+/*
+  ask the nodes returned by list_of_active_nodes() for their nodemap
+
+  One async CTDB_CONTROL_GET_NODEMAP is issued through
+  ctdb_client_async_control(); each reply is filed into remote_nodemaps
+  by async_getnodemap_callback().  The node list is allocated on mem_ctx.
+  Returns 0 when the async control call returns 0, otherwise logs an
+  error and returns -1.
+*/
+static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+			       struct ctdb_node_map *nodemap,
+			       struct ctdb_node_map **remote_nodemaps)
+{
+	uint32_t *active = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+	int status;
+
+	status = ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
+					   active,
+					   CONTROL_TIMEOUT(), false, tdb_null,
+					   async_getnodemap_callback,
+					   NULL,
+					   remote_nodemaps);
+	if (status == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
+	return -1;
+}
+
/*
the main monitoring loop
*/
uint32_t pnn;
TALLOC_CTX *mem_ctx=NULL;
struct ctdb_node_map *nodemap=NULL;
- struct ctdb_node_map *remote_nodemap=NULL;
+ struct ctdb_node_map *recmaster_nodemap=NULL;
+ struct ctdb_node_map **remote_nodemaps=NULL;
struct ctdb_vnn_map *vnnmap=NULL;
struct ctdb_vnn_map *remote_vnnmap=NULL;
int32_t debug_level;
/* register a message port for recovery elections */
ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
- /* and one for when nodes are disabled/enabled */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
+ /* when nodes are disabled/enabled */
+ ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
+
+ /* when we are asked to push out a flag change */
+ ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
- /* and one for when nodes are banned */
+ /* when nodes are banned */
ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
/* and one for when nodes are unbanned */
/* grap the nodemap from the recovery master to check if it is banned */
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
- mem_ctx, &remote_nodemap);
+ mem_ctx, &recmaster_nodemap);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
nodemap->nodes[j].pnn));
}
- if (remote_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
force_election(rec, pnn, nodemap);
goto again;
}
- /* verify that we and the recmaster agrees on our flags */
- if (nodemap->nodes[pnn].flags != remote_nodemap->nodes[pnn].flags) {
- DEBUG(DEBUG_ERR, (__location__ " Recmaster disagrees on our flags flags:0x%x recmaster_flags:0x%x Broadcasting out flags.\n", nodemap->nodes[pnn].flags, remote_nodemap->nodes[pnn].flags));
-
- update_our_flags_on_all_nodes(ctdb, pnn, nodemap);
- }
-
-
/* verify that we have all ip addresses we should have and we dont
* have addresses we shouldnt have.
*/
- if (verify_ip_allocation(ctdb, pnn) != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
- goto again;
+ if (ctdb->do_checkpublicip) {
+ if (verify_ip_allocation(ctdb, pnn) != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
+ goto again;
+ }
}
/* update the list of public ips that a node can handle for
all connected nodes
*/
+ if (ctdb->num_nodes != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
+ reload_nodes_file(ctdb);
+ goto again;
+ }
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
continue;
goto again;
}
- /* get the nodemap for all active remote nodes and verify
- they are the same as for this node
+
+ /* get the nodemap for all active remote nodes
*/
+ remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
+ if (remote_nodemaps == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
+ goto again;
+ }
+ for(i=0; i<nodemap->num; i++) {
+ remote_nodemaps[i] = NULL;
+ }
+ if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
+ goto again;
+ }
+
+ /* verify that all other nodes have the same nodemap as we have
+ */
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
continue;
}
- if (nodemap->nodes[j].pnn == pnn) {
- continue;
- }
- ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
- mem_ctx, &remote_nodemap);
- if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
- nodemap->nodes[j].pnn));
+ if (remote_nodemaps[j] == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
+ ctdb_set_culprit(rec, j);
+
goto again;
}
- /* if the nodes disagree on how many nodes there are
+ /* if the nodes disagree on how many nodes there are
then this is a good reason to try recovery
*/
- if (remote_nodemap->num != nodemap->num) {
+ if (remote_nodemaps[j]->num != nodemap->num) {
DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
- nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
+ nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
goto again;
}
active, then that is also a good reason to do recovery
*/
for (i=0;i<nodemap->num;i++) {
- if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
+ if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
nodemap->nodes[j].pnn, i,
- remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
- do_recovery(rec, mem_ctx, pnn, nodemap,
- vnnmap, nodemap->nodes[j].pnn);
- goto again;
- }
- if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
- (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
- DEBUG(DEBUG_WARNING, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
- nodemap->nodes[j].pnn, i,
- remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
+ remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap, nodemap->nodes[j].pnn);
goto again;
}
}
+		/* verify the flags are consistent
+
+		   For every node i that we do not consider DISCONNECTED,
+		   compare our flags for i with the flags remote node j
+		   reported for i.  On the first mismatch, push one view to
+		   the cluster, run a recovery blaming node j, and restart
+		   the monitoring loop.
+		*/
+		for (i=0; i<nodemap->num; i++) {
+			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+				continue;
+			}
+
+			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
+				/* "our" flags are the ones we hold for node i,
+				   the same value the comparison above used */
+				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
+				  nodemap->nodes[j].pnn,
+				  nodemap->nodes[i].pnn,
+				  remote_nodemaps[j]->nodes[i].flags,
+				  nodemap->nodes[i].flags));
+				if (i == j) {
+					/* a node is trusted about its own flags */
+					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
+					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
+					do_recovery(rec, mem_ctx, pnn, nodemap,
+						    vnnmap, nodemap->nodes[j].pnn);
+					goto again;
+				} else {
+					/* for any other node our own view wins */
+					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
+					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
+					do_recovery(rec, mem_ctx, pnn, nodemap,
+						    vnnmap, nodemap->nodes[j].pnn);
+					goto again;
+				}
+			}
+		}
}
}
- DEBUG(DEBUG_DEBUG, (__location__ " Update flags on all nodes\n"));
- /*
- update all nodes to have the same flags that we have
- */
- ret = update_flags_on_all_nodes(ctdb, nodemap);
- if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes\n"));
- goto again;
- }
-
goto again;
}
*/
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
- int ret;
int fd[2];
struct signal_event *se;
close(fd[1]);
- /* shutdown the transport */
- if (ctdb->methods) {
- ctdb->methods->shutdown(ctdb);
- }
-
- /* get a new event context */
- talloc_free(ctdb->ev);
- ctdb->ev = event_context_init(ctdb);
-
- event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
- ctdb_recoverd_parent, &fd[0]);
-
- close(ctdb->daemon.sd);
- ctdb->daemon.sd = -1;
-
srandom(getpid() ^ time(NULL));
- /* the recovery daemon does not need to be realtime */
- if (ctdb->do_setsched) {
- ctdb_restore_scheduler(ctdb);
- }
-
- /* initialise ctdb */
- ret = ctdb_socket_connect(ctdb);
- if (ret != 0) {
- DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb\n"));
+ if (switch_from_server_to_client(ctdb) != 0) {
+ DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
exit(1);
}
+ event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+ ctdb_recoverd_parent, &fd[0]);
+
/* set up a handler to pick up sigchld */
se = event_add_signal(ctdb->ev, ctdb,
SIGCHLD, 0,