X-Git-Url: http://git.samba.org/?a=blobdiff_plain;f=server%2Fctdb_recover.c;h=eb3bf0a50c7dca2f9cbb31304d64b0e6ed9a4c95;hb=1.3;hp=e1c7b16e988678ce3ae853c08dc66c62963c7065;hpb=13e58d92f5f1723e850a82ae030d0ca57e89b1ee;p=sahlberg%2Fctdb.git diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c index e1c7b16e..eb3bf0a5 100644 --- a/server/ctdb_recover.c +++ b/server/ctdb_recover.c @@ -57,7 +57,11 @@ static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t prio if (strstr(ctdb_db->db_name, "notify") != NULL) { continue; } + if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) { + return -1; + } if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) { + tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb); return -1; } } @@ -68,7 +72,11 @@ static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t prio if (strstr(ctdb_db->db_name, "notify") == NULL) { continue; } + if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) { + return -1; + } if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) { + tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb); return -1; } } @@ -95,6 +103,7 @@ static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t pr if (ctdb_db->priority != priority) { continue; } + tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb); if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) { return -1; } @@ -340,10 +349,8 @@ static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, } params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len); if (params->pulldata == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Failed to expand pulldb_data to %u (%u records)\n", - rec->length + params->len, params->pulldata->count)); - params->failed = true; - return -1; + DEBUG(DEBUG_CRIT,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + params->len)); + ctdb_fatal(params->ctdb, "failed to allocate memory for recovery. shutting down\n"); } params->pulldata->count++; memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length); @@ -577,7 +584,7 @@ static int set_recmode_destructor(struct ctdb_set_recmode_state *state) { double l = timeval_elapsed(&state->start_time); - ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l); + CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock", reclock.ctdbd, l); if (state->fd[0] != -1) { state->fd[0] = -1; @@ -623,6 +630,11 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde, state->ctdb->recovery_mode = state->recmode; + /* release any deferred attach calls from clients */ + if (state->recmode == CTDB_RECOVERY_NORMAL) { + ctdb_process_deferred_attach(state->ctdb); + } + ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL); talloc_free(state); return; @@ -641,6 +653,22 @@ ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te, ctdb_release_all_ips(ctdb); } +/* + * Set up an event to drop all public ips if we remain in recovery for too + * long + */ +int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb) +{ + if (ctdb->release_ips_ctx != NULL) { + talloc_free(ctdb->release_ips_ctx); + } + ctdb->release_ips_ctx = talloc_new(ctdb); + CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx); + + event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb); + return 0; +} + /* set the recovery mode */ @@ -661,11 +689,9 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, talloc_free(ctdb->release_ips_ctx); ctdb->release_ips_ctx = NULL; } else { - talloc_free(ctdb->release_ips_ctx); - ctdb->release_ips_ctx = talloc_new(ctdb); - CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx); - - event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb); + if (ctdb_deferred_drop_all_ips(ctdb) != 0) { + DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n")); + } } if (recmode != ctdb->recovery_mode) { @@ -695,6 +721,11 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, state->fd[0] = -1; state->fd[1] = -1; + /* release any deferred attach calls from clients */ + if (recmode == CTDB_RECOVERY_NORMAL) { + ctdb_process_deferred_attach(ctdb); + } + if (ctdb->tunable.verify_recovery_lock == 0) { /* dont need to verify the reclock file */ ctdb->recovery_mode = recmode; @@ -940,7 +971,7 @@ static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, vo struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state); ctdb_enable_monitoring(ctdb); - ctdb->statistics.num_recoveries++; + CTDB_INCREMENT_STAT(ctdb, num_recoveries); if (status != 0) { DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status)); @@ -967,6 +998,8 @@ int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb, DEBUG(DEBUG_NOTICE,("Recovery has finished\n")); + ctdb_persistent_finish_trans3_commits(ctdb); + state = talloc(ctdb, struct recovery_callback_state); CTDB_NO_MEMORY(ctdb, state); @@ -1164,18 +1197,10 @@ static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event return; } - DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Shutting down ctdb daemon. (This can be caused if the cluster filesystem has hung)\n")); + DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n")); ctdb_stop_recoverd(ctdb); - ctdb_stop_keepalive(ctdb); - ctdb_stop_monitoring(ctdb); - ctdb_release_all_ips(ctdb); - if (ctdb->methods != NULL) { - ctdb->methods->shutdown(ctdb); - } - ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN); - DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Daemon has been shut down.\n")); - exit(0); + ctdb_start_recoverd(ctdb); } /* The recovery daemon will ping us at regular intervals.