ctdb-vacuum: Fix the incorrect counting of remote errors
authorAmitay Isaacs <amitay@gmail.com>
Wed, 14 Feb 2018 04:18:17 +0000 (15:18 +1100)
committerAmitay Isaacs <amitay@samba.org>
Mon, 8 Oct 2018 00:46:21 +0000 (02:46 +0200)
If a node fails to delete a record in TRY_DELETE_RECORDS control during
vacuuming, then it's possible that other nodes also may fail to delete a
record.  So instead of deleting the record from RB tree on first failure,
keep track of the remote failures.

Update delete_list.remote_error and delete_list.left statistics only
once per record during the delete_record_traverse.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=13641

Signed-off-by: Amitay Isaacs <amitay@gmail.com>
Reviewed-by: Martin Schwenke <martin@meltin.net>
ctdb/server/ctdb_vacuum.c

index 5aa0ca7dcc045a12da8e2ec76136ee855851e5fa..8faf803efb91a42df90ac93ff28d4466798d3f80 100644 (file)
@@ -107,6 +107,7 @@ struct delete_record_data {
        struct ctdb_context *ctdb;
        struct ctdb_db_context *ctdb_db;
        struct ctdb_ltdb_header hdr;
+       uint32_t remote_fail_count;
        TDB_DATA key;
        uint8_t keydata[1];
 };
@@ -149,6 +150,7 @@ static int insert_delete_record_data_into_tree(struct ctdb_context *ctdb,
        memcpy(dd->keydata, key.dptr, key.dsize);
 
        dd->hdr = *hdr;
+       dd->remote_fail_count = 0;
 
        hash = ctdb_hash(&key);
 
@@ -451,6 +453,13 @@ static int delete_record_traverse(void *param, void *data)
        uint32_t lmaster;
        uint32_t hash = ctdb_hash(&(dd->key));
 
+       if (dd->remote_fail_count > 0) {
+               vdata->count.delete_list.remote_error++;
+               vdata->count.delete_list.left--;
+               talloc_free(dd);
+               return 0;
+       }
+
        res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
        if (res != 0) {
                DEBUG(DEBUG_ERR,
@@ -828,22 +837,17 @@ static void ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
                                        ctdb_hash(&reckey));
                        if (dd != NULL) {
                                /*
-                                * The other node could not delete the
-                                * record and it is the first node that
-                                * failed. So we should remove it from
-                                * the tree and update statistics.
+                                * The remote node could not delete the
+                                * record.  Since other remote nodes can
+                                * also fail, we just mark the record.
                                 */
-                               talloc_free(dd);
-                               vdata->count.delete_list.remote_error++;
-                               vdata->count.delete_list.left--;
+                               dd->remote_fail_count++;
                        } else {
                                DEBUG(DEBUG_ERR, (__location__ " Failed to "
                                      "find record with hash 0x%08x coming "
                                      "back from TRY_DELETE_RECORDS "
                                      "control in delete list.\n",
                                      ctdb_hash(&reckey)));
-                               vdata->count.delete_list.local_error++;
-                               vdata->count.delete_list.left--;
                        }
 
                        rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);