Remove explicit include of lib/tevent/tevent.h.
[garming/samba-autobuild/.git] / ctdb / server / ctdb_vacuum.c
index 5e58e3fd53bfb52a9afdb6d83f93a913f92b5976..c5b4d9dc1b4eb15963ddd6ea51fc2d9b731cdabf 100644 (file)
@@ -20,7 +20,6 @@
 */
 
 #include "includes.h"
-#include "lib/tevent/tevent.h"
 #include "lib/tdb/include/tdb.h"
 #include "system/network.h"
 #include "system/filesys.h"
@@ -28,7 +27,6 @@
 #include "../include/ctdb_private.h"
 #include "db_wrap.h"
 #include "lib/util/dlinklist.h"
-#include "lib/tevent/tevent.h"
 #include "../include/ctdb_private.h"
 #include "../common/rb_tree.h"
 
@@ -333,12 +331,16 @@ static int delete_queue_traverse(void *param, void *data)
        struct ctdb_ltdb_header *header;
        TDB_DATA tdb_data;
        uint32_t lmaster;
+       uint32_t hash = ctdb_hash(&(dd->key));
 
        vdata->fast_total++;
 
        res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
        if (res != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Error getting chainlock.\n"));
+               DEBUG(DEBUG_ERR,
+                     (__location__ " Error getting chainlock on record with "
+                      "key hash [0x%08x] on database db[%s].\n",
+                      hash, ctdb_db->db_name));
                vdata->fast_error++;
                return 0;
        }
@@ -361,7 +363,6 @@ static int delete_queue_traverse(void *param, void *data)
                goto skipped;
        }
 
-
        if (header->rsn != dd->hdr.rsn) {
                /*
                 * The record has been migrated off the node and back again.
@@ -412,10 +413,15 @@ static int delete_queue_traverse(void *param, void *data)
 
                if (res != 0) {
                        DEBUG(DEBUG_ERR,
-                             (__location__ " Error deleting record from local "
-                              "data base.\n"));
+                             (__location__ " Error deleting record with key "
+                              "hash [0x%08x] from local data base db[%s].\n",
+                              hash, ctdb_db->db_name));
                        vdata->fast_error++;
                } else {
+                       DEBUG(DEBUG_DEBUG,
+                             (__location__ " Deleted record with key hash "
+                              "[0x%08x] from local data base db[%s].\n",
+                              hash, ctdb_db->db_name));
                        vdata->fast_deleted++;
                }
        }
@@ -451,10 +457,14 @@ static int delete_record_traverse(void *param, void *data)
        TDB_DATA tdb_data;
        uint32_t lmaster;
        bool deleted = false;
+       uint32_t hash = ctdb_hash(&(dd->key));
 
        res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
        if (res != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Error getting chainlock.\n"));
+               DEBUG(DEBUG_ERR,
+                     (__location__ " Error getting chainlock on record with "
+                      "key hash [0x%08x] on database db[%s].\n",
+                      hash, ctdb_db->db_name));
                vdata->delete_local_error++;
                return 0;
        }
@@ -479,6 +489,12 @@ static int delete_record_traverse(void *param, void *data)
 
        header = (struct ctdb_ltdb_header *)tdb_data.dptr;
 
+       if (header->flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE)) {
+         /* The record has readonly flags set. skip deleting */
+               vdata->delete_skipped++;
+               goto done;
+       }
+
        if (header->dmaster != ctdb->pnn) {
                /* The record has been migrated off the node. Skip. */
                vdata->delete_skipped++;
@@ -507,14 +523,19 @@ static int delete_record_traverse(void *param, void *data)
 
        if (res != 0) {
                DEBUG(DEBUG_ERR,
-                     (__location__ " Error deleting record from local "
-                      "data base.\n"));
+                     (__location__ " Error deleting record with key hash "
+                      "[0x%08x] from local data base db[%s].\n",
+                      hash, ctdb_db->db_name));
                vdata->delete_local_error++;
                goto done;
        }
 
        deleted = true;
 
+       DEBUG(DEBUG_DEBUG,
+             (__location__ " Deleted record with key hash [0x%08x] from "
+              "local data base db[%s].\n", hash, ctdb_db->db_name));
+
 done:
        if (tdb_data.dptr != NULL) {
                free(tdb_data.dptr);
@@ -658,113 +679,18 @@ static int ctdb_process_vacuum_fetch_lists(struct ctdb_db_context *ctdb_db,
 }
 
 /**
- * Vacuum a DB:
- *  - Always do the fast vacuuming run, which traverses
- *    the in-memory delete queue: these records have been
- *    scheduled for deletion.
- *  - Only if explicitly requested, the database is traversed
- *    in order to use the traditional heuristics on empty records
- *    to trigger deletion.
- *    This is done only every VacuumFastPathCount'th vacuuming run.
- *
- * The traverse runs fill two lists:
- *
- * - The delete_list:
- *   This is the list of empty records the current
- *   node is lmaster and dmaster for. These records are later
- *   deleted first on other nodes and then locally.
- *
- *   The fast vacuuming run has a short cut for those records
- *   that have never been migrated with data: these records
- *   are immediately deleted locally, since they have left
- *   no trace on other nodes.
- *
- * - The vacuum_fetch lists
- *   (one for each other lmaster node):
- *   The records in this list are sent for deletion to
- *   their lmaster in a bulk VACUUM_FETCH message.
- *
- *   The lmaster then migrates all these records to itelf
- *   so that they can be vacuumed there.
- *
- * This executes in the child context.
+ * Process the delete list:
+ * Send the records to delete to all other nodes with the
+ * try_delete_records control.
  */
-static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
-                         struct vacuum_data *vdata,
-                         bool full_vacuum_run)
+static int ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
+                                   struct vacuum_data *vdata)
 {
+       int ret, i;
        struct ctdb_context *ctdb = ctdb_db->ctdb;
-       int ret, i, pnn;
-
-       DEBUG(DEBUG_INFO, (__location__ " Entering %s vacuum run for db "
-                          "%s db_id[0x%08x]\n",
-                          full_vacuum_run ? "full" : "fast",
-                          ctdb_db->db_name, ctdb_db->db_id));
-
-       ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR, ("Unable to get vnnmap from local node\n"));
-               return ret;
-       }
-
-       pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
-       if (pnn == -1) {
-               DEBUG(DEBUG_ERR, ("Unable to get pnn from local node\n"));
-               return -1;
-       }
 
-       ctdb->pnn = pnn;
+       vdata->delete_left = vdata->delete_count;
 
-       vdata->fast_added_to_delete_list = 0;
-       vdata->fast_added_to_vacuum_fetch_list = 0;
-       vdata->fast_deleted = 0;
-       vdata->fast_skipped = 0;
-       vdata->fast_error = 0;
-       vdata->fast_total = 0;
-       vdata->full_added_to_delete_list = 0;
-       vdata->full_added_to_vacuum_fetch_list = 0;
-       vdata->full_skipped = 0;
-       vdata->full_error = 0;
-       vdata->full_total = 0;
-       vdata->delete_count = 0;
-       vdata->delete_left = 0;
-       vdata->delete_remote_error = 0;
-       vdata->delete_local_error = 0;
-       vdata->delete_skipped = 0;
-       vdata->delete_deleted = 0;
-
-       /* the list needs to be of length num_nodes */
-       vdata->vacuum_fetch_list = talloc_array(vdata,
-                                               struct ctdb_marshall_buffer *,
-                                               ctdb->num_nodes);
-       if (vdata->vacuum_fetch_list == NULL) {
-               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
-               return -1;
-       }
-       for (i = 0; i < ctdb->num_nodes; i++) {
-               vdata->vacuum_fetch_list[i] = (struct ctdb_marshall_buffer *)
-                       talloc_zero_size(vdata->vacuum_fetch_list,
-                                        offsetof(struct ctdb_marshall_buffer, data));
-               if (vdata->vacuum_fetch_list[i] == NULL) {
-                       DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
-                       return -1;
-               }
-               vdata->vacuum_fetch_list[i]->db_id = ctdb_db->db_id;
-       }
-
-       ctdb_vacuum_db_fast(ctdb_db, vdata);
-
-       ret = ctdb_vacuum_db_full(ctdb_db, vdata, full_vacuum_run);
-       if (ret != 0) {
-               return ret;
-       }
-
-       ret = ctdb_process_vacuum_fetch_lists(ctdb_db, vdata);
-       if (ret != 0) {
-               return ret;
-       }
-
-       /* Process all records we can delete (if any) */
        if (vdata->delete_count > 0) {
                struct delete_records_list *recs;
                TDB_DATA indata, outdata;
@@ -773,8 +699,6 @@ static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
                uint32_t *active_nodes;
                int num_active_nodes;
 
-               vdata->delete_left = vdata->delete_count;
-
                recs = talloc_zero(vdata, struct delete_records_list);
                if (recs == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
@@ -838,6 +762,17 @@ static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
                         * outdata contains the list of records coming back
                         * from the node: These are the records that the
                         * remote node could not delete.
+                        *
+                        * NOTE: There is a problem here:
+                        *
+                        * When a node failed to delete the record, but
+                        * others succeeded, we may have created gaps in the
+                        * history of the record. Hence when a node dies, a
+                        * closed file handle might be resurrected or an open
+                        * file handle might be lost, leading to blocked access
+                        * or data corruption.
+                        *
+                        * TODO: This needs to be fixed!
                         */
                        records = (struct ctdb_marshall_buffer *)outdata.dptr;
                        rec = (struct ctdb_rec_data *)&records->data[0];
@@ -893,6 +828,158 @@ static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
                                     delete_record_traverse, vdata);
        }
 
+       if (vdata->delete_count > 0) {
+               DEBUG(DEBUG_INFO,
+                     (__location__
+                      " vacuum delete list statistics: "
+                      "db[%s] "
+                      "coll[%u] "
+                      "rem.err[%u] "
+                      "loc.err[%u] "
+                      "skip[%u] "
+                      "del[%u] "
+                      "left[%u]\n",
+                      ctdb_db->db_name,
+                      (unsigned)vdata->delete_count,
+                      (unsigned)vdata->delete_remote_error,
+                      (unsigned)vdata->delete_local_error,
+                      (unsigned)vdata->delete_skipped,
+                      (unsigned)vdata->delete_deleted,
+                      (unsigned)vdata->delete_left));
+       }
+
+       return 0;
+}
+
+/**
+ * Initialize the vacuum_data: zero all counters and allocate the
+ * per-node vacuum_fetch_list buffers. Returns 0, or -1 on OOM. */
+static int ctdb_vacuum_init_vacuum_data(struct ctdb_db_context *ctdb_db,
+                                       struct vacuum_data *vdata)
+{
+       int i;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+       vdata->fast_added_to_delete_list = 0;
+       vdata->fast_added_to_vacuum_fetch_list = 0;
+       vdata->fast_deleted = 0;
+       vdata->fast_skipped = 0;
+       vdata->fast_error = 0;
+       vdata->fast_total = 0;
+       vdata->full_added_to_delete_list = 0;
+       vdata->full_added_to_vacuum_fetch_list = 0;
+       vdata->full_skipped = 0;
+       vdata->full_error = 0;
+       vdata->full_total = 0;
+       vdata->delete_count = 0;
+       vdata->delete_left = 0;
+       vdata->delete_remote_error = 0;
+       vdata->delete_local_error = 0;
+       vdata->delete_skipped = 0;
+       vdata->delete_deleted = 0;
+
+       /* one header-only marshall buffer per node: length num_nodes */
+       vdata->vacuum_fetch_list = talloc_zero_array(vdata,
+                                               struct ctdb_marshall_buffer *,
+                                               ctdb->num_nodes);
+       if (vdata->vacuum_fetch_list == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               return -1;
+       }
+       for (i = 0; i < ctdb->num_nodes; i++) {
+               vdata->vacuum_fetch_list[i] = (struct ctdb_marshall_buffer *)
+                       talloc_zero_size(vdata->vacuum_fetch_list,
+                                        offsetof(struct ctdb_marshall_buffer, data));
+               if (vdata->vacuum_fetch_list[i] == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+                       return -1;
+               }
+               vdata->vacuum_fetch_list[i]->db_id = ctdb_db->db_id;
+       }
+
+       return 0;
+}
+
+/**
+ * Vacuum a DB:
+ *  - Always do the fast vacuuming run, which traverses
+ *    the in-memory delete queue: these records have been
+ *    scheduled for deletion.
+ *  - Only if explicitly requested, the database is traversed
+ *    in order to use the traditional heuristics on empty records
+ *    to trigger deletion.
+ *    This is done only every VacuumFastPathCount'th vacuuming run.
+ *
+ * The traverse runs fill two lists:
+ *
+ * - The delete_list:
+ *   This is the list of empty records the current
+ *   node is lmaster and dmaster for. These records are later
+ *   deleted first on other nodes and then locally.
+ *
+ *   The fast vacuuming run has a short cut for those records
+ *   that have never been migrated with data: these records
+ *   are immediately deleted locally, since they have left
+ *   no trace on other nodes.
+ *
+ * - The vacuum_fetch lists
+ *   (one for each other lmaster node):
+ *   The records in this list are sent for deletion to
+ *   their lmaster in a bulk VACUUM_FETCH message.
+ *
+ *   The lmaster then migrates all these records to itself
+ *   so that they can be vacuumed there.
+ *
+ * This executes in the child context.
+ */
+static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
+                         struct vacuum_data *vdata,
+                         bool full_vacuum_run)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int ret, pnn;
+
+       DEBUG(DEBUG_INFO, (__location__ " Entering %s vacuum run for db "
+                          "%s db_id[0x%08x]\n",
+                          full_vacuum_run ? "full" : "fast",
+                          ctdb_db->db_name, ctdb_db->db_id));
+
+       ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get vnnmap from local node\n"));
+               return ret;
+       }
+
+       pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
+       if (pnn == -1) {
+               DEBUG(DEBUG_ERR, ("Unable to get pnn from local node\n"));
+               return -1;
+       }
+
+       ctdb->pnn = pnn;
+
+       ret = ctdb_vacuum_init_vacuum_data(ctdb_db, vdata);
+       if (ret != 0) {
+               return ret;
+       }
+
+       ctdb_vacuum_db_fast(ctdb_db, vdata);
+
+       ret = ctdb_vacuum_db_full(ctdb_db, vdata, full_vacuum_run);
+       if (ret != 0) {
+               return ret;
+       }
+
+       ret = ctdb_process_vacuum_fetch_lists(ctdb_db, vdata);
+       if (ret != 0) {
+               return ret;
+       }
+
+       ret = ctdb_process_delete_list(ctdb_db, vdata);
+       if (ret != 0) {
+               return ret;
+       }
+
        /* this ensures we run our event queue */
        ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
 
@@ -1335,7 +1422,8 @@ static void remove_record_from_delete_queue(struct ctdb_db_context *ctdb_db,
        hash = (uint32_t)ctdb_hash(&key);
 
        DEBUG(DEBUG_DEBUG, (__location__
-                           " remove_record_from_delete_queue: db[%s] "
+                           " remove_record_from_delete_queue: "
+                           "db[%s] "
                            "db_id[0x%08x] "
                            "key_hash[0x%08x] "
                            "lmaster[%u] "
@@ -1347,15 +1435,29 @@ static void remove_record_from_delete_queue(struct ctdb_db_context *ctdb_db,
 
        kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash);
        if (kd == NULL) {
+               DEBUG(DEBUG_DEBUG, (__location__
+                                   " remove_record_from_delete_queue: "
+                                   "record not in queue (hash[0x%08x])\n.",
+                                   hash));
                return;
        }
-       if (kd->key.dsize != key.dsize) {
-               return;
-       }
-       if (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0) {
+
+       if ((kd->key.dsize != key.dsize) ||
+           (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0))
+       {
+               DEBUG(DEBUG_DEBUG, (__location__
+                                   " remove_record_from_delete_queue: "
+                                   "hash collision for key with hash[0x%08x] "
+                                   "in db[%s] - skipping\n",
+                                   hash, ctdb_db->db_name));
                return;
        }
 
+       DEBUG(DEBUG_DEBUG, (__location__
+                           " remove_record_from_delete_queue: "
+                           "removing key with hash[0x%08x]\n",
+                            hash));
+
        talloc_free(kd);
 
        return;
@@ -1375,7 +1477,7 @@ static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
 
        hash = (uint32_t)ctdb_hash(&key);
 
-       DEBUG(DEBUG_INFO, (__location__ " Schedule for deletion: db[%s] "
+       DEBUG(DEBUG_INFO, (__location__ " schedule for deletion: db[%s] "
                           "db_id[0x%08x] "
                           "key_hash[0x%08x] "
                           "lmaster[%u] "
@@ -1391,13 +1493,15 @@ static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
                    (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0))
                {
                        DEBUG(DEBUG_INFO,
-                             ("schedule for deletion: Hash collision (0x%08x)."
-                              " Skipping the record.\n", hash));
+                             (__location__ " schedule for deletion: "
+                              "hash collision for key hash [0x%08x]. "
+                              "Skipping the record.\n", hash));
                        return 0;
                } else {
                        DEBUG(DEBUG_DEBUG,
-                             ("schedule for deletion: Overwriting entry for "
-                              "key with hash 0x%08x.\n", hash));
+                             (__location__ " schedule for deletion: "
+                              "updating entry for key with hash [0x%08x].\n",
+                              hash));
                }
        }
 
@@ -1405,6 +1509,10 @@ static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
                                                  ctdb_db->delete_queue,
                                                  hdr, key);
        if (ret != 0) {
+               DEBUG(DEBUG_INFO,
+                     (__location__ " schedule for deletion: error "
+                      "inserting key with hash [0x%08x] into delete queue\n",
+                      hash));
                return -1;
        }
 
@@ -1456,8 +1564,16 @@ int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db,
                return ret;
        }
 
-       /* child process: send the main daemon a control */
+       /* If we don't have a connection to the daemon we cannot send
+          a control. This can happen, for example, in the update_record
+          control child process.
+       */
+       if (!ctdb_db->ctdb->can_send_controls) {
+               return -1;
+       }
+
 
+       /* child process: send the main daemon a control */
        indata.dsize = offsetof(struct ctdb_control_schedule_for_deletion, key) + key.dsize;
        indata.dptr = talloc_zero_array(ctdb_db, uint8_t, indata.dsize);
        if (indata.dptr == NULL) {