2 ldb database library using mdb back end
4 Copyright (C) Jakub Hrozek 2014
5 Copyright (C) Catalyst.Net Ltd 2017
7 ** NOTE! The following LGPL license applies to the ldb
8 ** library. This does NOT imply that all of Samba is released
11 This library is free software; you can redistribute it and/or
12 modify it under the terms of the GNU Lesser General Public
13 License as published by the Free Software Foundation; either
14 version 3 of the License, or (at your option) any later version.
16 This library is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License for more details.
21 You should have received a copy of the GNU Lesser General Public
22 License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 #include "../ldb_key_value/ldb_kv.h"
27 #include "include/dlinklist.h"
/* URL scheme prefix for this backend; lmdb_get_path() compares URLs against it. */
29 #define MDB_URL_PREFIX "mdb://"
/* Length of the prefix, not counting the terminating NUL. */
30 #define MDB_URL_PREFIX_SIZE (sizeof(MDB_URL_PREFIX)-1)
/*
 * Key length limit used by this backend: lmdb_pvt_open() fails if the
 * environment reports a smaller maximum key size, and lmdb_connect()
 * stores this value as ldb_kv->max_key_length.
 */
32 #define LDB_MDB_MAX_KEY_LENGTH 511
/*
 * 2^30. The expression is evaluated in int arithmetic, so multiply it by a
 * wider type (lmdb_open_env() uses 8LL) rather than by a plain int.
 */
34 #define GIGABYTE (1024*1024*1024)
/*
 * Translate an LMDB status code into an LDB_ERR_* result code.
 *
 * NOTE(review): this listing omits the switch header and most of the case
 * labels (the embedded line numbers skip), so only part of the mapping is
 * visible. Confirm which codes feed each return against the complete file.
 */
36 int ldb_mdb_err_map(int lmdb_err)
42 return LDB_ERR_OPERATIONS_ERROR;
46 case MDB_INCOMPATIBLE:
49 return LDB_ERR_UNAVAILABLE;
57 return LDB_ERR_PROTOCOL_ERROR;
60 case MDB_READERS_FULL:
66 return LDB_ERR_ENTRY_ALREADY_EXISTS;
69 return LDB_ERR_NO_SUCH_OBJECT;
71 return LDB_ERR_INSUFFICIENT_ACCESS_RIGHTS;
/*
 * ldb_mdb_error(ldb, ecode) is the usual way functions in this file report
 * an LMDB failure: it expands to lmdb_error_at() with the __FILE__ and
 * __LINE__ of the call site.
 */
78 #define ldb_mdb_error(ldb, ecode) lmdb_error_at(ldb, ecode, __FILE__, __LINE__)
/*
 * Map ecode with ldb_mdb_err_map(), fetch its text with mdb_strerror() and
 * set the error string on the ldb context.
 *
 * NOTE(review): the remaining parameters, the format arguments and the
 * return statement are elided in this listing; presumably ldb_err is the
 * value returned - confirm.
 */
79 static int lmdb_error_at(struct ldb_context *ldb,
84 int ldb_err = ldb_mdb_err_map(ecode);
85 char *reason = mdb_strerror(ecode);
86 ldb_asprintf_errstring(ldb,
95 static bool lmdb_transaction_active(struct ldb_kv_private *ldb_kv)
97 return ldb_kv->lmdb_private->txlist != NULL;
/*
 * Return the MDB_txn belonging to a transaction-list entry.
 *
 * NOTE(review): the body is elided in this listing. Callers pass the result
 * of lmdb_private_trans_head() straight in and then test the returned txn,
 * so this presumably tolerates ltx == NULL - confirm.
 */
100 static MDB_txn *lmdb_trans_get_tx(struct lmdb_trans *ltx)
/*
 * Put a newly started transaction on lmdb->txlist.
 */
109 static void trans_push(struct lmdb_private *lmdb, struct lmdb_trans *ltx)
/*
 * Make the current list entry the talloc parent of ltx.
 * NOTE(review): the condition guarding this call (original line 111) is
 * elided in this listing.
 */
112 talloc_steal(lmdb->txlist, ltx);
115 DLIST_ADD(lmdb->txlist, ltx);
/*
 * Take a committed or aborted transaction off lmdb->txlist.
 *
 * NOTE(review): the statement after DLIST_REMOVE (original line 121) is
 * elided; presumably it releases ltx - confirm.
 */
118 static void trans_finished(struct lmdb_private *lmdb, struct lmdb_trans *ltx)
120 DLIST_REMOVE(lmdb->txlist, ltx);
/*
 * Return the entry of lmdb->txlist that callers treat as the current
 * transaction.
 *
 * NOTE(review): the body is elided in this listing; whether the result can
 * be NULL is inferred only from callers that test for it.
 */
125 static struct lmdb_trans *lmdb_private_trans_head(struct lmdb_private *lmdb)
127 struct lmdb_trans *ltx;
/*
 * Choose the LMDB transaction for a read-capable operation: first the one
 * from the transaction list, otherwise lmdb->read_txn. If neither exists,
 * MDB_BAD_TXN is recorded in lmdb->error and an error string is set.
 *
 * NOTE(review): the test on txn after line 138 and the final return are
 * elided in this listing.
 */
134 static MDB_txn *get_current_txn(struct lmdb_private *lmdb)
138 txn = lmdb_trans_get_tx(lmdb_private_trans_head(lmdb));
/* No list transaction: fall back to the read transaction, if one is open. */
142 if (lmdb->read_txn != NULL) {
143 return lmdb->read_txn;
145 lmdb->error = MDB_BAD_TXN;
146 ldb_set_errstring(lmdb->ldb, __location__":No active transaction\n");
/*
 * Write one key/value pair using the transaction from the transaction list.
 *
 * A read-only handle is refused with LDB_ERR_UNWILLING_TO_PERFORM. flags
 * follows TDB conventions: TDB_INSERT becomes MDB_NOOVERWRITE, and
 * TDB_MODIFY first checks with mdb_get() that the key already exists.
 * Each LMDB status is stored in lmdb->error and returned as an LDB code via
 * ldb_mdb_error() or ldb_mdb_err_map().
 *
 * NOTE(review): the remaining parameters, the local declarations and
 * several closing lines are elided in this listing.
 */
150 static int lmdb_store(struct ldb_kv_private *ldb_kv,
155 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
162 if (ldb_kv->read_only) {
163 return LDB_ERR_UNWILLING_TO_PERFORM;
/*
 * Only the transaction list is consulted here; unlike get_current_txn()
 * there is no fallback to lmdb->read_txn.
 */
166 txn = lmdb_trans_get_tx(lmdb_private_trans_head(lmdb));
168 ldb_debug(lmdb->ldb, LDB_DEBUG_FATAL, "No transaction");
169 lmdb->error = MDB_PANIC;
170 return ldb_mdb_error(lmdb->ldb, lmdb->error);
/* A NULL name asks LMDB for the unnamed database of the environment. */
173 lmdb->error = mdb_dbi_open(txn, NULL, 0, &dbi);
174 if (lmdb->error != MDB_SUCCESS) {
175 return ldb_mdb_error(lmdb->ldb, lmdb->error);
/* The MDB_val structures alias the caller buffers; nothing is copied. */
178 mdb_key.mv_size = key.length;
179 mdb_key.mv_data = key.data;
181 mdb_data.mv_size = data.length;
182 mdb_data.mv_data = data.data;
184 if (flags == TDB_INSERT) {
185 mdb_flags = MDB_NOOVERWRITE;
/* NOTE(review): the doubled parentheses below are redundant. */
186 } else if ((flags == TDB_MODIFY)) {
188 * Modifying a record, ensure that it exists.
189 * This mimics the TDB semantics
192 lmdb->error = mdb_get(txn, dbi, &mdb_key, &value);
193 if (lmdb->error != MDB_SUCCESS) {
194 return ldb_mdb_error(lmdb->ldb, lmdb->error);
201 lmdb->error = mdb_put(txn, dbi, &mdb_key, &mdb_data, mdb_flags);
202 if (lmdb->error != MDB_SUCCESS) {
203 return ldb_mdb_error(lmdb->ldb, lmdb->error);
206 return ldb_mdb_err_map(lmdb->error);
/*
 * Delete the record stored under key using the transaction from the
 * transaction list.
 *
 * A read-only handle is refused with LDB_ERR_UNWILLING_TO_PERFORM. Each
 * LMDB status is stored in lmdb->error; the value returned is always an LDB
 * code (from ldb_mdb_error() or ldb_mdb_err_map()), never a raw LMDB code.
 *
 * NOTE(review): local declarations and the test on txn are elided in this
 * listing.
 */
209 static int lmdb_delete(struct ldb_kv_private *ldb_kv, struct ldb_val key)
211 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
216 if (ldb_kv->read_only) {
217 return LDB_ERR_UNWILLING_TO_PERFORM;
/* Only the transaction list is consulted; lmdb->read_txn is not a fallback. */
220 txn = lmdb_trans_get_tx(lmdb_private_trans_head(lmdb));
222 ldb_debug(lmdb->ldb, LDB_DEBUG_FATAL, "No transaction");
223 lmdb->error = MDB_PANIC;
224 return ldb_mdb_error(lmdb->ldb, lmdb->error);
/* A NULL name asks LMDB for the unnamed database of the environment. */
227 lmdb->error = mdb_dbi_open(txn, NULL, 0, &dbi);
228 if (lmdb->error != MDB_SUCCESS) {
229 return ldb_mdb_error(lmdb->ldb, lmdb->error);
232 mdb_key.mv_size = key.length;
233 mdb_key.mv_data = key.data;
/* The NULL data argument means the delete is selected by key alone. */
235 lmdb->error = mdb_del(txn, dbi, &mdb_key, NULL);
236 if (lmdb->error != MDB_SUCCESS) {
237 return ldb_mdb_error(lmdb->ldb, lmdb->error);
239 return ldb_mdb_err_map(lmdb->error);
/*
 * Walk the database with a cursor and call fn(ldb_kv, key, data, ctx) for
 * each record returned by mdb_cursor_get(..., MDB_NEXT).
 *
 * The transaction comes from get_current_txn(), so either a list
 * transaction or the read transaction is accepted. MDB_NOTFOUND ending the
 * loop is the normal end of the walk and is reset to MDB_SUCCESS. The
 * cursor is closed before returning, and the result is an LDB code.
 *
 * NOTE(review): the remaining parameter, the local declarations, the action
 * taken when mdb_cursor_open() fails and the handling of the value returned
 * by fn are elided in this listing.
 */
242 static int lmdb_traverse_fn(struct ldb_kv_private *ldb_kv,
243 ldb_kv_traverse_fn fn,
246 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
251 MDB_cursor *cursor = NULL;
254 txn = get_current_txn(lmdb);
256 ldb_debug(lmdb->ldb, LDB_DEBUG_FATAL, "No transaction");
257 lmdb->error = MDB_PANIC;
258 return ldb_mdb_error(lmdb->ldb, lmdb->error);
261 lmdb->error = mdb_dbi_open(txn, NULL, 0, &dbi);
262 if (lmdb->error != MDB_SUCCESS) {
263 return ldb_mdb_error(lmdb->ldb, lmdb->error);
266 lmdb->error = mdb_cursor_open(txn, dbi, &cursor);
267 if (lmdb->error != MDB_SUCCESS) {
271 while ((lmdb->error = mdb_cursor_get(
273 &mdb_data, MDB_NEXT)) == MDB_SUCCESS) {
/* key and data alias the buffers handed back by LMDB; nothing is copied. */
275 struct ldb_val key = {
276 .length = mdb_key.mv_size,
277 .data = mdb_key.mv_data,
279 struct ldb_val data = {
280 .length = mdb_data.mv_size,
281 .data = mdb_data.mv_data,
284 ret = fn(ldb_kv, key, data, ctx);
/* Running off the end of the database is not an error. */
289 if (lmdb->error == MDB_NOTFOUND) {
290 lmdb->error = MDB_SUCCESS;
/* cursor stays NULL when mdb_cursor_open() did not succeed. */
293 if (cursor != NULL) {
294 mdb_cursor_close(cursor);
297 if (lmdb->error != MDB_SUCCESS) {
298 return ldb_mdb_error(lmdb->ldb, lmdb->error);
300 return ldb_mdb_err_map(lmdb->error);
303 static int lmdb_update_in_iterate(struct ldb_kv_private *ldb_kv,
309 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
311 int ret = LDB_SUCCESS;
314 * Need to take a copy of the data as the delete operation alters the
315 * data, as it is in private lmdb memory.
317 copy.length = data.length;
318 copy.data = talloc_memdup(ldb_kv, data.data, data.length);
319 if (copy.data == NULL) {
320 lmdb->error = MDB_PANIC;
321 return ldb_oom(lmdb->ldb);
324 lmdb->error = lmdb_delete(ldb_kv, key);
325 if (lmdb->error != MDB_SUCCESS) {
329 "Failed to delete %*.*s "
330 "for rekey as %*.*s: %s",
331 (int)key.length, (int)key.length,
332 (const char *)key.data,
333 (int)key2.length, (int)key2.length,
334 (const char *)key.data,
335 mdb_strerror(lmdb->error));
336 ret = ldb_mdb_error(lmdb->ldb, lmdb->error);
340 lmdb->error = lmdb_store(ldb_kv, key2, copy, 0);
341 if (lmdb->error != MDB_SUCCESS) {
345 "Failed to rekey %*.*s as %*.*s: %s",
346 (int)key.length, (int)key.length,
347 (const char *)key.data,
348 (int)key2.length, (int)key2.length,
349 (const char *)key.data,
350 mdb_strerror(lmdb->error));
351 ret = ldb_mdb_error(lmdb->ldb, lmdb->error);
356 if (copy.data != NULL) {
357 TALLOC_FREE(copy.data);
362 * Explicity invalidate the data, as the delete has done this
/*
 * Look up key and hand the stored value to parser(key, data, ctx).
 *
 * The transaction comes from get_current_txn(). A missing key yields
 * LDB_ERR_NO_SUCH_OBJECT without setting an error string; other LMDB
 * failures go through ldb_mdb_error(). On success the return value is
 * whatever parser returns. The data passed to parser aliases the buffer
 * returned by mdb_get(); it is not copied here.
 *
 * NOTE(review): this is the only function in the file that calls
 * mdb_dbi_close() on the handle from mdb_dbi_open(); lmdb_store(),
 * lmdb_delete() and lmdb_traverse_fn() leave it open. The existing TODO
 * comments question the call as well.
 * NOTE(review): the remaining parameters and local declarations are elided
 * in this listing.
 */
370 /* Handles only a single record */
371 static int lmdb_parse_record(struct ldb_kv_private *ldb_kv,
373 int (*parser)(struct ldb_val key,
378 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
385 txn = get_current_txn(lmdb);
387 ldb_debug(lmdb->ldb, LDB_DEBUG_FATAL, "No transaction active");
388 lmdb->error = MDB_PANIC;
389 return ldb_mdb_error(lmdb->ldb, lmdb->error);
392 lmdb->error = mdb_dbi_open(txn, NULL, 0, &dbi);
393 if (lmdb->error != MDB_SUCCESS) {
394 return ldb_mdb_error(lmdb->ldb, lmdb->error);
397 mdb_key.mv_size = key.length;
398 mdb_key.mv_data = key.data;
400 lmdb->error = mdb_get(txn, dbi, &mdb_key, &mdb_data);
401 if (lmdb->error != MDB_SUCCESS) {
402 /* TODO closing a handle should not even be necessary */
403 mdb_dbi_close(lmdb->env, dbi);
404 if (lmdb->error == MDB_NOTFOUND) {
405 return LDB_ERR_NO_SUCH_OBJECT;
407 return ldb_mdb_error(lmdb->ldb, lmdb->error);
409 data.data = mdb_data.mv_data;
410 data.length = mdb_data.mv_size;
412 /* TODO closing a handle should not even be necessary */
413 mdb_dbi_close(lmdb->env, dbi);
415 return parser(key, data, ctx);
/*
 * Take a read lock on the database.
 *
 * Fails with LDB_ERR_PROTOCOL_ERROR (and MDB_BAD_TXN in lmdb->error) when
 * the calling process is not the one recorded in lmdb->pid. When no list
 * transaction is active and read_lock_count is zero, an LMDB transaction is
 * begun; in every successful case read_lock_count is incremented.
 *
 * NOTE(review): the arguments of mdb_txn_begin() and the tail of the error
 * message are elided in this listing. lmdb_unlock_read() commits
 * lmdb->read_txn, so presumably that is where the new transaction is
 * stored - confirm.
 */
419 static int lmdb_lock_read(struct ldb_module *module)
421 void *data = ldb_module_get_private(module);
422 struct ldb_kv_private *ldb_kv =
423 talloc_get_type(data, struct ldb_kv_private);
424 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
425 pid_t pid = getpid();
/* Refuse to use a handle in a process other than the one recorded at open time. */
427 if (pid != lmdb->pid) {
428 ldb_asprintf_errstring(
430 __location__": Reusing ldb opened by pid %d in "
434 lmdb->error = MDB_BAD_TXN;
435 return LDB_ERR_PROTOCOL_ERROR;
438 lmdb->error = MDB_SUCCESS;
/* Only the first read lock taken outside a list transaction begins a transaction. */
439 if (lmdb_transaction_active(ldb_kv) == false &&
440 ldb_kv->read_lock_count == 0) {
441 lmdb->error = mdb_txn_begin(lmdb->env,
446 if (lmdb->error != MDB_SUCCESS) {
447 return ldb_mdb_error(lmdb->ldb, lmdb->error);
450 ldb_kv->read_lock_count++;
451 return ldb_mdb_err_map(lmdb->error);
/*
 * Release one read lock.
 *
 * When no list transaction is active and this is the last outstanding read
 * lock, lmdb->read_txn is finished with mdb_txn_commit() and cleared. In
 * both branches read_lock_count is decremented once.
 *
 * NOTE(review): the result of mdb_txn_commit() is not checked, and there is
 * no guard against read_lock_count already being zero. The return
 * statements are elided in this listing.
 */
454 static int lmdb_unlock_read(struct ldb_module *module)
456 void *data = ldb_module_get_private(module);
457 struct ldb_kv_private *ldb_kv =
458 talloc_get_type(data, struct ldb_kv_private);
460 if (lmdb_transaction_active(ldb_kv) == false &&
461 ldb_kv->read_lock_count == 1) {
462 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
463 mdb_txn_commit(lmdb->read_txn);
464 lmdb->read_txn = NULL;
465 ldb_kv->read_lock_count--;
468 ldb_kv->read_lock_count--;
472 static int lmdb_transaction_start(struct ldb_kv_private *ldb_kv)
474 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
475 struct lmdb_trans *ltx;
476 struct lmdb_trans *ltx_head;
478 pid_t pid = getpid();
480 /* Do not take out the transaction lock on a read-only DB */
481 if (ldb_kv->read_only) {
482 return LDB_ERR_UNWILLING_TO_PERFORM;
485 ltx = talloc_zero(lmdb, struct lmdb_trans);
487 return ldb_oom(lmdb->ldb);
490 if (pid != lmdb->pid) {
491 ldb_asprintf_errstring(
493 __location__": Reusing ldb opened by pid %d in "
497 lmdb->error = MDB_BAD_TXN;
498 return LDB_ERR_PROTOCOL_ERROR;
501 ltx_head = lmdb_private_trans_head(lmdb);
503 tx_parent = lmdb_trans_get_tx(ltx_head);
505 lmdb->error = mdb_txn_begin(lmdb->env, tx_parent, 0, <x->tx);
506 if (lmdb->error != MDB_SUCCESS) {
507 return ldb_mdb_error(lmdb->ldb, lmdb->error);
510 trans_push(lmdb, ltx);
512 return ldb_mdb_err_map(lmdb->error);
/*
 * Abort the current list transaction with mdb_txn_abort() and take it off
 * the list with trans_finished().
 *
 * NOTE(review): the test that leads to LDB_ERR_OPERATIONS_ERROR (presumably
 * ltx == NULL) and the final return are elided in this listing.
 */
515 static int lmdb_transaction_cancel(struct ldb_kv_private *ldb_kv)
517 struct lmdb_trans *ltx;
518 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
520 ltx = lmdb_private_trans_head(lmdb);
522 return LDB_ERR_OPERATIONS_ERROR;
525 mdb_txn_abort(ltx->tx);
526 trans_finished(lmdb, ltx);
/*
 * Prepare-commit hook for the key/value layer; installed as .prepare_write
 * in lmdb_key_value_ops. This backend has no work to do at this stage.
 *
 * NOTE(review): the return statement is elided in this listing.
 */
530 static int lmdb_transaction_prepare_commit(struct ldb_kv_private *ldb_kv)
532 /* No need to prepare a commit */
/*
 * Commit the current list transaction with mdb_txn_commit(), store the
 * status in lmdb->error and take the entry off the list.
 *
 * NOTE(review): the test that leads to LDB_ERR_OPERATIONS_ERROR (presumably
 * ltx == NULL) and the final return are elided in this listing. Check that
 * the value returned is an LDB code (for example via ldb_mdb_err_map) and
 * not the raw LMDB status, as the other functions in this file do.
 */
536 static int lmdb_transaction_commit(struct ldb_kv_private *ldb_kv)
538 struct lmdb_trans *ltx;
539 struct lmdb_private *lmdb = ldb_kv->lmdb_private;
541 ltx = lmdb_private_trans_head(lmdb);
543 return LDB_ERR_OPERATIONS_ERROR;
546 lmdb->error = mdb_txn_commit(ltx->tx);
547 trans_finished(lmdb, ltx);
552 static int lmdb_error(struct ldb_kv_private *ldb_kv)
554 return ldb_mdb_err_map(ldb_kv->lmdb_private->error);
557 static const char *lmdb_errorstr(struct ldb_kv_private *ldb_kv)
559 return mdb_strerror(ldb_kv->lmdb_private->error);
/*
 * Return a name string for this backend.
 *
 * NOTE(review): the body is elided in this listing, so the returned string
 * cannot be stated here.
 */
562 static const char *lmdb_name(struct ldb_kv_private *ldb_kv)
/*
 * Change-detection hook; installed as .has_changed in lmdb_key_value_ops.
 * According to the original comment below it always reports true.
 *
 * NOTE(review): the return statement is elided in this listing.
 */
567 static bool lmdb_changed(struct ldb_kv_private *ldb_kv)
570 * lmdb does no provide a quick way to determine if the database
571 * has changed. This function always returns true.
573 * Note that tdb uses a sequence number that allows this function
574 * to be implemented efficiently.
/*
 * Operations table for this backend. lmdb_connect() installs it as
 * ldb_kv->kv_ops, which is how the generic key/value layer reaches the
 * functions in this file.
 *
 * NOTE(review): three initializers (original lines 580, 591 and 593) are
 * elided in this listing; presumably they wire up lmdb_store, lmdb_error
 * and lmdb_name - confirm.
 */
579 static struct kv_db_ops lmdb_key_value_ops = {
581 .delete = lmdb_delete,
582 .iterate = lmdb_traverse_fn,
583 .update_in_iterate = lmdb_update_in_iterate,
584 .fetch_and_parse = lmdb_parse_record,
585 .lock_read = lmdb_lock_read,
586 .unlock_read = lmdb_unlock_read,
587 .begin_write = lmdb_transaction_start,
588 .prepare_write = lmdb_transaction_prepare_commit,
589 .finish_write = lmdb_transaction_commit,
590 .abort_write = lmdb_transaction_cancel,
592 .errorstr = lmdb_errorstr,
594 .has_changed = lmdb_changed,
595 .transaction_active = lmdb_transaction_active,
/*
 * Derive the database file path from the URL given to lmdb_connect().
 *
 * A URL containing a colon must start with MDB_URL_PREFIX; the path is then
 * the text after the prefix. The result points into url, it is not a copy.
 *
 * NOTE(review): the branch taken on a prefix mismatch, the handling of a
 * URL without a colon and the return statement are elided in this listing.
 */
598 static const char *lmdb_get_path(const char *url)
603 if (strchr(url, ':')) {
604 if (strncmp(url, MDB_URL_PREFIX, MDB_URL_PREFIX_SIZE) != 0) {
607 path = url + MDB_URL_PREFIX_SIZE;
/*
 * talloc destructor for struct lmdb_private, registered in lmdb_pvt_open().
 *
 * In a process other than the one recorded in lmdb->pid it takes a separate
 * path that, per the original comments, must not close the environment or
 * finish transactions. Otherwise it aborts the read transaction if one is
 * open and then aborts and unlinks every transaction left on the list.
 *
 * NOTE(review): what the forked-child branch does with the fd obtained from
 * mdb_env_get_fd(), the body of the env == NULL check and the return
 * statements are elided in this listing.
 * NOTE(review): lmdb->read_txn is not reset to NULL after mdb_txn_abort()
 * in the visible lines.
 */
615 static int lmdb_pvt_destructor(struct lmdb_private *lmdb)
617 struct lmdb_trans *ltx = NULL;
619 /* Check if this is a forked child */
620 if (getpid() != lmdb->pid) {
623 * We cannot call mdb_env_close or commit any transactions,
624 * otherwise they might appear finished in the parent.
628 if (mdb_env_get_fd(lmdb->env, &fd) == 0) {
632 /* Remove the pointer, so that no access should occur */
639 * Close the read transaction if it's open
641 if (lmdb->read_txn != NULL) {
642 mdb_txn_abort(lmdb->read_txn);
645 if (lmdb->env == NULL) {
650 * Abort any currently active transactions
652 ltx = lmdb_private_trans_head(lmdb);
653 while (ltx != NULL) {
654 mdb_txn_abort(ltx->tx);
655 trans_finished(lmdb, ltx);
656 ltx = lmdb_private_trans_head(lmdb);
/*
 * List node wrapping one opened MDB_env. lmdb_open_env() searches mdb_list
 * for an entry whose device and inode match the file being opened and
 * reuses it instead of opening the same file twice.
 *
 * NOTE(review): the fields after next/prev are elided in this listing; the
 * rest of the file reads w->env, w->device and w->inode.
 */
663 struct mdb_env_wrap {
664 struct mdb_env_wrap *next, *prev;
/* Head of the list of wrapped environments; file scope. */
671 static struct mdb_env_wrap *mdb_list;
/*
 * talloc destructor for struct mdb_env_wrap, registered in lmdb_open_env():
 * closes the wrapped environment and unlinks the node from mdb_list.
 *
 * NOTE(review): the return statement is elided in this listing.
 */
673 /* destroy the last connection to an mdb */
674 static int mdb_env_wrap_destructor(struct mdb_env_wrap *w)
676 mdb_env_close(w->env);
677 DLIST_REMOVE(mdb_list, w);
/*
 * Open (or reuse) the LMDB environment for path and return it through env.
 *
 * If path already exists and an entry on mdb_list matches its device and
 * inode, that entry is referenced from mem_ctx instead of opening the file
 * again. Otherwise a new environment is created, sized, opened with
 * MDB_NOSUBDIR|MDB_NOTLS (plus MDB_NOSYNC when LDB_FLG_NOSYNC is set), its
 * file descriptor is marked close-on-exec, and a wrapper recording the
 * device and inode is added to mdb_list with mdb_env_wrap_destructor as its
 * destructor. Failures set an error string on ldb and return an LDB code.
 *
 * NOTE(review): several parameters, local declarations, the third term of
 * the list-match condition, the success returns and the cleanup done on the
 * error paths are elided in this listing.
 */
681 static int lmdb_open_env(TALLOC_CTX *mem_ctx,
683 struct ldb_context *ldb,
/*
 * NOTE(review): where size_t is 32 bits wide this 8 GiB value does not fit
 * and is reduced modulo 2^32 - confirm the supported targets.
 */
688 const size_t mmap_size = 8LL * GIGABYTE;
689 unsigned int mdb_flags = MDB_NOSUBDIR|MDB_NOTLS;
691 * MDB_NOSUBDIR implies there is a separate file called path and a
692 * separate lockfile called path-lock
695 struct mdb_env_wrap *w;
697 pid_t pid = getpid();
/* Look for an environment this process already has open on the same file. */
701 if (stat(path, &st) == 0) {
702 for (w=mdb_list;w;w=w->next) {
703 if (st.st_dev == w->device &&
704 st.st_ino == w->inode &&
707 * We must have only one MDB_env per process
709 if (!talloc_reference(mem_ctx, w)) {
/* No reusable entry: allocate a fresh wrapper owned by mem_ctx. */
718 w = talloc(mem_ctx, struct mdb_env_wrap);
723 ret = mdb_env_create(env);
725 ldb_asprintf_errstring(
727 "Could not create MDB environment %s: %s\n",
730 return ldb_mdb_err_map(ret);
734 * Currently we set a 8Gb maximum database size
735 * via the constant mmap_size above
737 ret = mdb_env_set_mapsize(*env, mmap_size);
739 ldb_asprintf_errstring(
741 "Could not set MDB mmap() size to %llu on %s: %s\n",
742 (unsigned long long)(mmap_size),
746 return ldb_mdb_err_map(ret);
/* NOTE(review): the result of mdb_env_set_maxreaders() is not checked. */
749 mdb_env_set_maxreaders(*env, 100000);
751 * As we ensure that there is only one MDB_env open per database per
752 * process. We can not use the MDB_RDONLY flag, as another ldb may be
753 * opened in read write mode
755 if (flags & LDB_FLG_NOSYNC) {
756 mdb_flags |= MDB_NOSYNC;
758 ret = mdb_env_open(*env, path, mdb_flags, 0644);
760 ldb_asprintf_errstring(ldb,
761 "Could not open DB %s: %s\n",
762 path, mdb_strerror(ret));
764 return ldb_mdb_err_map(ret);
767 ret = mdb_env_get_fd(*env, &fd);
769 ldb_asprintf_errstring(ldb,
770 "Could not obtain DB FD %s: %s\n",
771 path, mdb_strerror(ret));
773 return ldb_mdb_err_map(ret);
/*
 * NOTE(review): neither fcntl() result is checked. If F_GETFD fails, v is
 * -1 and that value is OR-ed into the flags passed to F_SETFD.
 */
776 /* Just as for TDB: on exec, don't inherit the fd */
777 v = fcntl(fd, F_GETFD, 0);
778 fcntl(fd, F_SETFD, v | FD_CLOEXEC);
/* stat the open descriptor so device and inode describe the file actually in use. */
780 if (fstat(fd, &st) != 0) {
781 ldb_asprintf_errstring(
783 "Could not stat %s:\n",
786 return LDB_ERR_OPERATIONS_ERROR;
789 w->device = st.st_dev;
790 w->inode = st.st_ino;
793 talloc_set_destructor(w, mdb_env_wrap_destructor);
795 DLIST_ADD(mdb_list, w);
/*
 * Open the LMDB environment for path and finish initialising lmdb.
 *
 * With LDB_FLG_DONT_CREATE_DB the file must already exist, otherwise
 * LDB_ERR_UNAVAILABLE is returned. After lmdb_open_env() the destructor is
 * registered, the opening pid is recorded, and the environment is rejected
 * with ldb_operr() if its maximum key size is below LDB_MDB_MAX_KEY_LENGTH.
 *
 * NOTE(review): the remaining parameters, the check of the lmdb_open_env()
 * result and the final return are elided in this listing.
 */
801 static int lmdb_pvt_open(struct lmdb_private *lmdb,
802 struct ldb_context *ldb,
807 int lmdb_max_key_length;
809 if (flags & LDB_FLG_DONT_CREATE_DB) {
811 if (stat(path, &st) != 0) {
812 return LDB_ERR_UNAVAILABLE;
/* lmdb doubles as the talloc context that owns the environment reference. */
816 ret = lmdb_open_env(lmdb, &lmdb->env, ldb, path, flags);
821 /* Close when lmdb is released */
822 talloc_set_destructor(lmdb, lmdb_pvt_destructor);
824 /* Store the original pid during the LMDB open */
825 lmdb->pid = getpid();
827 lmdb_max_key_length = mdb_env_get_maxkeysize(lmdb->env);
829 /* This will never happen, but if it does make sure to freak out */
830 if (lmdb_max_key_length < LDB_MDB_MAX_KEY_LENGTH) {
831 return ldb_operr(ldb);
/*
 * Backend entry point: connect an ldb context to an LMDB database.
 *
 * Requires a private event context on ldb, derives the file path from url,
 * allocates the ldb_kv_private (child of ldb) and lmdb_private (child of
 * ldb_kv) state, installs lmdb_key_value_ops, opens the environment with
 * lmdb_pvt_open(), marks the handle read-only when LDB_FLG_RDONLY is set,
 * fixes max_key_length at LDB_MDB_MAX_KEY_LENGTH and finally hands over to
 * ldb_kv_init_store(), whose result is returned.
 *
 * NOTE(review): the url and flags parameters, the NULL checks after
 * lmdb_get_path() and the two allocations, and the cleanup performed when
 * lmdb_pvt_open() fails are elided in this listing.
 */
837 int lmdb_connect(struct ldb_context *ldb,
840 const char *options[],
841 struct ldb_module **_module)
843 const char *path = NULL;
844 struct lmdb_private *lmdb = NULL;
845 struct ldb_kv_private *ldb_kv = NULL;
849 * We hold locks, so we must use a private event context
850 * on each returned handle
852 ldb_set_require_private_event_context(ldb);
854 path = lmdb_get_path(url);
856 ldb_debug(ldb, LDB_DEBUG_ERROR, "Invalid mdb URL '%s'", url);
857 return LDB_ERR_OPERATIONS_ERROR;
860 ldb_kv = talloc_zero(ldb, struct ldb_kv_private);
863 return LDB_ERR_OPERATIONS_ERROR;
866 lmdb = talloc_zero(ldb_kv, struct lmdb_private);
872 ldb_kv->kv_ops = &lmdb_key_value_ops;
874 ret = lmdb_pvt_open(lmdb, ldb, path, flags);
875 if (ret != LDB_SUCCESS) {
880 ldb_kv->lmdb_private = lmdb;
881 if (flags & LDB_FLG_RDONLY) {
882 ldb_kv->read_only = true;
886 * This maximum length becomes encoded in the index values so
887 * must never change even if LMDB starts to allow longer keys.
888 * The override option is max_key_len_for_self_test, and is
889 * used for testing only.
891 ldb_kv->max_key_length = LDB_MDB_MAX_KEY_LENGTH;
893 return ldb_kv_init_store(
894 ldb_kv, "ldb_mdb backend", ldb, options, _module);