/*
   Unix SMB/CIFS implementation.
   global locks based on dbwrap and messaging
   Copyright (C) 2009 by Volker Lendecke

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include "includes.h"
#include "system/filesys.h"
#include "g_lock.h"
#include "ctdbd_conn.h"
#include "../lib/util/select.h"
#include "system/select.h"
static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
				    struct server_id pid);
struct g_lock_ctx {
	struct db_context *db;
	struct messaging_context *msg;
};
/*
 * The "g_lock.tdb" file contains records, indexed by the 0-terminated
 * lockname. The record contains an array of "struct g_lock_rec"
 * structures. Waiters have the lock_type with G_LOCK_PENDING or'ed.
 */
struct g_lock_rec {
	enum g_lock_type lock_type;
	struct server_id pid;
};
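/*
 * Open (or create) g_lock.tdb and set up a g_lock context. The messaging
 * context is kept around to wake up waiters on unlock.
 */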
struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx,
				   struct messaging_context *msg)
{
	struct g_lock_ctx *result;

	result = talloc(mem_ctx, struct g_lock_ctx);
	if (result == NULL) {
		return NULL;
	}
	result->msg = msg;

	result->db = db_open(result, lock_path("g_lock.tdb"), 0,
			     TDB_CLEAR_IF_FIRST|TDB_INCOMPATIBLE_HASH,
			     O_RDWR|O_CREAT, 0600);
	if (result->db == NULL) {
		DEBUG(1, ("g_lock_init: Could not open g_lock.tdb\n"));
		TALLOC_FREE(result);
		return NULL;
	}
	return result;
}
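/*
 * Does an existing lock record conflict with a new request of the given
 * type? Pending entries never conflict, they are merely waiting.
 */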
static bool g_lock_conflicts(enum g_lock_type lock_type,
			     const struct g_lock_rec *rec)
{
	enum g_lock_type rec_lock = rec->lock_type;

	if ((rec_lock & G_LOCK_PENDING) != 0) {
		return false;
	}

	/*
	 * Only tested write locks so far. Very likely this routine
	 * needs to be fixed for read locks....
	 */
	if ((lock_type == G_LOCK_READ) && (rec_lock == G_LOCK_READ)) {
		return false;
	}
	return true;
}
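/*
 * Parse a g_lock.tdb record into a talloc'ed array of g_lock_rec. Entries
 * whose non-pending owner no longer exists are dropped while parsing.
 */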
static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
			 int *pnum_locks, struct g_lock_rec **plocks)
{
	int i, num_locks;
	struct g_lock_rec *locks;

	if ((data.dsize % sizeof(struct g_lock_rec)) != 0) {
		DEBUG(1, ("invalid lock record length %d\n", (int)data.dsize));
		return false;
	}

	num_locks = data.dsize / sizeof(struct g_lock_rec);
	locks = talloc_array(mem_ctx, struct g_lock_rec, num_locks);
	if (locks == NULL) {
		DEBUG(1, ("talloc failed\n"));
		return false;
	}

	memcpy(locks, data.dptr, data.dsize);

	DEBUG(10, ("locks:\n"));
	for (i=0; i<num_locks; i++) {
		DEBUGADD(10, ("%s: %s %s\n",
			      server_id_str(talloc_tos(), &locks[i].pid),
			      ((locks[i].lock_type & 1) == G_LOCK_READ) ?
			      "read" : "write",
			      (locks[i].lock_type & G_LOCK_PENDING) ?
			      "(pending)" : "(owner)"));

		if (((locks[i].lock_type & G_LOCK_PENDING) == 0)
		    && !process_exists(locks[i].pid)) {

			DEBUGADD(10, ("lock owner %s died -- discarding\n",
				      server_id_str(talloc_tos(),
						    &locks[i].pid)));

			if (i < (num_locks-1)) {
				locks[i] = locks[num_locks-1];
			}
			num_locks -= 1;
		}
	}

	*plocks = locks;
	*pnum_locks = num_locks;
	return true;
}
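/*
 * Remove entries of dead processes from an already parsed lock array,
 * adjusting the count in place.
 */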
static void g_lock_cleanup(int *pnum_locks, struct g_lock_rec *locks)
{
	int i, num_locks;

	num_locks = *pnum_locks;

	DEBUG(10, ("g_lock_cleanup: %d locks\n", num_locks));

	for (i=0; i<num_locks; i++) {
		if (process_exists(locks[i].pid)) {
			continue;
		}
		DEBUGADD(10, ("%s does not exist -- discarding\n",
			      server_id_str(talloc_tos(), &locks[i].pid)));

		if (i < (num_locks-1)) {
			locks[i] = locks[num_locks-1];
		}
		num_locks -= 1;
	}
	*pnum_locks = num_locks;
	return;
}
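/*
 * Append a lock record for "pid" to the talloc'ed lock array, growing it
 * by one element.
 */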
static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
					struct g_lock_rec *locks,
					int *pnum_locks,
					const struct server_id pid,
					enum g_lock_type lock_type)
{
	struct g_lock_rec *result;
	int num_locks = *pnum_locks;

	result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
				num_locks+1);
	if (result == NULL) {
		return NULL;
	}

	result[num_locks].pid = pid;
	result[num_locks].lock_type = lock_type;
	*pnum_locks += 1;
	return result;
}
static void g_lock_got_retry(struct messaging_context *msg,
			     void *private_data,
			     uint32_t msg_type,
			     struct server_id server_id,
			     DATA_BLOB *data);
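/*
 * Make one attempt at getting the lock under the dbwrap record lock.
 * Returns NT_STATUS_OK if we now hold it, STATUS_PENDING if we were
 * stored as a waiter, or an error otherwise.
 */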
static NTSTATUS g_lock_trylock(struct g_lock_ctx *ctx, const char *name,
			       enum g_lock_type lock_type)
{
	struct db_record *rec = NULL;
	struct g_lock_rec *locks = NULL;
	int i, our_index;
	struct server_id self;
	int num_locks;
	TDB_DATA data;
	NTSTATUS status = NT_STATUS_OK;
	NTSTATUS store_status;

again:
	rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
				    string_term_tdb_data(name));
	if (rec == NULL) {
		DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
		status = NT_STATUS_LOCK_NOT_GRANTED;
		goto done;
	}

	if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
		DEBUG(10, ("g_lock_parse for %s failed\n", name));
		status = NT_STATUS_INTERNAL_ERROR;
		goto done;
	}

	self = messaging_server_id(ctx->msg);
	our_index = -1;

	for (i=0; i<num_locks; i++) {
		if (procid_equal(&self, &locks[i].pid)) {
			if (our_index != -1) {
				DEBUG(1, ("g_lock_trylock: Added ourself "
					  "twice!\n"));
				status = NT_STATUS_INTERNAL_ERROR;
				goto done;
			}
			if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
				DEBUG(1, ("g_lock_trylock: Found ourself not "
					  "pending!\n"));
				status = NT_STATUS_INTERNAL_ERROR;
				goto done;
			}

			our_index = i;

			/* never conflict with ourself */
			continue;
		}
		if (g_lock_conflicts(lock_type, &locks[i])) {
			struct server_id pid = locks[i].pid;

			if (!process_exists(pid)) {
				TALLOC_FREE(locks);
				TALLOC_FREE(rec);
				status = g_lock_force_unlock(ctx, name, pid);
				if (!NT_STATUS_IS_OK(status)) {
					DEBUG(1, ("Could not unlock dead lock "
						  "holder!\n"));
					goto done;
				}
				goto again;
			}
			lock_type |= G_LOCK_PENDING;
		}
	}

	if (our_index == -1) {
		/* First round, add ourself */

		locks = g_lock_addrec(talloc_tos(), locks, &num_locks,
				      self, lock_type);
		if (locks == NULL) {
			DEBUG(10, ("g_lock_addrec failed\n"));
			status = NT_STATUS_NO_MEMORY;
			goto done;
		}
	} else {
		/*
		 * Retry. We were pending last time. Overwrite the
		 * stored lock_type with what we calculated, we might
		 * have acquired the lock this time.
		 */
		locks[our_index].lock_type = lock_type;
	}

	if (NT_STATUS_IS_OK(status) && ((lock_type & G_LOCK_PENDING) == 0)) {
		/*
		 * Walk through the list of locks, search for dead entries
		 */
		g_lock_cleanup(&num_locks, locks);
	}

	data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
	store_status = rec->store(rec, data, 0);
	if (!NT_STATUS_IS_OK(store_status)) {
		DEBUG(1, ("rec->store failed: %s\n",
			  nt_errstr(store_status)));
		status = store_status;
	}

done:
	TALLOC_FREE(locks);
	TALLOC_FREE(rec);

	if (NT_STATUS_IS_OK(status) && (lock_type & G_LOCK_PENDING) != 0) {
		return STATUS_PENDING;
	}

	return status;
}
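/*
 * Grab a global lock, waiting up to "timeout". While we are pending we
 * wait for MSG_DBWRAP_G_LOCK_RETRY wakeups (and the ctdbd fd when
 * clustering) and then call g_lock_trylock again.
 */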
NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
		     enum g_lock_type lock_type, struct timeval timeout)
{
	struct tevent_timer *te = NULL;
	NTSTATUS status;
	bool retry = false;
	struct timeval timeout_end;
	struct timeval time_now;

	DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
		   name));

	if (lock_type & ~1) {
		DEBUG(1, ("Got invalid lock type %d for %s\n",
			  (int)lock_type, name));
		return NT_STATUS_INVALID_PARAMETER;
	}

#ifdef CLUSTER_SUPPORT
	if (lp_clustering()) {
		status = ctdb_watch_us(messaging_ctdbd_connection());
		if (!NT_STATUS_IS_OK(status)) {
			DEBUG(10, ("could not register retry with ctdb: %s\n",
				   nt_errstr(status)));
			goto done;
		}
	}
#endif

	status = messaging_register(ctx->msg, &retry, MSG_DBWRAP_G_LOCK_RETRY,
				    g_lock_got_retry);
	if (!NT_STATUS_IS_OK(status)) {
		DEBUG(10, ("messaging_register failed: %s\n",
			   nt_errstr(status)));
		return status;
	}

	time_now = timeval_current();
	timeout_end = timeval_sum(&time_now, &timeout);

	while (true) {
		struct pollfd *pollfds;
		int num_pollfds;
		int saved_errno;
		int ret;
		struct timeval timeout_remaining, select_timeout;

		status = g_lock_trylock(ctx, name, lock_type);
		if (NT_STATUS_IS_OK(status)) {
			DEBUG(10, ("Got lock %s\n", name));
			break;
		}
		if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
			DEBUG(10, ("g_lock_trylock failed: %s\n",
				   nt_errstr(status)));
			break;
		}

		DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));

		/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		 * !!! HACK ALERT --- FIX ME !!!
		 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		 * What we really want to do here is to react to
		 * MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
		 * by a client doing g_lock_unlock or by ourselves when
		 * we receive a CTDB_SRVID_SAMBA_NOTIFY or
		 * CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
		 * either a client holding a lock or a complete node
		 * has died.
		 *
		 * Doing this properly involves calling tevent_loop_once(),
		 * but doing this here with the main ctdbd messaging context
		 * creates a nested event loop when g_lock_lock() is called
		 * from the main event loop, e.g. in a tcon_and_X where the
		 * share_info.tdb needs to be initialized and is locked by
		 * another process, or when the remote registry is accessed
		 * for writing and some other process already holds a lock
		 * on the registry.tdb.
		 *
		 * So as a quick fix, we act a little coarsely here: we do
		 * a select on the ctdb connection fd and when it is readable
		 * or we get EINTR, then we retry without actually parsing
		 * any ctdb packets or dispatching messages. This means that
		 * we retry more often than intended by design, but this does
		 * not harm and it is unobtrusive. When we have finished,
		 * the main loop will pick up all the messages and ctdb
		 * packets. The only extra twist is that we cannot use timed
		 * events here but have to handcode a timeout.
		 */

		/*
		 * We allocate 2 entries here. One is needed anyway for
		 * sys_poll and in the clustering case we might have to add
		 * the ctdb fd. This avoids the realloc then.
		 */
		pollfds = talloc_array(talloc_tos(), struct pollfd, 2);
		if (pollfds == NULL) {
			status = NT_STATUS_NO_MEMORY;
			break;
		}
		num_pollfds = 0;

#ifdef CLUSTER_SUPPORT
		if (lp_clustering()) {
			struct ctdbd_connection *conn;
			conn = messaging_ctdbd_connection();

			pollfds[0].fd = ctdbd_conn_get_fd(conn);
			pollfds[0].events = POLLIN|POLLHUP;

			num_pollfds += 1;
		}
#endif

		time_now = timeval_current();
		timeout_remaining = timeval_until(&time_now, &timeout_end);
		select_timeout = timeval_set(60, 0);

		select_timeout = timeval_min(&select_timeout,
					     &timeout_remaining);

		ret = sys_poll(pollfds, num_pollfds,
			       timeval_to_msec(select_timeout));

		/*
		 * We're not really interested in the actual flags. We just
		 * need to retry this whole thing.
		 */
		saved_errno = errno;
		TALLOC_FREE(pollfds);
		errno = saved_errno;

		if (ret == -1) {
			if (errno != EINTR) {
				DEBUG(1, ("error calling select: %s\n",
					  strerror(errno)));
				status = NT_STATUS_INTERNAL_ERROR;
				break;
			}
			/*
			 * errno == EINTR:
			 * This means a signal was received.
			 * It might have been a MSG_DBWRAP_G_LOCK_RETRY message.
			 * Retry.
			 */
		} else if (ret == 0) {
			if (timeval_expired(&timeout_end)) {
				DEBUG(10, ("g_lock_lock timed out\n"));
				status = NT_STATUS_LOCK_NOT_GRANTED;
				break;
			} else {
				DEBUG(10, ("select returned 0 but timeout "
					   "not expired, retrying\n"));
			}
		} else if (ret != 1) {
			DEBUG(1, ("invalid return code of select: %d\n", ret));
			status = NT_STATUS_INTERNAL_ERROR;
			break;
		}
		/*
		 * ret == 1:
		 * This means ctdbd has sent us some data.
		 * Might be a CTDB_SRVID_RECONFIGURE or a
		 * CTDB_SRVID_SAMBA_NOTIFY message.
		 * Retry.
		 */
	}

#ifdef CLUSTER_SUPPORT
done:
#endif

	if (!NT_STATUS_IS_OK(status)) {
		NTSTATUS unlock_status;

		unlock_status = g_lock_unlock(ctx, name);

		if (!NT_STATUS_IS_OK(unlock_status)) {
			DEBUG(1, ("Could not remove ourself from the locking "
				  "db: %s\n", nt_errstr(status)));
		}
	}

	messaging_deregister(ctx->msg, MSG_DBWRAP_G_LOCK_RETRY, &retry);
	TALLOC_FREE(te);

	return status;
}
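/*
 * Messaging handler for MSG_DBWRAP_G_LOCK_RETRY: just flag that a retry
 * was requested.
 */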
static void g_lock_got_retry(struct messaging_context *msg,
			     void *private_data,
			     uint32_t msg_type,
			     struct server_id server_id,
			     DATA_BLOB *data)
{
	bool *pretry = (bool *)private_data;

	DEBUG(10, ("Got retry message from pid %s\n",
		   server_id_str(talloc_tos(), &server_id)));

	*pretry = true;
}
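/*
 * Remove "pid" from the lock record of "name". If it was the lock holder,
 * wake up a few of the pending waiters via MSG_DBWRAP_G_LOCK_RETRY.
 */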
static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
				    struct server_id pid)
{
	struct db_record *rec = NULL;
	struct g_lock_rec *locks = NULL;
	int i, num_locks;
	enum g_lock_type lock_type;
	NTSTATUS status;

	rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
				    string_term_tdb_data(name));
	if (rec == NULL) {
		DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
		status = NT_STATUS_INTERNAL_ERROR;
		goto done;
	}

	if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
		DEBUG(10, ("g_lock_parse for %s failed\n", name));
		status = NT_STATUS_INTERNAL_ERROR;
		goto done;
	}

	for (i=0; i<num_locks; i++) {
		if (procid_equal(&pid, &locks[i].pid)) {
			break;
		}
	}

	if (i == num_locks) {
		DEBUG(10, ("g_lock_force_unlock: Lock not found\n"));
		status = NT_STATUS_INTERNAL_ERROR;
		goto done;
	}

	lock_type = locks[i].lock_type;

	if (i < (num_locks-1)) {
		locks[i] = locks[num_locks-1];
	}
	num_locks -= 1;

	if (num_locks == 0) {
		status = rec->delete_rec(rec);
	} else {
		TDB_DATA data;
		data = make_tdb_data((uint8_t *)locks,
				     sizeof(struct g_lock_rec) * num_locks);
		status = rec->store(rec, data, 0);
	}

	if (!NT_STATUS_IS_OK(status)) {
		DEBUG(1, ("g_lock_force_unlock: Could not store record: %s\n",
			  nt_errstr(status)));
		goto done;
	}

	TALLOC_FREE(rec);

	if ((lock_type & G_LOCK_PENDING) == 0) {
		int num_wakeups = 0;

		/*
		 * We've been the lock holder. Tell the waiters to retry.
		 * Don't tell all of them, to avoid a thundering herd. In
		 * case this leads to a complete stall because we miss some
		 * processes, the loop in g_lock_lock tries at least
		 * once a minute.
		 */

		for (i=0; i<num_locks; i++) {
			if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
				continue;
			}
			if (!process_exists(locks[i].pid)) {
				continue;
			}

			/*
			 * Ping all waiters to retry
			 */
			status = messaging_send(ctx->msg, locks[i].pid,
						MSG_DBWRAP_G_LOCK_RETRY,
						&data_blob_null);
			if (!NT_STATUS_IS_OK(status)) {
				DEBUG(1, ("sending retry to %s failed: %s\n",
					  server_id_str(talloc_tos(),
							&locks[i].pid),
					  nt_errstr(status)));
			} else {
				num_wakeups += 1;
			}
			if (num_wakeups > 5) {
				break;
			}
		}
	}
done:
	/*
	 * For the error path, TALLOC_FREE(rec) as well. In the good
	 * path we have already freed it.
	 */
	TALLOC_FREE(rec);

	TALLOC_FREE(locks);
	return status;
}
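/*
 * Give up a lock we hold ourselves.
 */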
NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, const char *name)
{
	NTSTATUS status;

	status = g_lock_force_unlock(ctx, name, messaging_server_id(ctx->msg));

#ifdef CLUSTER_SUPPORT
	if (lp_clustering()) {
		ctdb_unwatch(messaging_ctdbd_connection());
	}
#endif
	return status;
}
struct g_lock_locks_state {
	int (*fn)(const char *name, void *private_data);
	void *private_data;
};
static int g_lock_locks_fn(struct db_record *rec, void *priv)
{
	struct g_lock_locks_state *state = (struct g_lock_locks_state *)priv;

	if ((rec->key.dsize == 0) || (rec->key.dptr[rec->key.dsize-1] != 0)) {
		DEBUG(1, ("invalid key in g_lock.tdb, ignoring\n"));
		return 0;
	}
	return state->fn((char *)rec->key.dptr, state->private_data);
}
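/*
 * Walk all lock names in g_lock.tdb, calling "fn" for each of them.
 */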
int g_lock_locks(struct g_lock_ctx *ctx,
		 int (*fn)(const char *name, void *private_data),
		 void *private_data)
{
	struct g_lock_locks_state state;

	state.fn = fn;
	state.private_data = private_data;

	return ctx->db->traverse_read(ctx->db, g_lock_locks_fn, &state);
}
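/*
 * Dump the lock entries stored under "name", calling "fn" for each of
 * them until it returns non-zero.
 */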
NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, const char *name,
		     int (*fn)(struct server_id pid,
			       enum g_lock_type lock_type,
			       void *private_data),
		     void *private_data)
{
	TDB_DATA data;
	int i, num_locks;
	struct g_lock_rec *locks = NULL;
	bool ret;

	if (ctx->db->fetch(ctx->db, talloc_tos(), string_term_tdb_data(name),
			   &data) != 0) {
		return NT_STATUS_NOT_FOUND;
	}

	if ((data.dsize == 0) || (data.dptr == NULL)) {
		return NT_STATUS_OK;
	}

	ret = g_lock_parse(talloc_tos(), data, &num_locks, &locks);

	TALLOC_FREE(data.dptr);

	if (!ret) {
		DEBUG(10, ("g_lock_parse for %s failed\n", name));
		return NT_STATUS_INTERNAL_ERROR;
	}

	for (i=0; i<num_locks; i++) {
		if (fn(locks[i].pid, locks[i].lock_type, private_data) != 0) {
			break;
		}
	}
	TALLOC_FREE(locks);
	return NT_STATUS_OK;
}
struct g_lock_get_state {
	bool found;
	struct server_id *pid;
};
static int g_lock_get_fn(struct server_id pid, enum g_lock_type lock_type,
			 void *priv)
{
	struct g_lock_get_state *state = (struct g_lock_get_state *)priv;

	if ((lock_type & G_LOCK_PENDING) != 0) {
		return 0;
	}

	state->found = true;
	*state->pid = pid;
	return 1;
}
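/*
 * Find the current (non-pending) holder of the lock "name".
 */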
NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
		    struct server_id *pid)
{
	struct g_lock_get_state state;
	NTSTATUS status;

	state.found = false;
	state.pid = pid;

	status = g_lock_dump(ctx, name, g_lock_get_fn, &state);
	if (!NT_STATUS_IS_OK(status)) {
		return status;
	}
	if (!state.found) {
		return NT_STATUS_NOT_FOUND;
	}
	return NT_STATUS_OK;
}
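/*
 * Helper for g_lock_do: set up tevent, messaging and g_lock contexts.
 */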
static bool g_lock_init_all(TALLOC_CTX *mem_ctx,
			    struct tevent_context **pev,
			    struct messaging_context **pmsg,
			    const struct server_id self,
			    struct g_lock_ctx **pg_ctx)
{
	struct tevent_context *ev = NULL;
	struct messaging_context *msg = NULL;
	struct g_lock_ctx *g_ctx = NULL;

	ev = tevent_context_init(mem_ctx);
	if (ev == NULL) {
		d_fprintf(stderr, "ERROR: could not init event context\n");
		goto fail;
	}
	msg = messaging_init(mem_ctx, self, ev);
	if (msg == NULL) {
		d_fprintf(stderr, "ERROR: could not init messaging context\n");
		goto fail;
	}
	g_ctx = g_lock_ctx_init(mem_ctx, msg);
	if (g_ctx == NULL) {
		d_fprintf(stderr, "ERROR: could not init g_lock context\n");
		goto fail;
	}

	*pev = ev;
	*pmsg = msg;
	*pg_ctx = g_ctx;

	return true;
fail:
	TALLOC_FREE(g_ctx);
	TALLOC_FREE(msg);
	TALLOC_FREE(ev);
	return false;
}
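/*
 * Convenience function: take the lock, run "fn", and unlock again, using
 * freshly created contexts.
 */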
NTSTATUS g_lock_do(const char *name, enum g_lock_type lock_type,
		   struct timeval timeout, const struct server_id self,
		   void (*fn)(void *private_data), void *private_data)
{
	struct tevent_context *ev = NULL;
	struct messaging_context *msg = NULL;
	struct g_lock_ctx *g_ctx = NULL;
	NTSTATUS status;

	if (!g_lock_init_all(talloc_tos(), &ev, &msg, self, &g_ctx)) {
		status = NT_STATUS_ACCESS_DENIED;
		goto done;
	}

	status = g_lock_lock(g_ctx, name, lock_type, timeout);
	if (!NT_STATUS_IS_OK(status)) {
		goto done;
	}
	fn(private_data);
	g_lock_unlock(g_ctx, name);

done:
	TALLOC_FREE(g_ctx);
	TALLOC_FREE(msg);
	TALLOC_FREE(ev);
	return status;
}