r23784: use the GPLv3 boilerplate as recommended by the FSF and the license text
[tprouty/samba.git] / source / locking / brlock.c
index e6f0dd1c729baa9dfd546acc4fcac9e8ec78f1fd..228a30b7af42d45d4614f38d902a2d0ce864b7a5 100644 (file)
@@ -8,7 +8,7 @@
    
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
+   the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.
    
    This program is distributed in the hope that it will be useful,
@@ -17,8 +17,7 @@
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 /* This module implements a tdb based byte range locking service,
 
 #define ZERO_ZERO 0
 
-/* This contains elements that differentiate locks. The smbpid is a
-   client supplied pid, and is essentially the locking context for
-   this client */
-
-struct lock_context {
-       uint16 smbpid;
-       uint16 tid;
-       struct process_id pid;
-};
-
-/* The data in brlock records is an unsorted linear array of these
-   records.  It is unnecessary to store the count as tdb provides the
-   size of the record */
-
-struct lock_struct {
-       struct lock_context context;
-       br_off start;
-       br_off size;
-       int fnum;
-       enum brl_type lock_type;
-       enum brl_flavour lock_flav;
-};
-
 /* The open brlock.tdb database. */
 
-static TDB_CONTEXT *tdb;
+static struct db_context *brlock_db;
 
 /****************************************************************************
  Debug info at level 10 for lock struct.
@@ -83,7 +59,7 @@ static void print_lock_struct(unsigned int i, struct lock_struct *pls)
  See if two locking contexts are equal.
 ****************************************************************************/
 
-static BOOL brl_same_context(const struct lock_context *ctx1, 
+BOOL brl_same_context(const struct lock_context *ctx1, 
                             const struct lock_context *ctx2)
 {
        return (procid_equal(&ctx1->pid, &ctx2->pid) &&
@@ -121,7 +97,7 @@ static BOOL brl_conflict(const struct lock_struct *lck1,
                         const struct lock_struct *lck2)
 {
        /* Ignore PENDING locks. */
-       if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK )
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
                return False;
 
        /* Read locks never conflict. */
@@ -152,7 +128,7 @@ static BOOL brl_conflict_posix(const struct lock_struct *lck1,
 #endif
 
        /* Ignore PENDING locks. */
-       if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK )
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
                return False;
 
        /* Read locks never conflict. */
@@ -174,7 +150,7 @@ static BOOL brl_conflict_posix(const struct lock_struct *lck1,
 static BOOL brl_conflict1(const struct lock_struct *lck1, 
                         const struct lock_struct *lck2)
 {
-       if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK )
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
                return False;
 
        if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) {
@@ -207,7 +183,7 @@ static BOOL brl_conflict1(const struct lock_struct *lck1,
 
 static BOOL brl_conflict_other(const struct lock_struct *lck1, const struct lock_struct *lck2)
 {
-       if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK )
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
                return False;
 
        if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) 
@@ -234,30 +210,47 @@ static BOOL brl_conflict_other(const struct lock_struct *lck1, const struct lock
 } 
 
 /****************************************************************************
- Amazingly enough, w2k3 "remembers" whether the last lock failure
+ Check if an unlock overlaps a pending lock.
+****************************************************************************/
+
+static BOOL brl_pending_overlap(const struct lock_struct *lock, const struct lock_struct *pend_lock)
+{
+       if ((lock->start <= pend_lock->start) && (lock->start + lock->size > pend_lock->start))
+               return True;
+       if ((lock->start >= pend_lock->start) && (lock->start <= pend_lock->start + pend_lock->size))
+               return True;
+       return False;
+}
+
+/****************************************************************************
+ Amazingly enough, w2k3 "remembers" whether the last lock failure on a fnum
  is the same as this one and changes its error code. I wonder if any
  app depends on this ?
 ****************************************************************************/
 
-static NTSTATUS brl_lock_failed(const struct lock_struct *lock)
+static NTSTATUS brl_lock_failed(files_struct *fsp, const struct lock_struct *lock, BOOL blocking_lock)
 {
-       static struct lock_struct last_lock_failure;
-
-       if (brl_same_context(&lock->context, &last_lock_failure.context) &&
-                       lock->fnum == last_lock_failure.fnum &&
-                       lock->start == last_lock_failure.start &&
-                       lock->size == last_lock_failure.size) {
-               return NT_STATUS_FILE_LOCK_CONFLICT;
-       }
-       last_lock_failure = *lock;
-       if (lock->start >= 0xEF000000 &&
-                       (lock->start >> 63) == 0) {
+       if (lock->start >= 0xEF000000 && (lock->start >> 63) == 0) {
                /* amazing the little things you learn with a test
                   suite. Locks beyond this offset (as a 64 bit
                   number!) always generate the conflict error code,
                   unless the top bit is set */
+               if (!blocking_lock) {
+                       fsp->last_lock_failure = *lock;
+               }
+               return NT_STATUS_FILE_LOCK_CONFLICT;
+       }
+
+       if (procid_equal(&lock->context.pid, &fsp->last_lock_failure.context.pid) &&
+                       lock->context.tid == fsp->last_lock_failure.context.tid &&
+                       lock->fnum == fsp->last_lock_failure.fnum &&
+                       lock->start == fsp->last_lock_failure.start) {
                return NT_STATUS_FILE_LOCK_CONFLICT;
        }
+
+       if (!blocking_lock) {
+               fsp->last_lock_failure = *lock;
+       }
        return NT_STATUS_LOCK_NOT_GRANTED;
 }
 
@@ -267,14 +260,16 @@ static NTSTATUS brl_lock_failed(const struct lock_struct *lock)
 
 void brl_init(int read_only)
 {
-       if (tdb) {
+       if (brlock_db) {
                return;
        }
-       tdb = tdb_open_log(lock_path("brlock.tdb"),
-                       lp_open_files_db_hash_size(),
-                       TDB_DEFAULT|(read_only?0x0:TDB_CLEAR_IF_FIRST),
-                       read_only?O_RDONLY:(O_RDWR|O_CREAT), 0644 );
-       if (!tdb) {
+       brlock_db = db_open(NULL, lock_path("brlock.tdb"),
+                           lp_open_files_db_hash_size(),
+                           TDB_DEFAULT
+                           |TDB_VOLATILE
+                           |(read_only?0x0:TDB_CLEAR_IF_FIRST),
+                           read_only?O_RDONLY:(O_RDWR|O_CREAT), 0644 );
+       if (!brlock_db) {
                DEBUG(0,("Failed to open byte range locking database %s\n",
                        lock_path("brlock.tdb")));
                return;
@@ -287,10 +282,10 @@ void brl_init(int read_only)
 
 void brl_shutdown(int read_only)
 {
-       if (!tdb) {
+       if (!brlock_db) {
                return;
        }
-       tdb_close(tdb);
+       TALLOC_FREE(brlock_db);
 }
 
 #if ZERO_ZERO
@@ -316,22 +311,18 @@ static int lock_compare(const struct lock_struct *lck1,
 ****************************************************************************/
 
 static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck,
-                       const struct lock_struct *plock,
-                       BOOL *my_lock_ctx)
+                       struct lock_struct *plock, BOOL blocking_lock)
 {
        unsigned int i;
        files_struct *fsp = br_lck->fsp;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       struct lock_struct *locks = br_lck->lock_data;
 
        for (i=0; i < br_lck->num_locks; i++) {
                /* Do any Windows or POSIX locks conflict ? */
                if (brl_conflict(&locks[i], plock)) {
-                       NTSTATUS status = brl_lock_failed(plock);;
-                       /* Did we block ourselves ? */
-                       if (brl_same_context(&locks[i].context, &plock->context)) {
-                               *my_lock_ctx = True;
-                       }
-                       return status;
+                       /* Remember who blocked us. */
+                       plock->context.smbpid = locks[i].context.smbpid;
+                       return brl_lock_failed(fsp,plock,blocking_lock);
                }
 #if ZERO_ZERO
                if (plock->start == 0 && plock->size == 0 && 
@@ -343,13 +334,23 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck,
 
        /* We can get the Windows lock, now see if it needs to
           be mapped into a lower level POSIX one, and if so can
-          we get it ? We tell the lower lock layer about the
-          lock type so it can cope with the difference between
-          Windows "stacking" locks and POSIX "flat" ones. */
-
-       if ((plock->lock_type != PENDING_LOCK) && lp_posix_locking(SNUM(fsp->conn))) {
-               if (!set_posix_lock(fsp, plock->start, plock->size, plock->lock_type, WINDOWS_LOCK)) {
-                       if (errno == EACCES || errno == EAGAIN) {
+          we get it ? */
+
+       if (!IS_PENDING_LOCK(plock->lock_type) && lp_posix_locking(fsp->conn->params)) {
+               int errno_ret;
+               if (!set_posix_lock_windows_flavour(fsp,
+                               plock->start,
+                               plock->size,
+                               plock->lock_type,
+                               &plock->context,
+                               locks,
+                               br_lck->num_locks,
+                               &errno_ret)) {
+
+                       /* We don't know who blocked us. */
+                       plock->context.smbpid = 0xFFFFFFFF;
+
+                       if (errno_ret == EACCES || errno_ret == EAGAIN) {
                                return NT_STATUS_FILE_LOCK_CONFLICT;
                        } else {
                                return map_nt_error_from_unix(errno);
@@ -365,7 +366,7 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck,
 
        memcpy(&locks[br_lck->num_locks], plock, sizeof(struct lock_struct));
        br_lck->num_locks += 1;
-       br_lck->lock_data = (void *)locks;
+       br_lck->lock_data = locks;
        br_lck->modified = True;
 
        return NT_STATUS_OK;
@@ -375,9 +376,9 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck,
  Cope with POSIX range splits and merges.
 ****************************************************************************/
 
-static unsigned int brlock_posix_split_merge(struct lock_struct *lck_arr,
-                                               const struct lock_struct *ex,
-                                               const struct lock_struct *plock,
+static unsigned int brlock_posix_split_merge(struct lock_struct *lck_arr,              /* Output array. */
+                                               const struct lock_struct *ex,           /* existing lock. */
+                                               const struct lock_struct *plock,        /* proposed lock. */
                                                BOOL *lock_was_added)
 {
        BOOL lock_types_differ = (ex->lock_type != plock->lock_type);
@@ -398,26 +399,26 @@ static unsigned int brlock_posix_split_merge(struct lock_struct *lck_arr,
                                              +---------+
                                              | ex      |
                                              +---------+
-                                +-------+
-                                | plock |
-                                +-------+
+                              +-------+
+                              | plock |
+                              +-------+
 OR....
              +---------+
              |  ex     |
              +---------+
 **********************************************/
 
-       if ( (ex->start >= (plock->start + plock->size)) ||
-                       (plock->start >= (ex->start + ex->size))) {
+       if ( (ex->start > (plock->start + plock->size)) ||
+                       (plock->start > (ex->start + ex->size))) {
                /* No overlap with this lock - copy existing. */
                memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
                return 1;
        }
 
 /*********************************************
-                +---------+
-                |  ex     |
-                +---------+
+        +---------------------------+
+        |          ex               |
+        +---------------------------+
         +---------------------------+
         |       plock               | -> replace with plock.
         +---------------------------+
@@ -431,24 +432,32 @@ OR....
        }
 
 /*********************************************
-                +---------------+
-                |  ex           |
-                +---------------+
+        +-----------------------+
+        |          ex           |
+        +-----------------------+
         +---------------+
         |   plock       |
         +---------------+
+OR....
+                        +-------+
+                        |  ex   |
+                        +-------+
+        +---------------+
+        |   plock       |
+        +---------------+
+
 BECOMES....
         +---------------+-------+
         |   plock       | ex    | - different lock types.
         +---------------+-------+
-OR....
+OR.... (merge)
         +-----------------------+
         |   ex                  | - same lock type.
         +-----------------------+
 **********************************************/
 
        if ( (ex->start >= plock->start) &&
-                               (ex->start < plock->start + plock->size) &&
+                               (ex->start <= plock->start + plock->size) &&
                                (ex->start + ex->size > plock->start + plock->size) ) {
 
                *lock_was_added = True;
@@ -475,9 +484,16 @@ OR....
        }
 
 /*********************************************
-   +---------------+
-   |  ex           |
-   +---------------+
+   +-----------------------+
+   |  ex                   |
+   +-----------------------+
+           +---------------+
+           |   plock       |
+           +---------------+
+OR....
+   +-------+        
+   |  ex   |
+   +-------+
            +---------------+
            |   plock       |
            +---------------+
@@ -486,7 +502,7 @@ BECOMES....
    | ex    |   plock       | - different lock types
    +-------+---------------+
 
-OR
+OR.... (merge)
    +-----------------------+
    | ex                    | - same lock type.
    +-----------------------+
@@ -494,7 +510,7 @@ OR
 **********************************************/
 
        if ( (ex->start < plock->start) &&
-                       (ex->start + ex->size > plock->start) &&
+                       (ex->start + ex->size >= plock->start) &&
                        (ex->start + ex->size <= plock->start + plock->size) ) {
 
                *lock_was_added = True;
@@ -560,9 +576,11 @@ OR
        }
 
        /* Never get here. */
-       smb_panic("brlock_posix_split_merge\n");
+       smb_panic("brlock_posix_split_merge");
        /* Notreached. */
-       abort();
+
+       /* Keep some compilers happy. */
+       return 0;
 }
 
 /****************************************************************************
@@ -570,14 +588,15 @@ OR
  We must cope with range splits and merges.
 ****************************************************************************/
 
-static NTSTATUS brl_lock_posix(struct byte_range_lock *br_lck,
-                       const struct lock_struct *plock,
-                       BOOL *my_lock_ctx)
+static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx,
+                              struct byte_range_lock *br_lck,
+                              struct lock_struct *plock)
 {
        unsigned int i, count;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       struct lock_struct *locks = br_lck->lock_data;
        struct lock_struct *tp;
        BOOL lock_was_added = False;
+       BOOL signal_pending_read = False;
 
        /* No zero-zero locks for POSIX. */
        if (plock->start == 0 && plock->size == 0) {
@@ -601,48 +620,69 @@ static NTSTATUS brl_lock_posix(struct byte_range_lock *br_lck,
        
        count = 0;
        for (i=0; i < br_lck->num_locks; i++) {
-               if (locks[i].lock_flav == WINDOWS_LOCK) {
+               struct lock_struct *curr_lock = &locks[i];
+
+               /* If we have a pending read lock, a lock downgrade should
+                  trigger a lock re-evaluation. */
+               if (curr_lock->lock_type == PENDING_READ_LOCK &&
+                               brl_pending_overlap(plock, curr_lock)) {
+                       signal_pending_read = True;
+               }
+
+               if (curr_lock->lock_flav == WINDOWS_LOCK) {
                        /* Do any Windows flavour locks conflict ? */
-                       if (brl_conflict(&locks[i], plock)) {
-                               /* Did we block ourselves ? */
-                               if (brl_same_context(&locks[i].context, &plock->context)) {
-                                       *my_lock_ctx = True;
-                               }
+                       if (brl_conflict(curr_lock, plock)) {
                                /* No games with error messages. */
                                SAFE_FREE(tp);
+                               /* Remember who blocked us. */
+                               plock->context.smbpid = curr_lock->context.smbpid;
                                return NT_STATUS_FILE_LOCK_CONFLICT;
                        }
                        /* Just copy the Windows lock into the new array. */
-                       memcpy(&tp[count], &locks[i], sizeof(struct lock_struct));
+                       memcpy(&tp[count], curr_lock, sizeof(struct lock_struct));
                        count++;
                } else {
                        /* POSIX conflict semantics are different. */
-                       if (brl_conflict_posix(&locks[i], plock)) {
+                       if (brl_conflict_posix(curr_lock, plock)) {
                                /* Can't block ourselves with POSIX locks. */
                                /* No games with error messages. */
                                SAFE_FREE(tp);
+                               /* Remember who blocked us. */
+                               plock->context.smbpid = curr_lock->context.smbpid;
                                return NT_STATUS_FILE_LOCK_CONFLICT;
                        }
 
                        /* Work out overlaps. */
-                       count += brlock_posix_split_merge(&tp[count], &locks[i], plock, &lock_was_added);
+                       count += brlock_posix_split_merge(&tp[count], curr_lock, plock, &lock_was_added);
                }
        }
 
+       if (!lock_was_added) {
+               memcpy(&tp[count], plock, sizeof(struct lock_struct));
+               count++;
+       }
+
        /* We can get the POSIX lock, now see if it needs to
           be mapped into a lower level POSIX one, and if so can
-          we get it ? We well the lower lock layer about the
-          lock type so it can cope with the difference between
-          Windows "stacking" locks and POSIX "flat" ones. */
+          we get it ? */
+
+       if (!IS_PENDING_LOCK(plock->lock_type) && lp_posix_locking(br_lck->fsp->conn->params)) {
+               int errno_ret;
 
-#if 0
-       /* FIXME - this call doesn't work correctly yet for POSIX locks... */
+               /* The lower layer just needs to attempt to
+                  get the system POSIX lock. We've weeded out
+                  any conflicts above. */
 
-       if ((plock->lock_type != PENDING_LOCK) && lp_posix_locking(SNUM(fsp->conn))) {
-               files_struct *fsp = br_lck->fsp;
+               if (!set_posix_lock_posix_flavour(br_lck->fsp,
+                               plock->start,
+                               plock->size,
+                               plock->lock_type,
+                               &errno_ret)) {
 
-               if (!set_posix_lock(fsp, plock->start, plock->size, plock->lock_type, POSIX_LOCK)) {
-                       if (errno == EACCES || errno == EAGAIN) {
+                       /* We don't know who blocked us. */
+                       plock->context.smbpid = 0xFFFFFFFF;
+
+                       if (errno_ret == EACCES || errno_ret == EAGAIN) {
                                SAFE_FREE(tp);
                                return NT_STATUS_FILE_LOCK_CONFLICT;
                        } else {
@@ -651,12 +691,6 @@ static NTSTATUS brl_lock_posix(struct byte_range_lock *br_lck,
                        }
                }
        }
-#endif
-
-       if (!lock_was_added) {
-               memcpy(&tp[count], plock, sizeof(struct lock_struct));
-               count++;
-       }
 
        /* Realloc so we don't leak entries per lock call. */
        tp = (struct lock_struct *)SMB_REALLOC(tp, count * sizeof(*locks));
@@ -664,8 +698,35 @@ static NTSTATUS brl_lock_posix(struct byte_range_lock *br_lck,
                return NT_STATUS_NO_MEMORY;
        }
        br_lck->num_locks = count;
-       br_lck->lock_data = (void *)tp;
+       SAFE_FREE(br_lck->lock_data);
+       br_lck->lock_data = tp;
+       locks = tp;
        br_lck->modified = True;
+
+       /* A successful downgrade from write to read lock can trigger a lock
+          re-evaluation where waiting readers can now proceed. */
+
+       if (signal_pending_read) {
+               /* Send unlock messages to any pending read waiters that overlap. */
+               for (i=0; i < br_lck->num_locks; i++) {
+                       struct lock_struct *pend_lock = &locks[i];
+
+                       /* Ignore non-pending locks. */
+                       if (!IS_PENDING_LOCK(pend_lock->lock_type)) {
+                               continue;
+                       }
+
+                       if (pend_lock->lock_type == PENDING_READ_LOCK &&
+                                       brl_pending_overlap(plock, pend_lock)) {
+                               DEBUG(10,("brl_lock_posix: sending unlock message to pid %s\n",
+                                       procid_str_static(&pend_lock->context.pid )));
+
+                               messaging_send(msg_ctx, pend_lock->context.pid,
+                                              MSG_SMB_UNLOCK, &data_blob_null);
+                       }
+               }
+       }
+
        return NT_STATUS_OK;
 }
 
@@ -673,20 +734,20 @@ static NTSTATUS brl_lock_posix(struct byte_range_lock *br_lck,
  Lock a range of bytes.
 ****************************************************************************/
 
-NTSTATUS brl_lock(struct byte_range_lock *br_lck,
-               uint16 smbpid,
-               struct process_id pid,
+NTSTATUS brl_lock(struct messaging_context *msg_ctx,
+               struct byte_range_lock *br_lck,
+               uint32 smbpid,
+               struct server_id pid,
                br_off start,
                br_off size, 
                enum brl_type lock_type,
                enum brl_flavour lock_flav,
-               BOOL *my_lock_ctx)
+               BOOL blocking_lock,
+               uint32 *psmbpid)
 {
        NTSTATUS ret;
        struct lock_struct lock;
 
-       *my_lock_ctx = False;
-
 #if !ZERO_ZERO
        if (start == 0 && size == 0) {
                DEBUG(0,("client sent 0/0 lock - please report this\n"));
@@ -703,9 +764,9 @@ NTSTATUS brl_lock(struct byte_range_lock *br_lck,
        lock.lock_flav = lock_flav;
 
        if (lock_flav == WINDOWS_LOCK) {
-               ret = brl_lock_windows(br_lck, &lock, my_lock_ctx);
+               ret = brl_lock_windows(br_lck, &lock, blocking_lock);
        } else {
-               ret = brl_lock_posix(br_lck, &lock, my_lock_ctx);
+               ret = brl_lock_posix(msg_ctx, br_lck, &lock);
        }
 
 #if ZERO_ZERO
@@ -713,35 +774,31 @@ NTSTATUS brl_lock(struct byte_range_lock *br_lck,
        qsort(br_lck->lock_data, (size_t)br_lck->num_locks, sizeof(lock), lock_compare);
 #endif
 
+       /* If we're returning an error, return who blocked us. */
+       if (!NT_STATUS_IS_OK(ret) && psmbpid) {
+               *psmbpid = lock.context.smbpid;
+       }
        return ret;
 }
 
-/****************************************************************************
- Check if an unlock overlaps a pending lock.
-****************************************************************************/
-
-static BOOL brl_pending_overlap(struct lock_struct *lock, struct lock_struct *pend_lock)
-{
-       if ((lock->start <= pend_lock->start) && (lock->start + lock->size > pend_lock->start))
-               return True;
-       if ((lock->start >= pend_lock->start) && (lock->start <= pend_lock->start + pend_lock->size))
-               return True;
-       return False;
-}
-
 /****************************************************************************
  Unlock a range of bytes - Windows semantics.
 ****************************************************************************/
 
-static BOOL brl_unlock_windows(struct byte_range_lock *br_lck, const struct lock_struct *plock)
+static BOOL brl_unlock_windows(struct messaging_context *msg_ctx,
+                              struct byte_range_lock *br_lck,
+                              const struct lock_struct *plock)
 {
        unsigned int i, j;
-       struct lock_struct *lock = NULL;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       struct lock_struct *locks = br_lck->lock_data;
+       enum brl_type deleted_lock_type = READ_LOCK; /* shut the compiler up.... */
 
 #if ZERO_ZERO
+       /* Delete write locks by preference... The lock list
+          is sorted in the zero zero case. */
+
        for (i = 0; i < br_lck->num_locks; i++) {
-               lock = &locks[i];
+               struct lock_struct *lock = &locks[i];
 
                if (lock->lock_type == WRITE_LOCK &&
                    brl_same_context(&lock->context, &plock->context) &&
@@ -751,20 +808,19 @@ static BOOL brl_unlock_windows(struct byte_range_lock *br_lck, const struct lock
                    lock->size == plock->size) {
 
                        /* found it - delete it */
-                       if (i < br_lck->num_locks - 1) {
-                               memmove(&locks[i], &locks[i+1], 
-                                       sizeof(*locks)*((br_lck->num_locks-1) - i));
-                       }
-
-                       br_lck->num_locks -= 1;
-                       br_lck->modified = True;
-                       return True;
+                       deleted_lock_type = lock->lock_type;
+                       break;
                }
        }
+
+       if (i != br_lck->num_locks) {
+               /* We found it - don't search again. */
+               goto unlock_continue;
+       }
 #endif
 
        for (i = 0; i < br_lck->num_locks; i++) {
-               lock = &locks[i];
+               struct lock_struct *lock = &locks[i];
 
                /* Only remove our own locks that match in start, size, and flavour. */
                if (brl_same_context(&lock->context, &plock->context) &&
@@ -772,6 +828,7 @@ static BOOL brl_unlock_windows(struct byte_range_lock *br_lck, const struct lock
                                        lock->lock_flav == WINDOWS_LOCK &&
                                        lock->start == plock->start &&
                                        lock->size == plock->size ) {
+                       deleted_lock_type = lock->lock_type;
                        break;
                }
        }
@@ -781,9 +838,28 @@ static BOOL brl_unlock_windows(struct byte_range_lock *br_lck, const struct lock
                return False;
        }
 
-       /* Unlock any POSIX regions. */
-       if(lp_posix_locking(br_lck->fsp->conn->cnum)) {
-               release_posix_lock(br_lck->fsp, plock->start, plock->size);
+#if ZERO_ZERO
+  unlock_continue:
+#endif
+
+       /* Actually delete the lock. */
+       if (i < br_lck->num_locks - 1) {
+               memmove(&locks[i], &locks[i+1], 
+                       sizeof(*locks)*((br_lck->num_locks-1) - i));
+       }
+
+       br_lck->num_locks -= 1;
+       br_lck->modified = True;
+
+       /* Unlock the underlying POSIX regions. */
+       if(lp_posix_locking(br_lck->fsp->conn->params)) {
+               release_posix_lock_windows_flavour(br_lck->fsp,
+                               plock->start,
+                               plock->size,
+                               deleted_lock_type,
+                               &plock->context,
+                               locks,
+                               br_lck->num_locks);
        }
 
        /* Send unlock messages to any pending waiters that overlap. */
@@ -791,31 +867,20 @@ static BOOL brl_unlock_windows(struct byte_range_lock *br_lck, const struct lock
                struct lock_struct *pend_lock = &locks[j];
 
                /* Ignore non-pending locks. */
-               if (pend_lock->lock_type != PENDING_LOCK) {
+               if (!IS_PENDING_LOCK(pend_lock->lock_type)) {
                        continue;
                }
 
                /* We could send specific lock info here... */
-               if (brl_pending_overlap(lock, pend_lock)) {
+               if (brl_pending_overlap(plock, pend_lock)) {
                        DEBUG(10,("brl_unlock: sending unlock message to pid %s\n",
                                procid_str_static(&pend_lock->context.pid )));
 
-                       become_root();
-                       message_send_pid(pend_lock->context.pid,
-                                       MSG_SMB_UNLOCK,
-                                       NULL, 0, True);
-                       unbecome_root();
+                       messaging_send(msg_ctx, pend_lock->context.pid,
+                                      MSG_SMB_UNLOCK, &data_blob_null);
                }
        }
 
-       /* Actually delete the lock. */
-       if (i < br_lck->num_locks - 1) {
-               memmove(&locks[i], &locks[i+1], 
-                       sizeof(*locks)*((br_lck->num_locks-1) - i));
-       }
-
-       br_lck->num_locks -= 1;
-       br_lck->modified = True;
        return True;
 }
 
@@ -823,12 +888,13 @@ static BOOL brl_unlock_windows(struct byte_range_lock *br_lck, const struct lock
  Unlock a range of bytes - POSIX semantics.
 ****************************************************************************/
 
-static BOOL brl_unlock_posix(struct byte_range_lock *br_lck, const struct lock_struct *plock)
+static BOOL brl_unlock_posix(struct messaging_context *msg_ctx,
+                            struct byte_range_lock *br_lck,
+                            const struct lock_struct *plock)
 {
        unsigned int i, j, count;
-       struct lock_struct *lock = NULL;
        struct lock_struct *tp;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       struct lock_struct *locks = br_lck->lock_data;
        BOOL overlap_found = False;
 
        /* No zero-zero locks for POSIX. */
@@ -855,14 +921,13 @@ static BOOL brl_unlock_posix(struct byte_range_lock *br_lck, const struct lock_s
 
        count = 0;
        for (i = 0; i < br_lck->num_locks; i++) {
+               struct lock_struct *lock = &locks[i];
                struct lock_struct tmp_lock[3];
                BOOL lock_was_added = False;
                unsigned int tmp_count;
 
-               lock = &locks[i];
-
                /* Only remove our own locks - ignore fnum. */
-               if (lock->lock_type == PENDING_LOCK ||
+               if (IS_PENDING_LOCK(lock->lock_type) ||
                                !brl_same_context(&lock->context, &plock->context)) {
                        memcpy(&tp[count], lock, sizeof(struct lock_struct));
                        count++;
@@ -893,19 +958,24 @@ static BOOL brl_unlock_posix(struct byte_range_lock *br_lck, const struct lock_s
                                SMB_ASSERT(tmp_lock[0].lock_type == locks[i].lock_type);
                                SMB_ASSERT(tmp_lock[1].lock_type == UNLOCK_LOCK);
                                memcpy(&tp[count], &tmp_lock[0], sizeof(struct lock_struct));
+                               if (tmp_lock[0].size != locks[i].size) {
+                                       overlap_found = True;
+                               }
                        } else {
                                SMB_ASSERT(tmp_lock[0].lock_type == UNLOCK_LOCK);
                                SMB_ASSERT(tmp_lock[1].lock_type == locks[i].lock_type);
                                memcpy(&tp[count], &tmp_lock[1], sizeof(struct lock_struct));
+                               if (tmp_lock[1].start != locks[i].start) {
+                                       overlap_found = True;
+                               }
                        }
                        count++;
-                       overlap_found = True;
                        continue;
                } else {
                        /* tmp_count == 3 - (we split a lock range in two). */
                        SMB_ASSERT(tmp_lock[0].lock_type == locks[i].lock_type);
                        SMB_ASSERT(tmp_lock[1].lock_type == UNLOCK_LOCK);
-                       SMB_ASSERT(tmp_lock[2].lock_type != locks[i].lock_type);
+                       SMB_ASSERT(tmp_lock[2].lock_type == locks[i].lock_type);
 
                        memcpy(&tp[count], &tmp_lock[0], sizeof(struct lock_struct));
                        count++;
@@ -931,14 +1001,15 @@ static BOOL brl_unlock_posix(struct byte_range_lock *br_lck, const struct lock_s
                return True;
        }
 
-#if 0
-       /* FIXME - this call doesn't work correctly yet for POSIX locks... */
-
        /* Unlock any POSIX regions. */
-       if(lp_posix_locking(br_lck->fsp->conn->cnum)) {
-               release_posix_lock(br_lck->fsp, plock->start, plock->size);
+       if(lp_posix_locking(br_lck->fsp->conn->params)) {
+               release_posix_lock_posix_flavour(br_lck->fsp,
+                                               plock->start,
+                                               plock->size,
+                                               &plock->context,
+                                               tp,
+                                               count);
        }
-#endif
 
        /* Realloc so we don't leak entries per unlock call. */
        if (count) {
@@ -954,30 +1025,28 @@ static BOOL brl_unlock_posix(struct byte_range_lock *br_lck, const struct lock_s
        }
 
        br_lck->num_locks = count;
-       br_lck->lock_data = (void *)tp;
+       SAFE_FREE(br_lck->lock_data);
+       locks = tp;
+       br_lck->lock_data = tp;
        br_lck->modified = True;
 
        /* Send unlock messages to any pending waiters that overlap. */
-       locks = tp;
 
        for (j=0; j < br_lck->num_locks; j++) {
                struct lock_struct *pend_lock = &locks[j];
 
                /* Ignore non-pending locks. */
-               if (pend_lock->lock_type != PENDING_LOCK) {
+               if (!IS_PENDING_LOCK(pend_lock->lock_type)) {
                        continue;
                }
 
                /* We could send specific lock info here... */
-               if (brl_pending_overlap(lock, pend_lock)) {
+               if (brl_pending_overlap(plock, pend_lock)) {
                        DEBUG(10,("brl_unlock: sending unlock message to pid %s\n",
                                procid_str_static(&pend_lock->context.pid )));
 
-                       become_root();
-                       message_send_pid(pend_lock->context.pid,
-                                       MSG_SMB_UNLOCK,
-                                       NULL, 0, True);
-                       unbecome_root();
+                       messaging_send(msg_ctx, pend_lock->context.pid,
+                                      MSG_SMB_UNLOCK, &data_blob_null);
                }
        }
 
@@ -988,9 +1057,10 @@ static BOOL brl_unlock_posix(struct byte_range_lock *br_lck, const struct lock_s
  Unlock a range of bytes.
 ****************************************************************************/
 
-BOOL brl_unlock(struct byte_range_lock *br_lck,
-               uint16 smbpid,
-               struct process_id pid,
+BOOL brl_unlock(struct messaging_context *msg_ctx,
+               struct byte_range_lock *br_lck,
+               uint32 smbpid,
+               struct server_id pid,
                br_off start,
                br_off size,
                enum brl_flavour lock_flav)
@@ -1007,9 +1077,9 @@ BOOL brl_unlock(struct byte_range_lock *br_lck,
        lock.lock_flav = lock_flav;
 
        if (lock_flav == WINDOWS_LOCK) {
-               return brl_unlock_windows(br_lck, &lock);
+               return brl_unlock_windows(msg_ctx, br_lck, &lock);
        } else {
-               return brl_unlock_posix(br_lck, &lock);
+               return brl_unlock_posix(msg_ctx, br_lck, &lock);
        }
 }
 
@@ -1019,8 +1089,8 @@ BOOL brl_unlock(struct byte_range_lock *br_lck,
 ****************************************************************************/
 
 BOOL brl_locktest(struct byte_range_lock *br_lck,
-               uint16 smbpid,
-               struct process_id pid,
+               uint32 smbpid,
+               struct server_id pid,
                br_off start,
                br_off size, 
                enum brl_type lock_type,
@@ -1029,7 +1099,7 @@ BOOL brl_locktest(struct byte_range_lock *br_lck,
        BOOL ret = True;
        unsigned int i;
        struct lock_struct lock;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       const struct lock_struct *locks = br_lck->lock_data;
        files_struct *fsp = br_lck->fsp;
 
        lock.context.smbpid = smbpid;
@@ -1057,7 +1127,7 @@ BOOL brl_locktest(struct byte_range_lock *br_lck,
         * This only conflicts with Windows locks, not POSIX locks.
         */
 
-       if(lp_posix_locking(fsp->conn->cnum) && (lock_flav == WINDOWS_LOCK)) {
+       if(lp_posix_locking(fsp->conn->params) && (lock_flav == WINDOWS_LOCK)) {
                ret = is_posix_locked(fsp, &start, &size, &lock_type, WINDOWS_LOCK);
 
                DEBUG(10,("brl_locktest: posix start=%.0f len=%.0f %s for fnum %d file %s\n",
@@ -1077,8 +1147,8 @@ BOOL brl_locktest(struct byte_range_lock *br_lck,
 ****************************************************************************/
 
 NTSTATUS brl_lockquery(struct byte_range_lock *br_lck,
-               uint16 *psmbpid,
-               struct process_id pid,
+               uint32 *psmbpid,
+               struct server_id pid,
                br_off *pstart,
                br_off *psize, 
                enum brl_type *plock_type,
@@ -1086,7 +1156,7 @@ NTSTATUS brl_lockquery(struct byte_range_lock *br_lck,
 {
        unsigned int i;
        struct lock_struct lock;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       const struct lock_struct *locks = br_lck->lock_data;
        files_struct *fsp = br_lck->fsp;
 
        lock.context.smbpid = *psmbpid;
@@ -1100,7 +1170,7 @@ NTSTATUS brl_lockquery(struct byte_range_lock *br_lck,
 
        /* Make sure existing locks don't conflict */
        for (i=0; i < br_lck->num_locks; i++) {
-               struct lock_struct *exlock = &locks[i];
+               const struct lock_struct *exlock = &locks[i];
                BOOL conflict = False;
 
                if (exlock->lock_flav == WINDOWS_LOCK) {
@@ -1123,7 +1193,7 @@ NTSTATUS brl_lockquery(struct byte_range_lock *br_lck,
         * see if there is a POSIX lock from a UNIX or NFS process.
         */
 
-       if(lp_posix_locking(fsp->conn->cnum)) {
+       if(lp_posix_locking(fsp->conn->params)) {
                BOOL ret = is_posix_locked(fsp, pstart, psize, plock_type, POSIX_LOCK);
 
                DEBUG(10,("brl_lockquery: posix start=%.0f len=%.0f %s for fnum %d file %s\n",
@@ -1140,20 +1210,19 @@ NTSTATUS brl_lockquery(struct byte_range_lock *br_lck,
        return NT_STATUS_OK;
 }
 
-
 /****************************************************************************
  Remove a particular pending lock.
 ****************************************************************************/
 
-BOOL brl_remove_pending_lock(struct byte_range_lock *br_lck,
-               uint16 smbpid,
-               struct process_id pid,
+BOOL brl_lock_cancel(struct byte_range_lock *br_lck,
+               uint32 smbpid,
+               struct server_id pid,
                br_off start,
                br_off size,
                enum brl_flavour lock_flav)
 {
        unsigned int i;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       struct lock_struct *locks = br_lck->lock_data;
        struct lock_context context;
 
        context.smbpid = smbpid;
@@ -1166,7 +1235,7 @@ BOOL brl_remove_pending_lock(struct byte_range_lock *br_lck,
                /* For pending locks we *always* care about the fnum. */
                if (brl_same_context(&lock->context, &context) &&
                                lock->fnum == br_lck->fsp->fnum &&
-                               lock->lock_type == PENDING_LOCK &&
+                               IS_PENDING_LOCK(lock->lock_type) &&
                                lock->lock_flav == lock_flav &&
                                lock->start == start &&
                                lock->size == size) {
@@ -1190,18 +1259,82 @@ BOOL brl_remove_pending_lock(struct byte_range_lock *br_lck,
        return True;
 }
 
-
 /****************************************************************************
  Remove any locks associated with a open file.
+ We return True if this process owns any other Windows locks on this
+ fd and so we should not immediately close the fd.
 ****************************************************************************/
 
-void brl_close_fnum(struct byte_range_lock *br_lck, struct process_id pid)
+void brl_close_fnum(struct messaging_context *msg_ctx,
+                   struct byte_range_lock *br_lck)
 {
        files_struct *fsp = br_lck->fsp;
        uint16 tid = fsp->conn->cnum;
        int fnum = fsp->fnum;
        unsigned int i, j, dcount=0;
-       struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
+       int num_deleted_windows_locks = 0;
+       struct lock_struct *locks = br_lck->lock_data;
+       struct server_id pid = procid_self();
+       BOOL unlock_individually = False;
+
+       if(lp_posix_locking(fsp->conn->params)) {
+
+               /* Check if there are any Windows locks associated with this dev/ino
+                  pair that are not this fnum. If so we need to call unlock on each
+                  one in order to release the system POSIX locks correctly. */
+
+               for (i=0; i < br_lck->num_locks; i++) {
+                       struct lock_struct *lock = &locks[i];
+
+                       if (!procid_equal(&lock->context.pid, &pid)) {
+                               continue;
+                       }
+
+                       if (lock->lock_type != READ_LOCK && lock->lock_type != WRITE_LOCK) {
+                               continue; /* Ignore pending. */
+                       }
+
+                       if (lock->context.tid != tid || lock->fnum != fnum) {
+                               unlock_individually = True;
+                               break;
+                       }
+               }
+
+               if (unlock_individually) {
+                       struct lock_struct *locks_copy;
+                       unsigned int num_locks_copy;
+
+                       /* Copy the current lock array. */
+                       if (br_lck->num_locks) {
+                               locks_copy = (struct lock_struct *)TALLOC_MEMDUP(br_lck, locks, br_lck->num_locks * sizeof(struct lock_struct));
+                               if (!locks_copy) {
+                                       smb_panic("brl_close_fnum: talloc failed");
+                               }
+                       } else {        
+                               locks_copy = NULL;
+                       }
+
+                       num_locks_copy = br_lck->num_locks;
+
+                       for (i=0; i < num_locks_copy; i++) {
+                               struct lock_struct *lock = &locks_copy[i];
+
+                               if (lock->context.tid == tid && procid_equal(&lock->context.pid, &pid) &&
+                                               (lock->fnum == fnum)) {
+                                       brl_unlock(msg_ctx,
+                                               br_lck,
+                                               lock->context.smbpid,
+                                               pid,
+                                               lock->start,
+                                               lock->size,
+                                               lock->lock_flav);
+                               }
+                       }
+                       return;
+               }
+       }
+
+       /* We can bulk delete - any POSIX locks will be removed when the fd closes. */
 
        /* Remove any existing locks for this fnum (or any fnum if they're POSIX). */
 
@@ -1212,6 +1345,7 @@ void brl_close_fnum(struct byte_range_lock *br_lck, struct process_id pid)
                if (lock->context.tid == tid && procid_equal(&lock->context.pid, &pid)) {
                        if ((lock->lock_flav == WINDOWS_LOCK) && (lock->fnum == fnum)) {
                                del_this_lock = True;
+                               num_deleted_windows_locks++;
                        } else if (lock->lock_flav == POSIX_LOCK) {
                                del_this_lock = True;
                        }
@@ -1223,7 +1357,7 @@ void brl_close_fnum(struct byte_range_lock *br_lck, struct process_id pid)
                                struct lock_struct *pend_lock = &locks[j];
 
                                /* Ignore our own or non-pending locks. */
-                               if (pend_lock->lock_type != PENDING_LOCK) {
+                               if (!IS_PENDING_LOCK(pend_lock->lock_type)) {
                                        continue;
                                }
 
@@ -1237,11 +1371,8 @@ void brl_close_fnum(struct byte_range_lock *br_lck, struct process_id pid)
 
                                /* We could send specific lock info here... */
                                if (brl_pending_overlap(lock, pend_lock)) {
-                                       become_root();
-                                       message_send_pid(pend_lock->context.pid,
-                                                       MSG_SMB_UNLOCK,
-                                                       NULL, 0, True);
-                                       unbecome_root();
+                                       messaging_send(msg_ctx, pend_lock->context.pid,
+                                                      MSG_SMB_UNLOCK, &data_blob_null);
                                }
                        }
 
@@ -1256,33 +1387,129 @@ void brl_close_fnum(struct byte_range_lock *br_lck, struct process_id pid)
                        dcount++;
                }
        }
+
+       if(lp_posix_locking(fsp->conn->params) && num_deleted_windows_locks) {
+               /* Reduce the Windows lock POSIX reference count on this dev/ino pair. */
+               reduce_windows_lock_ref_count(fsp, num_deleted_windows_locks);
+       }
 }
 
+/****************************************************************************
+ Ensure this set of lock entries is valid.
+****************************************************************************/
+
+static BOOL validate_lock_entries(unsigned int *pnum_entries, struct lock_struct **pplocks)
+{
+       unsigned int i;
+       unsigned int num_valid_entries = 0;
+       struct lock_struct *locks = *pplocks;
+
+       for (i = 0; i < *pnum_entries; i++) {
+               struct lock_struct *lock_data = &locks[i];
+               if (!process_exists(lock_data->context.pid)) {
+                       /* This process no longer exists - mark this
+                          entry as invalid by zeroing it. */
+                       ZERO_STRUCTP(lock_data);
+               } else {
+                       num_valid_entries++;
+               }
+       }
+
+       if (num_valid_entries != *pnum_entries) {
+               struct lock_struct *new_lock_data = NULL;
+
+               if (num_valid_entries) {
+                       new_lock_data = SMB_MALLOC_ARRAY(struct lock_struct, num_valid_entries);
+                       if (!new_lock_data) {
+                               DEBUG(3, ("malloc fail\n"));
+                               return False;
+                       }
+
+                       num_valid_entries = 0;
+                       for (i = 0; i < *pnum_entries; i++) {
+                               struct lock_struct *lock_data = &locks[i];
+                               if (lock_data->context.smbpid &&
+                                               lock_data->context.tid) {
+                                       /* Valid (nonzero) entry - copy it. */
+                                       memcpy(&new_lock_data[num_valid_entries],
+                                               lock_data, sizeof(struct lock_struct));
+                                       num_valid_entries++;
+                               }
+                       }
+               }
+
+               SAFE_FREE(*pplocks);
+               *pplocks = new_lock_data;
+               *pnum_entries = num_valid_entries;
+       }
+
+       return True;
+}
+
+struct brl_forall_cb {
+       void (*fn)(struct file_id id, struct server_id pid,
+                  enum brl_type lock_type,
+                  enum brl_flavour lock_flav,
+                  br_off start, br_off size,
+                  void *private_data);
+       void *private_data;
+};
+
 /****************************************************************************
  Traverse the whole database with this function, calling traverse_callback
  on each lock.
 ****************************************************************************/
 
-static int traverse_fn(TDB_CONTEXT *ttdb, TDB_DATA kbuf, TDB_DATA dbuf, void *state)
+static int traverse_fn(struct db_record *rec, void *state)
 {
+       struct brl_forall_cb *cb = (struct brl_forall_cb *)state;
        struct lock_struct *locks;
-       struct lock_key *key;
-       int i;
+       struct file_id *key;
+       unsigned int i;
+       unsigned int num_locks = 0;
+       unsigned int orig_num_locks = 0;
 
-       BRLOCK_FN(traverse_callback) = (BRLOCK_FN_CAST())state;
+       /* In a traverse function we must make a copy of
+          dbuf before modifying it. */
 
-       locks = (struct lock_struct *)dbuf.dptr;
-       key = (struct lock_key *)kbuf.dptr;
+       locks = (struct lock_struct *)memdup(rec->value.dptr,
+                                            rec->value.dsize);
+       if (!locks) {
+               return -1; /* Terminate traversal. */
+       }
+
+       key = (struct file_id *)rec->key.dptr;
+       orig_num_locks = num_locks = rec->value.dsize/sizeof(*locks);
+
+       /* Ensure the lock db is clean of entries from invalid processes. */
+
+       if (!validate_lock_entries(&num_locks, &locks)) {
+               SAFE_FREE(locks);
+               return -1; /* Terminate traversal */
+       }
+
+       if (orig_num_locks != num_locks) {
+               if (num_locks) {
+                       TDB_DATA data;
+                       data.dptr = (uint8_t *)locks;
+                       data.dsize = num_locks*sizeof(struct lock_struct);
+                       rec->store(rec, data, TDB_REPLACE);
+               } else {
+                       rec->delete_rec(rec);
+               }
+       }
 
-       for (i=0;i<dbuf.dsize/sizeof(*locks);i++) {
-               traverse_callback(key->device,
-                                 key->inode,
-                                 locks[i].context.pid,
-                                 locks[i].lock_type,
-                                 locks[i].lock_flav,
-                                 locks[i].start,
-                                 locks[i].size);
+       for ( i=0; i<num_locks; i++) {
+               cb->fn(*key,
+                      locks[i].context.pid,
+                      locks[i].lock_type,
+                      locks[i].lock_flav,
+                      locks[i].start,
+                      locks[i].size,
+                      cb->private_data);
        }
+
+       SAFE_FREE(locks);
        return 0;
 }
 
@@ -1290,12 +1517,21 @@ static int traverse_fn(TDB_CONTEXT *ttdb, TDB_DATA kbuf, TDB_DATA dbuf, void *st
  Call the specified function on each lock in the database.
 ********************************************************************/
 
-int brl_forall(BRLOCK_FN(fn))
+int brl_forall(void (*fn)(struct file_id id, struct server_id pid,
+                         enum brl_type lock_type,
+                         enum brl_flavour lock_flav,
+                         br_off start, br_off size,
+                         void *private_data),
+              void *private_data)
 {
-       if (!tdb) {
+       struct brl_forall_cb cb;
+
+       if (!brlock_db) {
                return 0;
        }
-       return tdb_traverse(tdb, traverse_fn, (void *)fn);
+       cb.fn = fn;
+       cb.private_data = private_data;
+       return brlock_db->traverse(brlock_db, traverse_fn, &cb);
 }
 
 /*******************************************************************
@@ -1304,12 +1540,16 @@ int brl_forall(BRLOCK_FN(fn))
  Unlock the record.
 ********************************************************************/
 
-int byte_range_lock_destructor(struct byte_range_lock *br_lck)
+static int byte_range_lock_destructor(struct byte_range_lock *br_lck)
 {
        TDB_DATA key;
 
-       key.dptr = (char *)&br_lck->key;
-       key.dsize = sizeof(struct lock_key);
+       key.dptr = (uint8 *)&br_lck->key;
+       key.dsize = sizeof(struct file_id);
+
+       if (br_lck->read_only) {
+               SMB_ASSERT(!br_lck->modified);
+       }
 
        if (!br_lck->modified) {
                goto done;
@@ -1317,37 +1557,45 @@ int byte_range_lock_destructor(struct byte_range_lock *br_lck)
 
        if (br_lck->num_locks == 0) {
                /* No locks - delete this entry. */
-               if (tdb_delete(tdb, key) == -1) {
-                       smb_panic("Could not delete byte range lock entry\n");
+               NTSTATUS status = br_lck->record->delete_rec(br_lck->record);
+               if (!NT_STATUS_IS_OK(status)) {
+                       DEBUG(0, ("delete_rec returned %s\n",
+                                 nt_errstr(status)));
+                       smb_panic("Could not delete byte range lock entry");
                }
        } else {
                TDB_DATA data;
-               data.dptr = br_lck->lock_data;
+               NTSTATUS status;
+
+               data.dptr = (uint8 *)br_lck->lock_data;
                data.dsize = br_lck->num_locks * sizeof(struct lock_struct);
 
-               if (tdb_store(tdb, key, data, TDB_REPLACE) == -1) {
-                       smb_panic("Could not store byte range mode entry\n");
+               status = br_lck->record->store(br_lck->record, data,
+                                              TDB_REPLACE);
+               if (!NT_STATUS_IS_OK(status)) {
+                       DEBUG(0, ("store returned %s\n", nt_errstr(status)));
+                       smb_panic("Could not store byte range mode entry");
                }
        }
 
  done:
 
-       tdb_chainunlock(tdb, key);
        SAFE_FREE(br_lck->lock_data);
-       SAFE_FREE(br_lck);
+       TALLOC_FREE(br_lck->record);
        return 0;
 }
 
 /*******************************************************************
  Fetch a set of byte range lock data from the database.
  Leave the record locked.
+ TALLOC_FREE(brl) will release the lock in the destructor.
 ********************************************************************/
 
-struct byte_range_lock *brl_get_locks(files_struct *fsp)
+/* Fetch the byte range lock record for fsp from brlock_db.
+   read_only == False takes (and holds) the chain lock via fetch_locked;
+   the returned talloc object releases it in its destructor.
+   read_only == True only fetches an unlocked snapshot. */
+static struct byte_range_lock *brl_get_locks_internal(TALLOC_CTX *mem_ctx,
+                                       files_struct *fsp, BOOL read_only)
 {
-       TDB_DATA key;
-       TDB_DATA data;
-       struct byte_range_lock *br_lck = SMB_MALLOC_P(struct byte_range_lock);
+       TDB_DATA key, data;
+       struct byte_range_lock *br_lck = TALLOC_P(mem_ctx, struct byte_range_lock);
 
        if (br_lck == NULL) {
                return NULL;
@@ -1356,32 +1604,202 @@ struct byte_range_lock *brl_get_locks(files_struct *fsp)
        br_lck->fsp = fsp;
        br_lck->num_locks = 0;
        br_lck->modified = False;
-       memset(&br_lck->key, '\0', sizeof(struct lock_key));
-       br_lck->key.device = fsp->dev;
-       br_lck->key.inode = fsp->inode;
+       memset(&br_lck->key, '\0', sizeof(struct file_id));
+       br_lck->key = fsp->file_id;
 
-       key.dptr = (char *)&br_lck->key;
-       key.dsize = sizeof(struct lock_key);
+       key.dptr = (uint8 *)&br_lck->key;
+       key.dsize = sizeof(struct file_id);
 
-       if (tdb_chainlock(tdb, key) != 0) {
-               DEBUG(3, ("Could not lock byte range lock entry\n"));
-               SAFE_FREE(br_lck);
-               return NULL;
+       if (!fsp->lockdb_clean) {
+               /* We must be read/write to clean
+                  the dead entries. */
+               read_only = False;
        }
 
-       data = tdb_fetch(tdb, key);
-       br_lck->lock_data = (void *)data.dptr;
+       /* Read-only: take an unlocked snapshot; no db_record is held. */
+       if (read_only) {
+               if (brlock_db->fetch(brlock_db, br_lck, key, &data) == -1) {
+                       DEBUG(3, ("Could not fetch byte range lock record\n"));
+                       TALLOC_FREE(br_lck);
+                       return NULL;
+               }
+               br_lck->record = NULL;
+       }
+       else {
+               br_lck->record = brlock_db->fetch_locked(brlock_db, br_lck, key);
+
+               if (br_lck->record == NULL) {
+                       DEBUG(3, ("Could not lock byte range lock entry\n"));
+                       TALLOC_FREE(br_lck);
+                       return NULL;
+               }
+
+               data = br_lck->record->value;
+       }
+
+       br_lck->read_only = read_only;
+
+       /* The destructor writes back/deletes the record and drops the lock. */
+       talloc_set_destructor(br_lck, byte_range_lock_destructor);
+
        br_lck->num_locks = data.dsize / sizeof(struct lock_struct);
+       /* Keep a private, malloc'ed copy the caller may modify; the
+          destructor SAFE_FREEs it. */
+       br_lck->lock_data = SMB_MALLOC_ARRAY(struct lock_struct, br_lck->num_locks);
+       if ((br_lck->num_locks != 0) && (br_lck->lock_data == NULL)) {
+               DEBUG(0, ("malloc failed\n"));
+               TALLOC_FREE(br_lck);
+               return NULL;
+       }
+
+       memcpy(br_lck->lock_data, data.dptr, data.dsize);
+       
+       if (!fsp->lockdb_clean) {
+               int orig_num_locks = br_lck->num_locks;
+
+               /* This is the first time we've accessed this. */
+               /* Go through and ensure all entries exist - remove any that don't. */
+               /* Makes the lockdb self cleaning at low cost. */
+
+               if (!validate_lock_entries(&br_lck->num_locks,
+                                          &br_lck->lock_data)) {
+                       SAFE_FREE(br_lck->lock_data);
+                       TALLOC_FREE(br_lck);
+                       return NULL;
+               }
+
+               /* Ensure invalid locks are cleaned up in the destructor. */
+               if (orig_num_locks != br_lck->num_locks) {
+                       br_lck->modified = True;
+               }
+
+               /* Mark the lockdb as "clean" as seen from this open file. */
+               fsp->lockdb_clean = True;
+       }
 
        if (DEBUGLEVEL >= 10) {
                unsigned int i;
-               struct lock_struct *locks = (struct lock_struct *)br_lck->lock_data;
-               DEBUG(10,("brl_get_locks: %u current locks on dev=%.0f, inode=%.0f\n",
+               struct lock_struct *locks = br_lck->lock_data;
+               DEBUG(10,("brl_get_locks_internal: %u current locks on file_id %s\n",
                        br_lck->num_locks,
-                       (double)fsp->dev, (double)fsp->inode ));
+                         file_id_static_string(&fsp->file_id)));
                for( i = 0; i < br_lck->num_locks; i++) {
                        print_lock_struct(i, &locks[i]);
                }
        }
        return br_lck;
 }
+
+/* Fetch the byte range lock record for fsp read/write; the chain lock
+   is held until the returned object is TALLOC_FREE'd. */
+struct byte_range_lock *brl_get_locks(TALLOC_CTX *mem_ctx,
+                                       files_struct *fsp)
+{
+       return brl_get_locks_internal(mem_ctx, fsp, False);
+}
+
+/* Fetch an unlocked snapshot of the byte range lock record for fsp.
+   Note brl_get_locks_internal upgrades to read/write if the lockdb
+   for this fsp has not been cleaned yet. */
+struct byte_range_lock *brl_get_locks_readonly(TALLOC_CTX *mem_ctx,
+                                       files_struct *fsp)
+{
+       return brl_get_locks_internal(mem_ctx, fsp, True);
+}
+
+/* Accumulator filled by brl_revalidate_collect for brl_revalidate. */
+struct brl_revalidate_state {
+       ssize_t array_size;     /* allocated size; -1 => allocation failed */
+       uint32 num_pids;        /* entries used in pids[] */
+       struct server_id *pids; /* one entry per pending lock; may hold dupes */
+};
+
+/*
+ * Collect PIDs of all processes with pending entries
+ */
+
+static void brl_revalidate_collect(struct file_id id, struct server_id pid,
+                                  enum brl_type lock_type,
+                                  enum brl_flavour lock_flav,
+                                  br_off start, br_off size,
+                                  void *private_data)
+{
+       struct brl_revalidate_state *state =
+               (struct brl_revalidate_state *)private_data;
+
+       /* Only pending (blocked) locks need a retry message. */
+       if (!IS_PENDING_LOCK(lock_type)) {
+               return;
+       }
+
+       /* Append pid by value; duplicates are filtered after sorting.
+          add_to_large_array reports failure via state->array_size. */
+       add_to_large_array(state, sizeof(pid), (void *)&pid,
+                          &state->pids, &state->num_pids,
+                          &state->array_size);
+}
+
+/*
+ * qsort callback to sort the processes
+ */
+
+/* Orders two server_id entries by their pid member so that qsort makes
+   duplicate PIDs adjacent for the dedupe pass in brl_revalidate. */
+static int compare_procids(const void *p1, const void *p2)
+{
+       const struct server_id *i1 = (struct server_id *)p1;
+       const struct server_id *i2 = (struct server_id *)p2;
+
+       if (i1->pid < i2->pid) return -1;
+       /* BUGFIX: was "i2->pid > i2->pid" (compares i2 with itself, always
+          false) — an inconsistent comparator is undefined behaviour for
+          qsort() and defeated the duplicate suppression. */
+       if (i1->pid > i2->pid) return 1;
+       return 0;
+}
+
+/*
+ * Send a MSG_SMB_UNLOCK message to all processes with pending byte range
+ * locks so that they retry. Mainly used in the cluster code after a node has
+ * died.
+ *
+ * Done in two steps to avoid double-sends: First we collect all entries in an
+ * array, then qsort that array and only send to non-dupes.
+ */
+
+static void brl_revalidate(struct messaging_context *msg_ctx,
+                          void *private_data,
+                          uint32_t msg_type,
+                          struct server_id server_id,
+                          DATA_BLOB *data)
+{
+       struct brl_revalidate_state *state;
+       uint32 i;
+       struct server_id last_pid;
+
+       if (!(state = TALLOC_ZERO_P(NULL, struct brl_revalidate_state))) {
+               DEBUG(0, ("talloc failed\n"));
+               return;
+       }
+
+       /* Walk every brlock record, gathering PIDs of pending locks. */
+       brl_forall(brl_revalidate_collect, state);
+
+       /* array_size == -1 marks an allocation failure inside
+          add_to_large_array during the collect pass. */
+       if (state->array_size == -1) {
+               DEBUG(0, ("talloc failed\n"));
+               goto done;
+       }
+
+       if (state->num_pids == 0) {
+               goto done;
+       }
+
+       /* Sort so duplicate PIDs become adjacent and are sent to once. */
+       qsort(state->pids, state->num_pids, sizeof(state->pids[0]),
+             compare_procids);
+
+       ZERO_STRUCT(last_pid);
+
+       for (i=0; i<state->num_pids; i++) {
+               if (procid_equal(&last_pid, &state->pids[i])) {
+                       /*
+                        * We've seen that one already
+                        */
+                       continue;
+               }
+
+               messaging_send(msg_ctx, state->pids[i], MSG_SMB_UNLOCK,
+                              &data_blob_null);
+               last_pid = state->pids[i];
+       }
+
+ done:
+       TALLOC_FREE(state);
+       return;
+}
+
+/* Register brl_revalidate as the handler for MSG_SMB_BRL_VALIDATE
+   (sent e.g. by the cluster code when a node dies). */
+void brl_register_msgs(struct messaging_context *msg_ctx)
+{
+       messaging_register(msg_ctx, NULL, MSG_SMB_BRL_VALIDATE,
+                          brl_revalidate);
+}