s3: Use dbwrap_parse_record in fetch_share_mode_unlocked

[kai/samba.git] / source3 / locking / brlock.c
diff --git a/source3/locking/brlock.c b/source3/locking/brlock.c

index 20d76c9c7926eec9079d075fb2f725e67a40d92d..b7abaa9288c2e8c13764e5f6375d8d6fc0fda5a9 100644 (file)
--- a/source3/locking/brlock.c
+++ b/source3/locking/brlock.c
@@ -5,20 +5,19 @@
  
     Copyright (C) Andrew Tridgell 1992-2000
     Copyright (C) Jeremy Allison 1992-2000
-   
+
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
+   the Free Software Foundation; either version 3 of the License, or
     (at your option) any later version.
-   
+
     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-   
+
     You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
  
  /* This module implements a tdb based byte range locking service,
@@ -26,103 +25,143 @@
     used. This allows us to provide the same semantics as NT */
  
  #include "includes.h"
+#include "system/filesys.h"
+#include "locking/proto.h"
+#include "smbd/globals.h"
+#include "dbwrap/dbwrap.h"
+#include "dbwrap/dbwrap_open.h"
+#include "serverid.h"
+#include "messages.h"
  
-#define ZERO_ZERO 0
-
-/* This contains elements that differentiate locks. The smbpid is a
-   client supplied pid, and is essentially the locking context for
-   this client */
-
-struct lock_context {
-       uint16 smbpid;
-       uint16 tid;
-       pid_t pid;
-};
-
-/* The data in brlock records is an unsorted linear array of these
-   records.  It is unnecessary to store the count as tdb provides the
-   size of the record */
+#undef DBGC_CLASS
+#define DBGC_CLASS DBGC_LOCKING
  
-struct lock_struct {
-       struct lock_context context;
-       br_off start;
-       br_off size;
-       int fnum;
-       enum brl_type lock_type;
-};
-
-/* The key used in the brlock database. */
-
-struct lock_key {
-       SMB_DEV_T device;
-       SMB_INO_T inode;
-};
+#define ZERO_ZERO 0
  
  /* The open brlock.tdb database. */
  
-static TDB_CONTEXT *tdb;
+static struct db_context *brlock_db;
  
  /****************************************************************************
- Create a locking key - ensuring zero filled for pad purposes.
+ Debug info at level 10 for lock struct.
  ****************************************************************************/
  
-static TDB_DATA locking_key(SMB_DEV_T dev, SMB_INO_T inode)
+static void print_lock_struct(unsigned int i, const struct lock_struct *pls)
  {
-        static struct lock_key key;
-        TDB_DATA kbuf;
-
-        memset(&key, '\0', sizeof(key));
-        key.device = dev;
-        key.inode = inode;
-        kbuf.dptr = (char *)&key;
-        kbuf.dsize = sizeof(key);
-        return kbuf;
+       DEBUG(10,("[%u]: smblctx = %llu, tid = %u, pid = %s, ",
+                       i,
+                       (unsigned long long)pls->context.smblctx,
+                       (unsigned int)pls->context.tid,
+                       server_id_str(talloc_tos(), &pls->context.pid) ));
+
+       DEBUG(10,("start = %.0f, size = %.0f, fnum = %llu, %s %s\n",
+               (double)pls->start,
+               (double)pls->size,
+               (unsigned long long)pls->fnum,
+               lock_type_name(pls->lock_type),
+               lock_flav_name(pls->lock_flav) ));
  }
  
  /****************************************************************************
   See if two locking contexts are equal.
  ****************************************************************************/
  
-static BOOL brl_same_context(struct lock_context *ctx1, 
-                            struct lock_context *ctx2)
+bool brl_same_context(const struct lock_context *ctx1, 
+                            const struct lock_context *ctx2)
+{
+       return (serverid_equal(&ctx1->pid, &ctx2->pid) &&
+               (ctx1->smblctx == ctx2->smblctx) &&
+               (ctx1->tid == ctx2->tid));
+}
+
+/****************************************************************************
+ See if lck1 and lck2 overlap.
+****************************************************************************/
+
+static bool brl_overlap(const struct lock_struct *lck1,
+                        const struct lock_struct *lck2)
  {
-       return (ctx1->pid == ctx2->pid) &&
-               (ctx1->smbpid == ctx2->smbpid) &&
-               (ctx1->tid == ctx2->tid);
+       /* XXX Remove for Win7 compatibility. */
+       /* this extra check is not redundant - it copes with locks
+          that go beyond the end of 64 bit file space */
+       if (lck1->size != 0 &&
+           lck1->start == lck2->start &&
+           lck1->size == lck2->size) {
+               return True;
+       }
+
+       if (lck1->start >= (lck2->start+lck2->size) ||
+           lck2->start >= (lck1->start+lck1->size)) {
+               return False;
+       }
+       return True;
  }
  
  /****************************************************************************
   See if lock2 can be added when lock1 is in place.
  ****************************************************************************/
  
-static BOOL brl_conflict(struct lock_struct *lck1, 
-                        struct lock_struct *lck2)
+static bool brl_conflict(const struct lock_struct *lck1, 
+                        const struct lock_struct *lck2)
  {
-       if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK )
+       /* Ignore PENDING locks. */
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
                 return False;
  
+       /* Read locks never conflict. */
         if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) {
                 return False;
         }
  
-       if (brl_same_context(&lck1->context, &lck2->context) &&
-           lck2->lock_type == READ_LOCK && lck1->fnum == lck2->fnum) {
+       /* A READ lock can stack on top of a WRITE lock if they have the same
+        * context & fnum. */
+       if (lck1->lock_type == WRITE_LOCK && lck2->lock_type == READ_LOCK &&
+           brl_same_context(&lck1->context, &lck2->context) &&
+           lck1->fnum == lck2->fnum) {
                 return False;
         }
  
-       if (lck1->start >= (lck2->start + lck2->size) ||
-           lck2->start >= (lck1->start + lck1->size)) {
+       return brl_overlap(lck1, lck2);
+} 
+
+/****************************************************************************
+ See if lock2 can be added when lock1 is in place - when both locks are POSIX
+ flavour. POSIX locks ignore fnum - they only care about dev/ino which we
+ know already match.
+****************************************************************************/
+
+static bool brl_conflict_posix(const struct lock_struct *lck1, 
+                               const struct lock_struct *lck2)
+{
+#if defined(DEVELOPER)
+       SMB_ASSERT(lck1->lock_flav == POSIX_LOCK);
+       SMB_ASSERT(lck2->lock_flav == POSIX_LOCK);
+#endif
+
+       /* Ignore PENDING locks. */
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
+               return False;
+
+       /* Read locks never conflict. */
+       if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) {
                 return False;
         }
-           
-       return True;
+
+       /* Locks on the same context don't conflict. Ignore fnum. */
+       if (brl_same_context(&lck1->context, &lck2->context)) {
+               return False;
+       }
+
+       /* One is read, the other write, or the context is different,
+          do they overlap ? */
+       return brl_overlap(lck1, lck2);
  } 
  
  #if ZERO_ZERO
-static BOOL brl_conflict1(struct lock_struct *lck1, 
-                        struct lock_struct *lck2)
+static bool brl_conflict1(const struct lock_struct *lck1, 
+                        const struct lock_struct *lck2)
  {
-       if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK )
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
                 return False;
  
         if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) {
@@ -142,24 +181,31 @@ static BOOL brl_conflict1(struct lock_struct *lck1,
             lck2->start >= (lck1->start + lck1->size)) {
                 return False;
         }
-           
+
         return True;
  } 
  #endif
  
  /****************************************************************************
   Check to see if this lock conflicts, but ignore our own locks on the
- same fnum only.
+ same fnum only. This is the read/write lock check code path.
+ This is never used in the POSIX lock case.
  ****************************************************************************/
  
-static BOOL brl_conflict_other(struct lock_struct *lck1, struct lock_struct *lck2)
+static bool brl_conflict_other(const struct lock_struct *lck1, const struct lock_struct *lck2)
  {
-       if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK )
+       if (IS_PENDING_LOCK(lck1->lock_type) || IS_PENDING_LOCK(lck2->lock_type))
                 return False;
  
         if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) 
                 return False;
  
+       /* POSIX flavour locks never conflict here - this is only called
+          in the read/write path. */
+
+       if (lck1->lock_flav == POSIX_LOCK && lck2->lock_flav == POSIX_LOCK)
+               return False;
+
         /*
          * Incoming WRITE locks conflict with existing READ locks even
          * if the context is the same. JRA. See LOCKTEST7 in smbtorture.
@@ -171,126 +217,108 @@ static BOOL brl_conflict_other(struct lock_struct *lck1, struct lock_struct *lck
                         return False;
         }
  
-       if (lck1->start >= (lck2->start + lck2->size) ||
-           lck2->start >= (lck1->start + lck1->size)) return False;
-           
-       return True;
+       return brl_overlap(lck1, lck2);
  } 
  
-
-#if DONT_DO_THIS
-       /* doing this traversal could kill solaris machines under high load (tridge) */
-       /* delete any dead locks */
-
  /****************************************************************************
- Delete a record if it is for a dead process, if check_self is true, then
- delete any records belonging to this pid also (there shouldn't be any).
+ Check if an unlock overlaps a pending lock.
  ****************************************************************************/
  
-static int delete_fn(TDB_CONTEXT *ttdb, TDB_DATA kbuf, TDB_DATA dbuf, void *state)
+static bool brl_pending_overlap(const struct lock_struct *lock, const struct lock_struct *pend_lock)
  {
-       struct lock_struct *locks;
-       int count, i;
-       BOOL check_self = *(BOOL *)state;
-       pid_t mypid = sys_getpid();
-
-       tdb_chainlock(tdb, kbuf);
-
-       locks = (struct lock_struct *)dbuf.dptr;
-
-       count = dbuf.dsize / sizeof(*locks);
-       for (i=0; i<count; i++) {
-               struct lock_struct *lock = &locks[i];
-
-               /* If check_self is true we want to remove our own records. */
-               if (check_self && (mypid == lock->context.pid)) {
-
-                       DEBUG(0,("brlock : delete_fn. LOGIC ERROR ! Shutting down and a record for my pid (%u) exists !\n",
-                                       (unsigned int)lock->context.pid ));
-
-               } else if (process_exists(lock->context.pid)) {
-
-                       DEBUG(10,("brlock : delete_fn. pid %u exists.\n", (unsigned int)lock->context.pid ));
-                       continue;
-               }
+       if ((lock->start <= pend_lock->start) && (lock->start + lock->size > pend_lock->start))
+               return True;
+       if ((lock->start >= pend_lock->start) && (lock->start <= pend_lock->start + pend_lock->size))
+               return True;
+       return False;
+}
  
-               DEBUG(10,("brlock : delete_fn. Deleting record for process %u\n",
-                               (unsigned int)lock->context.pid ));
+/****************************************************************************
+ Amazingly enough, w2k3 "remembers" whether the last lock failure on a fnum
+ is the same as this one and changes its error code. I wonder if any
+ app depends on this ?
+****************************************************************************/
  
-               if (count > 1 && i < count-1) {
-                       memmove(&locks[i], &locks[i+1], 
-                               sizeof(*locks)*((count-1) - i));
+NTSTATUS brl_lock_failed(files_struct *fsp, const struct lock_struct *lock, bool blocking_lock)
+{
+       if (lock->start >= 0xEF000000 && (lock->start >> 63) == 0) {
+               /* amazing the little things you learn with a test
+                  suite. Locks beyond this offset (as a 64 bit
+                  number!) always generate the conflict error code,
+                  unless the top bit is set */
+               if (!blocking_lock) {
+                       fsp->last_lock_failure = *lock;
                 }
-               count--;
-               i--;
+               return NT_STATUS_FILE_LOCK_CONFLICT;
         }
  
-       if (count == 0) {
-               tdb_delete(tdb, kbuf);
-       } else if (count < (dbuf.dsize / sizeof(*locks))) {
-               dbuf.dsize = count * sizeof(*locks);
-               tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
+       if (serverid_equal(&lock->context.pid, &fsp->last_lock_failure.context.pid) &&
+                       lock->context.tid == fsp->last_lock_failure.context.tid &&
+                       lock->fnum == fsp->last_lock_failure.fnum &&
+                       lock->start == fsp->last_lock_failure.start) {
+               return NT_STATUS_FILE_LOCK_CONFLICT;
         }
  
-       tdb_chainunlock(tdb, kbuf);
-       return 0;
+       if (!blocking_lock) {
+               fsp->last_lock_failure = *lock;
+       }
+       return NT_STATUS_LOCK_NOT_GRANTED;
  }
-#endif
  
  /****************************************************************************
   Open up the brlock.tdb database.
  ****************************************************************************/
  
-void brl_init(int read_only)
+void brl_init(bool read_only)
  {
-       if (tdb)
-               return;
-       tdb = tdb_open_log(lock_path("brlock.tdb"), 0,  TDB_DEFAULT|(read_only?0x0:TDB_CLEAR_IF_FIRST),
-                      read_only?O_RDONLY:(O_RDWR|O_CREAT), 0644);
-       if (!tdb) {
-               DEBUG(0,("Failed to open byte range locking database\n"));
+       int tdb_flags;
+
+       if (brlock_db) {
                 return;
         }
  
-#if DONT_DO_THIS
-       /* doing this traversal could kill solaris machines under high load (tridge) */
-       /* delete any dead locks */
-       if (!read_only) {
-               BOOL check_self = False;
-               tdb_traverse(tdb, delete_fn, &check_self);
+       tdb_flags = TDB_DEFAULT|TDB_VOLATILE|TDB_CLEAR_IF_FIRST|TDB_INCOMPATIBLE_HASH;
+
+       if (!lp_clustering()) {
+               /*
+                * We can't use the SEQNUM trick to cache brlock
+                * entries in the clustering case because ctdb seqnum
+                * propagation has a delay.
+                */
+               tdb_flags |= TDB_SEQNUM;
+       }
+
+       brlock_db = db_open(NULL, lock_path("brlock.tdb"),
+                           lp_open_files_db_hash_size(), tdb_flags,
+                           read_only?O_RDONLY:(O_RDWR|O_CREAT), 0644,
+                           DBWRAP_LOCK_ORDER_2);
+       if (!brlock_db) {
+               DEBUG(0,("Failed to open byte range locking database %s\n",
+                       lock_path("brlock.tdb")));
+               return;
         }
-#endif
  }
  
  /****************************************************************************
   Close down the brlock.tdb database.
  ****************************************************************************/
  
-void brl_shutdown(int read_only)
+void brl_shutdown(void)
  {
-       if (!tdb)
-               return;
-
-#if DONT_DO_THIS
-       /* doing this traversal could kill solaris machines under high load (tridge) */
-       /* delete any dead locks */
-       if (!read_only) {
-               BOOL check_self = True;
-               tdb_traverse(tdb, delete_fn, &check_self);
-       }
-#endif
-
-       tdb_close(tdb);
+       TALLOC_FREE(brlock_db);
  }
  
  #if ZERO_ZERO
  /****************************************************************************
-compare two locks for sorting
+ Compare two locks for sorting.
  ****************************************************************************/
-static int lock_compare(struct lock_struct *lck1, 
-                        struct lock_struct *lck2)
+
+static int lock_compare(const struct lock_struct *lck1, 
+                        const struct lock_struct *lck2)
  {
-       if (lck1->start != lck2->start) return (lck1->start - lck2->start);
+       if (lck1->start != lck2->start) {
+               return (lck1->start - lck2->start);
+       }
         if (lck2->size != lck1->size) {
                 return ((int)lck1->size - (int)lck2->size);
         }
@@ -299,402 +327,1447 @@ static int lock_compare(struct lock_struct *lck1,
  #endif
  
  /****************************************************************************
- Lock a range of bytes.
+ Lock a range of bytes - Windows lock semantics.
  ****************************************************************************/
  
-NTSTATUS brl_lock(SMB_DEV_T dev, SMB_INO_T ino, int fnum,
-                 uint16 smbpid, pid_t pid, uint16 tid,
-                 br_off start, br_off size, 
-                 enum brl_type lock_type)
+NTSTATUS brl_lock_windows_default(struct byte_range_lock *br_lck,
+    struct lock_struct *plock, bool blocking_lock)
  {
-       TDB_DATA kbuf, dbuf;
-       int count, i;
-       struct lock_struct lock, *locks;
-       char *tp;
-       NTSTATUS status = NT_STATUS_OK;
-       static int last_failed = -1;
-       static br_off last_failed_start;
-
-       kbuf = locking_key(dev,ino);
+       unsigned int i;
+       files_struct *fsp = br_lck->fsp;
+       struct lock_struct *locks = br_lck->lock_data;
+       NTSTATUS status;
  
-       dbuf.dptr = NULL;
+       SMB_ASSERT(plock->lock_type != UNLOCK_LOCK);
  
-#if !ZERO_ZERO
-       if (start == 0 && size == 0) {
-               DEBUG(0,("client sent 0/0 lock - please report this\n"));
+       if ((plock->start + plock->size - 1 < plock->start) &&
+                       plock->size != 0) {
+               return NT_STATUS_INVALID_LOCK_RANGE;
         }
-#endif
  
-       tdb_chainlock(tdb, kbuf);
-       dbuf = tdb_fetch(tdb, kbuf);
+       for (i=0; i < br_lck->num_locks; i++) {
+               /* Do any Windows or POSIX locks conflict ? */
+               if (brl_conflict(&locks[i], plock)) {
+                       /* Remember who blocked us. */
+                       plock->context.smblctx = locks[i].context.smblctx;
+                       return brl_lock_failed(fsp,plock,blocking_lock);
+               }
+#if ZERO_ZERO
+               if (plock->start == 0 && plock->size == 0 && 
+                               locks[i].size == 0) {
+                       break;
+               }
+#endif
+       }
  
-       lock.context.smbpid = smbpid;
-       lock.context.pid = pid;
-       lock.context.tid = tid;
-       lock.start = start;
-       lock.size = size;
-       lock.fnum = fnum;
-       lock.lock_type = lock_type;
+       if (!IS_PENDING_LOCK(plock->lock_type)) {
+               contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WINDOWS_BRL);
+       }
  
-       if (dbuf.dptr) {
-               /* there are existing locks - make sure they don't conflict */
-               locks = (struct lock_struct *)dbuf.dptr;
-               count = dbuf.dsize / sizeof(*locks);
-               for (i=0; i<count; i++) {
-                       if (brl_conflict(&locks[i], &lock)) {
-                               status = NT_STATUS_LOCK_NOT_GRANTED;
+       /* We can get the Windows lock, now see if it needs to
+          be mapped into a lower level POSIX one, and if so can
+          we get it ? */
+
+       if (!IS_PENDING_LOCK(plock->lock_type) && lp_posix_locking(fsp->conn->params)) {
+               int errno_ret;
+               if (!set_posix_lock_windows_flavour(fsp,
+                               plock->start,
+                               plock->size,
+                               plock->lock_type,
+                               &plock->context,
+                               locks,
+                               br_lck->num_locks,
+                               &errno_ret)) {
+
+                       /* We don't know who blocked us. */
+                       plock->context.smblctx = 0xFFFFFFFFFFFFFFFFLL;
+
+                       if (errno_ret == EACCES || errno_ret == EAGAIN) {
+                               status = NT_STATUS_FILE_LOCK_CONFLICT;
+                               goto fail;
+                       } else {
+                               status = map_nt_error_from_unix(errno);
                                 goto fail;
                         }
-#if ZERO_ZERO
-                       if (lock.start == 0 && lock.size == 0 && 
-                           locks[i].size == 0) {
-                               break;
-                       }
-#endif
                 }
         }
  
         /* no conflicts - add it to the list of locks */
-       tp = Realloc(dbuf.dptr, dbuf.dsize + sizeof(*locks));
-       if (!tp) {
+       locks = (struct lock_struct *)SMB_REALLOC(locks, (br_lck->num_locks + 1) * sizeof(*locks));
+       if (!locks) {
                 status = NT_STATUS_NO_MEMORY;
                 goto fail;
-       } else {
-               dbuf.dptr = tp;
         }
-       memcpy(dbuf.dptr + dbuf.dsize, &lock, sizeof(lock));
-       dbuf.dsize += sizeof(lock);
-
-#if ZERO_ZERO
-       /* sort the lock list */
-       qsort(dbuf.dptr, dbuf.dsize/sizeof(lock), sizeof(lock), lock_compare);
-#endif
  
-       tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
+       memcpy(&locks[br_lck->num_locks], plock, sizeof(struct lock_struct));
+       br_lck->num_locks += 1;
+       br_lck->lock_data = locks;
+       br_lck->modified = True;
  
-       SAFE_FREE(dbuf.dptr);
-       tdb_chainunlock(tdb, kbuf);
         return NT_STATUS_OK;
-
   fail:
-       /* this is a nasty hack to try to simulate the lock result cache code in w2k.
-          It isn't completely accurate as I haven't yet worked out the correct
-          semantics (tridge)
-       */
-       if (last_failed == fnum &&
-           last_failed_start == start &&
-           NT_STATUS_EQUAL(status, NT_STATUS_LOCK_NOT_GRANTED)) {
-               status = NT_STATUS_FILE_LOCK_CONFLICT;
-       }
-       last_failed = fnum;
-       last_failed_start = start;
-
-       SAFE_FREE(dbuf.dptr);
-       tdb_chainunlock(tdb, kbuf);
+       if (!IS_PENDING_LOCK(plock->lock_type)) {
+               contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WINDOWS_BRL);
+       }
         return status;
  }
  
  /****************************************************************************
- Check if an unlock overlaps a pending lock.
+ Cope with POSIX range splits and merges.
  ****************************************************************************/
  
-static BOOL brl_pending_overlap(struct lock_struct *lock, struct lock_struct *pend_lock)
+static unsigned int brlock_posix_split_merge(struct lock_struct *lck_arr,      /* Output array. */
+                                               struct lock_struct *ex,         /* existing lock. */
+                                               struct lock_struct *plock)      /* proposed lock. */
  {
-       if ((lock->start <= pend_lock->start) && (lock->start + lock->size > pend_lock->start))
-               return True;
-       if ((lock->start >= pend_lock->start) && (lock->start <= pend_lock->start + pend_lock->size))
-               return True;
-       return False;
+       bool lock_types_differ = (ex->lock_type != plock->lock_type);
+
+       /* We can't merge non-conflicting locks on different context - ignore fnum. */
+
+       if (!brl_same_context(&ex->context, &plock->context)) {
+               /* Just copy. */
+               memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
+               return 1;
+       }
+
+       /* We now know we have the same context. */
+
+       /* Did we overlap ? */
+
+/*********************************************
+                                        +---------+
+                                        | ex      |
+                                        +---------+
+                         +-------+
+                         | plock |
+                         +-------+
+OR....
+        +---------+
+        |  ex     |
+        +---------+
+**********************************************/
+
+       if ( (ex->start > (plock->start + plock->size)) ||
+               (plock->start > (ex->start + ex->size))) {
+
+               /* No overlap with this lock - copy existing. */
+
+               memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
+               return 1;
+       }
+
+/*********************************************
+        +---------------------------+
+        |          ex               |
+        +---------------------------+
+        +---------------------------+
+        |       plock               | -> replace with plock.
+        +---------------------------+
+OR
+             +---------------+
+             |       ex      |
+             +---------------+
+        +---------------------------+
+        |       plock               | -> replace with plock.
+        +---------------------------+
+
+**********************************************/
+
+       if ( (ex->start >= plock->start) &&
+               (ex->start + ex->size <= plock->start + plock->size) ) {
+
+               /* Replace - discard existing lock. */
+
+               return 0;
+       }
+
+/*********************************************
+Adjacent after.
+                        +-------+
+                        |  ex   |
+                        +-------+
+        +---------------+
+        |   plock       |
+        +---------------+
+
+BECOMES....
+        +---------------+-------+
+        |   plock       | ex    | - different lock types.
+        +---------------+-------+
+OR.... (merge)
+        +-----------------------+
+        |   plock               | - same lock type.
+        +-----------------------+
+**********************************************/
+
+       if (plock->start + plock->size == ex->start) {
+
+               /* If the lock types are the same, we merge, if different, we
+                  add the remainder of the old lock. */
+
+               if (lock_types_differ) {
+                       /* Add existing. */
+                       memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
+                       return 1;
+               } else {
+                       /* Merge - adjust incoming lock as we may have more
+                        * merging to come. */
+                       plock->size += ex->size;
+                       return 0;
+               }
+       }
+
+/*********************************************
+Adjacent before.
+        +-------+
+        |  ex   |
+        +-------+
+                +---------------+
+                |   plock       |
+                +---------------+
+BECOMES....
+        +-------+---------------+
+        | ex    |   plock       | - different lock types
+        +-------+---------------+
+
+OR.... (merge)
+        +-----------------------+
+        |      plock            | - same lock type.
+        +-----------------------+
+
+**********************************************/
+
+       if (ex->start + ex->size == plock->start) {
+
+               /* If the lock types are the same, we merge, if different, we
+                  add the existing lock. */
+
+               if (lock_types_differ) {
+                       memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
+                       return 1;
+               } else {
+                       /* Merge - adjust incoming lock as we may have more
+                        * merging to come. */
+                       plock->start = ex->start;
+                       plock->size += ex->size;
+                       return 0;
+               }
+       }
+
+/*********************************************
+Overlap after.
+        +-----------------------+
+        |          ex           |
+        +-----------------------+
+        +---------------+
+        |   plock       |
+        +---------------+
+OR
+               +----------------+
+               |       ex       |
+               +----------------+
+        +---------------+
+        |   plock       |
+        +---------------+
+
+BECOMES....
+        +---------------+-------+
+        |   plock       | ex    | - different lock types.
+        +---------------+-------+
+OR.... (merge)
+        +-----------------------+
+        |   plock               | - same lock type.
+        +-----------------------+
+**********************************************/
+
+       if ( (ex->start >= plock->start) &&
+               (ex->start <= plock->start + plock->size) &&
+               (ex->start + ex->size > plock->start + plock->size) ) {
+
+               /* If the lock types are the same, we merge, if different, we
+                  add the remainder of the old lock. */
+
+               if (lock_types_differ) {
+                       /* Add remaining existing. */
+                       memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
+                       /* Adjust existing start and size. */
+                       lck_arr[0].start = plock->start + plock->size;
+                       lck_arr[0].size = (ex->start + ex->size) - (plock->start + plock->size);
+                       return 1;
+               } else {
+                       /* Merge - adjust incoming lock as we may have more
+                        * merging to come. */
+                       plock->size += (ex->start + ex->size) - (plock->start + plock->size);
+                       return 0;
+               }
+       }
+
+/*********************************************
+Overlap before.
+        +-----------------------+
+        |  ex                   |
+        +-----------------------+
+                +---------------+
+                |   plock       |
+                +---------------+
+OR
+        +-------------+
+        |  ex         |
+        +-------------+
+                +---------------+
+                |   plock       |
+                +---------------+
+
+BECOMES....
+        +-------+---------------+
+        | ex    |   plock       | - different lock types
+        +-------+---------------+
+
+OR.... (merge)
+        +-----------------------+
+        |      plock            | - same lock type.
+        +-----------------------+
+
+**********************************************/
+
+       if ( (ex->start < plock->start) &&
+                       (ex->start + ex->size >= plock->start) &&
+                       (ex->start + ex->size <= plock->start + plock->size) ) {
+
+               /* If the lock types are the same, we merge, if different, we
+                  add the truncated old lock. */
+
+               if (lock_types_differ) {
+                       memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
+                       /* Adjust existing size. */
+                       lck_arr[0].size = plock->start - ex->start;
+                       return 1;
+               } else {
+                       /* Merge - adjust incoming lock as we may have more
+                        * merging to come. MUST ADJUST plock SIZE FIRST ! */
+                       plock->size += (plock->start - ex->start);
+                       plock->start = ex->start;
+                       return 0;
+               }
+       }
+
+/*********************************************
+Complete overlap.
+        +---------------------------+
+        |        ex                 |
+        +---------------------------+
+                +---------+
+                |  plock  |
+                +---------+
+BECOMES.....
+        +-------+---------+---------+
+        | ex    |  plock  | ex      | - different lock types.
+        +-------+---------+---------+
+OR
+        +---------------------------+
+        |        plock              | - same lock type.
+        +---------------------------+
+**********************************************/
+
+       if ( (ex->start < plock->start) && (ex->start + ex->size > plock->start + plock->size) ) {
+
+               if (lock_types_differ) {
+
+                       /* We have to split ex into two locks here. */
+
+                       memcpy(&lck_arr[0], ex, sizeof(struct lock_struct));
+                       memcpy(&lck_arr[1], ex, sizeof(struct lock_struct));
+
+                       /* Adjust first existing size. */
+                       lck_arr[0].size = plock->start - ex->start;
+
+                       /* Adjust second existing start and size. */
+                       lck_arr[1].start = plock->start + plock->size;
+                       lck_arr[1].size = (ex->start + ex->size) - (plock->start + plock->size);
+                       return 2;
+               } else {
+                       /* Just eat the existing locks, merge them into plock. */
+                       plock->start = ex->start;
+                       plock->size = ex->size;
+                       return 0;
+               }
+       }
+
+       /* Never get here. */
+       smb_panic("brlock_posix_split_merge");
+       /* Notreached. */
+
+       /* Keep some compilers happy. */
+       return 0;
  }
  
  /****************************************************************************
- Unlock a range of bytes.
+ Lock a range of bytes - POSIX lock semantics.
+ We must cope with range splits and merges.
  ****************************************************************************/
  
-BOOL brl_unlock(SMB_DEV_T dev, SMB_INO_T ino, int fnum,
-               uint16 smbpid, pid_t pid, uint16 tid,
-               br_off start, br_off size,
-               BOOL remove_pending_locks_only,
-               void (*pre_unlock_fn)(void *),
-               void *pre_unlock_data)
+static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx,
+                              struct byte_range_lock *br_lck,
+                              struct lock_struct *plock)
  {
-       TDB_DATA kbuf, dbuf;
-       int count, i, j;
-       struct lock_struct *locks;
-       struct lock_context context;
-
-       kbuf = locking_key(dev,ino);
-
-       dbuf.dptr = NULL;
-
-       tdb_chainlock(tdb, kbuf);
-       dbuf = tdb_fetch(tdb, kbuf);
+       unsigned int i, count, posix_count;
+       struct lock_struct *locks = br_lck->lock_data;
+       struct lock_struct *tp;
+       bool signal_pending_read = False;
+       bool break_oplocks = false;
+       NTSTATUS status;
+
+       /* No zero-zero locks for POSIX. */
+       if (plock->start == 0 && plock->size == 0) {
+               return NT_STATUS_INVALID_PARAMETER;
+       }
  
-       if (!dbuf.dptr) {
-               DEBUG(10,("brl_unlock: tdb_fetch failed !\n"));
-               goto fail;
+       /* Don't allow 64-bit lock wrap. */
+       if (plock->start + plock->size - 1 < plock->start) {
+               return NT_STATUS_INVALID_PARAMETER;
         }
  
-       context.smbpid = smbpid;
-       context.pid = pid;
-       context.tid = tid;
+       /* The worst case scenario here is we have to split an
+          existing POSIX lock range into two, and add our lock,
+          so we need at most 2 more entries. */
  
-       /* there are existing locks - find a match */
-       locks = (struct lock_struct *)dbuf.dptr;
-       count = dbuf.dsize / sizeof(*locks);
+       tp = SMB_MALLOC_ARRAY(struct lock_struct, (br_lck->num_locks + 2));
+       if (!tp) {
+               return NT_STATUS_NO_MEMORY;
+       }
  
-#if ZERO_ZERO
-       for (i=0; i<count; i++) {
-               struct lock_struct *lock = &locks[i];
+       count = posix_count = 0;
  
-               if (lock->lock_type == WRITE_LOCK &&
-                   brl_same_context(&lock->context, &context) &&
-                   lock->fnum == fnum &&
-                   lock->start == start &&
-                   lock->size == size) {
+       for (i=0; i < br_lck->num_locks; i++) {
+               struct lock_struct *curr_lock = &locks[i];
  
-                       if (pre_unlock_fn)
-                               (*pre_unlock_fn)(pre_unlock_data);
+               /* If we have a pending read lock, a lock downgrade should
+                  trigger a lock re-evaluation. */
+               if (curr_lock->lock_type == PENDING_READ_LOCK &&
+                               brl_pending_overlap(plock, curr_lock)) {
+                       signal_pending_read = True;
+               }
  
-                       /* found it - delete it */
-                       if (count == 1) {
-                               tdb_delete(tdb, kbuf);
-                       } else {
-                               if (i < count-1) {
-                                       memmove(&locks[i], &locks[i+1], 
-                                               sizeof(*locks)*((count-1) - i));
-                               }
-                               dbuf.dsize -= sizeof(*locks);
-                               tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
+               if (curr_lock->lock_flav == WINDOWS_LOCK) {
+                       /* Do any Windows flavour locks conflict ? */
+                       if (brl_conflict(curr_lock, plock)) {
+                               /* No games with error messages. */
+                               SAFE_FREE(tp);
+                               /* Remember who blocked us. */
+                               plock->context.smblctx = curr_lock->context.smblctx;
+                               return NT_STATUS_FILE_LOCK_CONFLICT;
+                       }
+                       /* Just copy the Windows lock into the new array. */
+                       memcpy(&tp[count], curr_lock, sizeof(struct lock_struct));
+                       count++;
+               } else {
+                       unsigned int tmp_count = 0;
+
+                       /* POSIX conflict semantics are different. */
+                       if (brl_conflict_posix(curr_lock, plock)) {
+                               /* Can't block ourselves with POSIX locks. */
+                               /* No games with error messages. */
+                               SAFE_FREE(tp);
+                               /* Remember who blocked us. */
+                               plock->context.smblctx = curr_lock->context.smblctx;
+                               return NT_STATUS_FILE_LOCK_CONFLICT;
                         }
  
-                       SAFE_FREE(dbuf.dptr);
-                       tdb_chainunlock(tdb, kbuf);
-                       return True;
+                       /* Work out overlaps. */
+                       tmp_count += brlock_posix_split_merge(&tp[count], curr_lock, plock);
+                       posix_count += tmp_count;
+                       count += tmp_count;
                 }
         }
-#endif
  
-       locks = (struct lock_struct *)dbuf.dptr;
-       count = dbuf.dsize / sizeof(*locks);
-       for (i=0; i<count; i++) {
-               struct lock_struct *lock = &locks[i];
+       /*
+        * Break oplocks while we hold a brl. Since lock() and unlock() calls
+        * are not symetric with POSIX semantics, we cannot guarantee our
+        * contend_level2_oplocks_begin/end calls will be acquired and
+        * released one-for-one as with Windows semantics. Therefore we only
+        * call contend_level2_oplocks_begin if this is the first POSIX brl on
+        * the file.
+        */
+       break_oplocks = (!IS_PENDING_LOCK(plock->lock_type) &&
+                        posix_count == 0);
+       if (break_oplocks) {
+               contend_level2_oplocks_begin(br_lck->fsp,
+                                            LEVEL2_CONTEND_POSIX_BRL);
+       }
  
-               if (brl_same_context(&lock->context, &context) &&
-                               lock->fnum == fnum &&
-                               lock->start == start &&
-                               lock->size == size) {
+       /* Try and add the lock in order, sorted by lock start. */
+       for (i=0; i < count; i++) {
+               struct lock_struct *curr_lock = &tp[i];
  
-                       if (remove_pending_locks_only && lock->lock_type != PENDING_LOCK)
-                               continue;
+               if (curr_lock->start <= plock->start) {
+                       continue;
+               }
+       }
  
-                       if (lock->lock_type != PENDING_LOCK) {
+       if (i < count) {
+               memmove(&tp[i+1], &tp[i],
+                       (count - i)*sizeof(struct lock_struct));
+       }
+       memcpy(&tp[i], plock, sizeof(struct lock_struct));
+       count++;
  
-                               /* Do any POSIX unlocks needed. */
-                               if (pre_unlock_fn)
-                                       (*pre_unlock_fn)(pre_unlock_data);
+       /* We can get the POSIX lock, now see if it needs to
+          be mapped into a lower level POSIX one, and if so can
+          we get it ? */
  
-                               /* Send unlock messages to any pending waiters that overlap. */
-                               for (j=0; j<count; j++) {
-                                       struct lock_struct *pend_lock = &locks[j];
+       if (!IS_PENDING_LOCK(plock->lock_type) && lp_posix_locking(br_lck->fsp->conn->params)) {
+               int errno_ret;
  
-                                       /* Ignore non-pending locks. */
-                                       if (pend_lock->lock_type != PENDING_LOCK)
-                                               continue;
+               /* The lower layer just needs to attempt to
+                  get the system POSIX lock. We've weeded out
+                  any conflicts above. */
  
-                                       /* We could send specific lock info here... */
-                                       if (brl_pending_overlap(lock, pend_lock)) {
-                                               DEBUG(10,("brl_unlock: sending unlock message to pid %u\n",
-                                                                       (unsigned int)pend_lock->context.pid ));
+               if (!set_posix_lock_posix_flavour(br_lck->fsp,
+                               plock->start,
+                               plock->size,
+                               plock->lock_type,
+                               &errno_ret)) {
  
-                                               message_send_pid(pend_lock->context.pid,
-                                                               MSG_SMB_UNLOCK,
-                                                               NULL, 0, True);
-                                       }
-                               }
-                       }
+                       /* We don't know who blocked us. */
+                       plock->context.smblctx = 0xFFFFFFFFFFFFFFFFLL;
  
-                       /* found it - delete it */
-                       if (count == 1) {
-                               tdb_delete(tdb, kbuf);
+                       if (errno_ret == EACCES || errno_ret == EAGAIN) {
+                               SAFE_FREE(tp);
+                               status = NT_STATUS_FILE_LOCK_CONFLICT;
+                               goto fail;
                         } else {
-                               if (i < count-1) {
-                                       memmove(&locks[i], &locks[i+1], 
-                                               sizeof(*locks)*((count-1) - i));
-                               }
-                               dbuf.dsize -= sizeof(*locks);
-                               tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
+                               SAFE_FREE(tp);
+                               status = map_nt_error_from_unix(errno);
+                               goto fail;
                         }
+               }
+       }
  
-                       SAFE_FREE(dbuf.dptr);
-                       tdb_chainunlock(tdb, kbuf);
-                       return True;
+       /* If we didn't use all the allocated size,
+        * Realloc so we don't leak entries per lock call. */
+       if (count < br_lck->num_locks + 2) {
+               tp = (struct lock_struct *)SMB_REALLOC(tp, count * sizeof(*locks));
+               if (!tp) {
+                       status = NT_STATUS_NO_MEMORY;
+                       goto fail;
                 }
         }
  
-       /* we didn't find it */
+       br_lck->num_locks = count;
+       SAFE_FREE(br_lck->lock_data);
+       br_lck->lock_data = tp;
+       locks = tp;
+       br_lck->modified = True;
  
+       /* A successful downgrade from write to read lock can trigger a lock
+          re-evalutation where waiting readers can now proceed. */
+
+       if (signal_pending_read) {
+               /* Send unlock messages to any pending read waiters that overlap. */
+               for (i=0; i < br_lck->num_locks; i++) {
+                       struct lock_struct *pend_lock = &locks[i];
+
+                       /* Ignore non-pending locks. */
+                       if (!IS_PENDING_LOCK(pend_lock->lock_type)) {
+                               continue;
+                       }
+
+                       if (pend_lock->lock_type == PENDING_READ_LOCK &&
+                                       brl_pending_overlap(plock, pend_lock)) {
+                               DEBUG(10,("brl_lock_posix: sending unlock message to pid %s\n",
+                                       procid_str_static(&pend_lock->context.pid )));
+
+                               messaging_send(msg_ctx, pend_lock->context.pid,
+                                              MSG_SMB_UNLOCK, &data_blob_null);
+                       }
+               }
+       }
+
+       return NT_STATUS_OK;
   fail:
-       SAFE_FREE(dbuf.dptr);
-       tdb_chainunlock(tdb, kbuf);
-       return False;
+       if (break_oplocks) {
+               contend_level2_oplocks_end(br_lck->fsp,
+                                          LEVEL2_CONTEND_POSIX_BRL);
+       }
+       return status;
  }
  
+/* VFS dispatch for the Windows flavour byte range lock operation:
+ * forwards all arguments to the brl_lock_windows_fn of the handle selected
+ * by VFS_FIND and returns its status unchanged.
+ * NOTE(review): VFS_FIND is a macro defined elsewhere; presumably it
+ * advances `handle` to the module implementing the op - confirm. */
+NTSTATUS smb_vfs_call_brl_lock_windows(struct vfs_handle_struct *handle,
+                                      struct byte_range_lock *br_lck,
+                                      struct lock_struct *plock,
+                                      bool blocking_lock,
+                                      struct blocking_lock_record *blr)
+{
+       NTSTATUS status;
+
+       VFS_FIND(brl_lock_windows);
+       status = handle->fns->brl_lock_windows_fn(handle,
+                                                 br_lck,
+                                                 plock,
+                                                 blocking_lock,
+                                                 blr);
+       return status;
+}
  
  /****************************************************************************
- Test if we could add a lock if we wanted to.
+ Lock a range of bytes.
+
+ Fills in a lock_struct from the arguments (tid and fnum are taken from
+ br_lck->fsp) and dispatches on lock_flav: WINDOWS_LOCK goes through
+ SMB_VFS_BRL_LOCK_WINDOWS, anything else goes to brl_lock_posix().
+ blocking_lock and blr are only passed on in the Windows case.
+
+ Returns the status of the flavour-specific call. If that status is
+ not OK and psmblctx is non-NULL, *psmblctx receives the smblctx left
+ in the lock_struct by the callee.
  ****************************************************************************/
  
-BOOL brl_locktest(SMB_DEV_T dev, SMB_INO_T ino, int fnum,
-                 uint16 smbpid, pid_t pid, uint16 tid,
-                 br_off start, br_off size, 
-                 enum brl_type lock_type, int check_self)
+NTSTATUS brl_lock(struct messaging_context *msg_ctx,
+               struct byte_range_lock *br_lck,
+               uint64_t smblctx,
+               struct server_id pid,
+               br_off start,
+               br_off size, 
+               enum brl_type lock_type,
+               enum brl_flavour lock_flav,
+               bool blocking_lock,
+               uint64_t *psmblctx,
+               struct blocking_lock_record *blr)
  {
-       TDB_DATA kbuf, dbuf;
-       int count, i;
-       struct lock_struct lock, *locks;
+       NTSTATUS ret;
+       struct lock_struct lock;
  
-       kbuf = locking_key(dev,ino);
-
-       dbuf.dptr = NULL;
+#if !ZERO_ZERO
+       if (start == 0 && size == 0) {
+               DEBUG(0,("client sent 0/0 lock - please report this\n"));
+       }
+#endif
  
-       tdb_chainlock(tdb, kbuf);
-       dbuf = tdb_fetch(tdb, kbuf);
+#ifdef DEVELOPER
+       /* Quieten valgrind on test. */
+       memset(&lock, '\0', sizeof(lock));
+#endif
  
-       lock.context.smbpid = smbpid;
+       lock.context.smblctx = smblctx;
         lock.context.pid = pid;
-       lock.context.tid = tid;
+       lock.context.tid = br_lck->fsp->conn->cnum;
         lock.start = start;
         lock.size = size;
-       lock.fnum = fnum;
+       lock.fnum = br_lck->fsp->fnum;
         lock.lock_type = lock_type;
+       lock.lock_flav = lock_flav;
  
-       if (dbuf.dptr) {
-               /* there are existing locks - make sure they don't conflict */
-               locks = (struct lock_struct *)dbuf.dptr;
-               count = dbuf.dsize / sizeof(*locks);
-               for (i=0; i<count; i++) {
-                       if (check_self) {
-                               if (brl_conflict(&locks[i], &lock))
-                                       goto fail;
-                       } else {
-                               /*
-                                * Our own locks don't conflict.
-                                */
-                               if (brl_conflict_other(&locks[i], &lock))
-                                       goto fail;
-                       }
-               }
+       if (lock_flav == WINDOWS_LOCK) {
+               ret = SMB_VFS_BRL_LOCK_WINDOWS(br_lck->fsp->conn, br_lck,
+                   &lock, blocking_lock, blr);
+       } else {
+               ret = brl_lock_posix(msg_ctx, br_lck, &lock);
         }
  
-       /* no conflicts - we could have added it */
-       SAFE_FREE(dbuf.dptr);
-       tdb_chainunlock(tdb, kbuf);
-       return True;
+#if ZERO_ZERO
+       /* sort the lock list */
+       TYPESAFE_QSORT(br_lck->lock_data, (size_t)br_lck->num_locks, lock_compare);
+#endif
  
- fail:
-       SAFE_FREE(dbuf.dptr);
-       tdb_chainunlock(tdb, kbuf);
-       return False;
+       /* If we're returning an error, return who blocked us.
+          brl_lock_posix() stores the blocker in lock.context.smblctx on a
+          conflict; NOTE(review): presumably the Windows path does the
+          same - confirm. */
+       if (!NT_STATUS_IS_OK(ret) && psmblctx) {
+               *psmblctx = lock.context.smblctx;
+       }
+       return ret;
  }
  
  /****************************************************************************
- Remove any locks associated with a open file.
+ Unlock a range of bytes - Windows semantics.
+
+ Removes the first non-pending WINDOWS_LOCK entry whose context, fnum,
+ start and size all match plock (with ZERO_ZERO set, a matching
+ WRITE_LOCK is looked for first), releases the underlying POSIX region
+ when lp_posix_locking() is enabled, and sends MSG_SMB_UNLOCK to every
+ pending locker whose range overlaps plock.
+
+ plock->lock_type must be UNLOCK_LOCK (asserted).
+ Returns False if no matching entry exists, True otherwise.
  ****************************************************************************/
  
-void brl_close(SMB_DEV_T dev, SMB_INO_T ino, pid_t pid, int tid, int fnum)
+bool brl_unlock_windows_default(struct messaging_context *msg_ctx,
+                              struct byte_range_lock *br_lck,
+                              const struct lock_struct *plock)
  {
-       TDB_DATA kbuf, dbuf;
-       int count, i, j, dcount=0;
-       struct lock_struct *locks;
+       unsigned int i, j;
+       struct lock_struct *locks = br_lck->lock_data;
+       enum brl_type deleted_lock_type = READ_LOCK; /* shut the compiler up.... */
  
-       kbuf = locking_key(dev,ino);
+       SMB_ASSERT(plock->lock_type == UNLOCK_LOCK);
  
-       dbuf.dptr = NULL;
+#if ZERO_ZERO
+       /* Delete write locks by preference... The lock list
+          is sorted in the zero zero case. */
  
-       tdb_chainlock(tdb, kbuf);
-       dbuf = tdb_fetch(tdb, kbuf);
+       for (i = 0; i < br_lck->num_locks; i++) {
+               struct lock_struct *lock = &locks[i];
  
-       if (!dbuf.dptr) goto fail;
+               if (lock->lock_type == WRITE_LOCK &&
+                   brl_same_context(&lock->context, &plock->context) &&
+                   lock->fnum == plock->fnum &&
+                   lock->lock_flav == WINDOWS_LOCK &&
+                   lock->start == plock->start &&
+                   lock->size == plock->size) {
  
-       /* there are existing locks - remove any for this fnum */
-       locks = (struct lock_struct *)dbuf.dptr;
-       count = dbuf.dsize / sizeof(*locks);
+                       /* found it - delete it */
+                       deleted_lock_type = lock->lock_type;
+                       break;
+               }
+       }
  
-       for (i=0; i<count; i++) {
-               struct lock_struct *lock = &locks[i];
+       if (i != br_lck->num_locks) {
+               /* We found it - don't search again. */
+               goto unlock_continue;
+       }
+#endif
  
-               if (lock->context.tid == tid &&
-                   lock->context.pid == pid &&
-                   lock->fnum == fnum) {
+       for (i = 0; i < br_lck->num_locks; i++) {
+               struct lock_struct *lock = &locks[i];
  
-                       /* Send unlock messages to any pending waiters that overlap. */
-                       for (j=0; j<count; j++) {
-                               struct lock_struct *pend_lock = &locks[j];
+               if (IS_PENDING_LOCK(lock->lock_type)) {
+                       continue;
+               }
  
-                               /* Ignore our own or non-pending locks. */
-                               if (pend_lock->lock_type != PENDING_LOCK)
-                                       continue;
+               /* Only remove our own locks that match in start, size, and flavour.
+                  The type of the removed entry is remembered for the POSIX
+                  release call below. */
+               if (brl_same_context(&lock->context, &plock->context) &&
+                                       lock->fnum == plock->fnum &&
+                                       lock->lock_flav == WINDOWS_LOCK &&
+                                       lock->start == plock->start &&
+                                       lock->size == plock->size ) {
+                       deleted_lock_type = lock->lock_type;
+                       break;
+               }
+       }
  
-                               if (pend_lock->context.tid == tid &&
-                                   pend_lock->context.pid == pid &&
-                                   pend_lock->fnum == fnum)
-                                       continue;
+       if (i == br_lck->num_locks) {
+               /* we didn't find it */
+               return False;
+       }
  
-                               /* We could send specific lock info here... */
-                               if (brl_pending_overlap(lock, pend_lock))
-                                       message_send_pid(pend_lock->context.pid,
-                                                       MSG_SMB_UNLOCK,
-                                                       NULL, 0, True);
-                       }
+#if ZERO_ZERO
+  unlock_continue:
+#endif
  
-                       /* found it - delete it */
-                       if (count > 1 && i < count-1) {
-                               memmove(&locks[i], &locks[i+1], 
-                                       sizeof(*locks)*((count-1) - i));
-                       }
-                       count--;
-                       i--;
-                       dcount++;
-               }
+       /* Actually delete the lock.
+          Entry i is the match; close the gap by shifting the entries
+          after it down one slot (nothing to move if it was the last). */
+       if (i < br_lck->num_locks - 1) {
+               memmove(&locks[i], &locks[i+1], 
+                       sizeof(*locks)*((br_lck->num_locks-1) - i));
         }
  
-       if (count == 0) {
-               tdb_delete(tdb, kbuf);
-       } else if (count < (dbuf.dsize / sizeof(*locks))) {
-               dbuf.dsize -= dcount * sizeof(*locks);
-               tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
+       br_lck->num_locks -= 1;
+       br_lck->modified = True;
+
+       /* Unlock the underlying POSIX regions.
+          The remaining lock array (entry already removed) is passed
+          along with the type of the deleted lock. */
+       if(lp_posix_locking(br_lck->fsp->conn->params)) {
+               release_posix_lock_windows_flavour(br_lck->fsp,
+                               plock->start,
+                               plock->size,
+                               deleted_lock_type,
+                               &plock->context,
+                               locks,
+                               br_lck->num_locks);
         }
  
-       /* we didn't find it */
- fail:
-       SAFE_FREE(dbuf.dptr);
-       tdb_chainunlock(tdb, kbuf);
-}
+       /* Send unlock messages to any pending waiters that overlap. */
+       for (j=0; j < br_lck->num_locks; j++) {
+               struct lock_struct *pend_lock = &locks[j];
  
-/****************************************************************************
+               /* Ignore non-pending locks. */
+               if (!IS_PENDING_LOCK(pend_lock->lock_type)) {
+                       continue;
+               }
+
+               /* We could send specific lock info here... */
+               if (brl_pending_overlap(plock, pend_lock)) {
+                       DEBUG(10,("brl_unlock: sending unlock message to pid %s\n",
+                               procid_str_static(&pend_lock->context.pid )));
+
+                       messaging_send(msg_ctx, pend_lock->context.pid,
+                                      MSG_SMB_UNLOCK, &data_blob_null);
+               }
+       }
+
+       contend_level2_oplocks_end(br_lck->fsp, LEVEL2_CONTEND_WINDOWS_BRL);
+       return True;
+}
+
+/****************************************************************************
+ Unlock a range of bytes - POSIX semantics.
+
+ Rebuilds the lock array with the plock range applied against this
+ context's non-pending POSIX locks via brlock_posix_split_merge(); an
+ existing lock may be dropped, resized or split in two. Pending locks
+ and locks of other contexts are copied unchanged.
+
+ Returns True when the array was updated or when nothing overlapped.
+ Returns False for a zero/zero or wrapping range, on allocation
+ failure, or when brl_conflict() reports a conflict with a Windows
+ flavour lock of the same context.
+****************************************************************************/
+
+static bool brl_unlock_posix(struct messaging_context *msg_ctx,
+                            struct byte_range_lock *br_lck,
+                            struct lock_struct *plock)
+{
+       unsigned int i, j, count;
+       struct lock_struct *tp;
+       struct lock_struct *locks = br_lck->lock_data;
+       bool overlap_found = False;
+
+       /* No zero-zero locks for POSIX. */
+       if (plock->start == 0 && plock->size == 0) {
+               return False;
+       }
+
+       /* Don't allow 64-bit lock wrap. */
+       if (plock->start + plock->size < plock->start ||
+                       plock->start + plock->size < plock->size) {
+               DEBUG(10,("brl_unlock_posix: lock wrap\n"));
+               return False;
+       }
+
+       /* The worst case scenario here is we have to split an
+          existing POSIX lock range into two, so we need at most
+          1 more entry. */
+
+       tp = SMB_MALLOC_ARRAY(struct lock_struct, (br_lck->num_locks + 1));
+       if (!tp) {
+               DEBUG(10,("brl_unlock_posix: malloc fail\n"));
+               return False;
+       }
+
+       count = 0;
+       for (i = 0; i < br_lck->num_locks; i++) {
+               struct lock_struct *lock = &locks[i];
+               unsigned int tmp_count;
+
+               /* Only remove our own locks - ignore fnum.
+                  Pending entries and other contexts' entries are kept as is. */
+               if (IS_PENDING_LOCK(lock->lock_type) ||
+                               !brl_same_context(&lock->context, &plock->context)) {
+                       memcpy(&tp[count], lock, sizeof(struct lock_struct));
+                       count++;
+                       continue;
+               }
+
+               if (lock->lock_flav == WINDOWS_LOCK) {
+                       /* Do any Windows flavour locks conflict ? */
+                       if (brl_conflict(lock, plock)) {
+                               SAFE_FREE(tp);
+                               return false;
+                       }
+                       /* Just copy the Windows lock into the new array. */
+                       memcpy(&tp[count], lock, sizeof(struct lock_struct));
+                       count++;
+                       continue;
+               }
+
+               /* Work out overlaps.
+                  The return value is the number of entries written at
+                  tp[count] (0, 1 or 2 are handled below). */
+               tmp_count = brlock_posix_split_merge(&tp[count], lock, plock);
+
+               if (tmp_count == 0) {
+                       /* plock overlapped the existing lock completely,
+                          or replaced it. Don't copy the existing lock. */
+                       overlap_found = true;
+               } else if (tmp_count == 1) {
+                       /* Either no overlap, (simple copy of existing lock) or
+                        * an overlap of an existing lock. */
+                       /* If the lock changed size, we had an overlap. */
+                       if (tp[count].size != lock->size) {
+                               overlap_found = true;
+                       }
+                       count += tmp_count;
+               } else if (tmp_count == 2) {
+                       /* We split a lock range in two. */
+                       overlap_found = true;
+                       count += tmp_count;
+
+                       /* Optimisation... */
+                       /* We know we're finished here as we can't overlap any
+                          more POSIX locks. Copy the rest of the lock array. */
+
+                       if (i < br_lck->num_locks - 1) {
+                               memcpy(&tp[count], &locks[i+1],
+                                       sizeof(*locks)*((br_lck->num_locks-1) - i));
+                               count += ((br_lck->num_locks-1) - i);
+                       }
+                       break;
+               }
+
+       }
+
+       if (!overlap_found) {
+               /* Just ignore - no change. */
+               SAFE_FREE(tp);
+               DEBUG(10,("brl_unlock_posix: No overlap - unlocked.\n"));
+               return True;
+       }
+
+       /* Unlock any POSIX regions.
+          The rebuilt array tp (count entries) is what gets passed down. */
+       if(lp_posix_locking(br_lck->fsp->conn->params)) {
+               release_posix_lock_posix_flavour(br_lck->fsp,
+                                               plock->start,
+                                               plock->size,
+                                               &plock->context,
+                                               tp,
+                                               count);
+       }
+
+       /* Realloc so we don't leak entries per unlock call.
+          NOTE(review): a failure here returns False after the POSIX
+          regions above were already released and without updating
+          br_lck - confirm this is intended. */
+       if (count) {
+               tp = (struct lock_struct *)SMB_REALLOC(tp, count * sizeof(*locks));
+               if (!tp) {
+                       DEBUG(10,("brl_unlock_posix: realloc fail\n"));
+                       return False;
+               }
+       } else {
+               /* We deleted the last lock. */
+               SAFE_FREE(tp);
+               tp = NULL;
+       }
+
+       contend_level2_oplocks_end(br_lck->fsp,
+                                  LEVEL2_CONTEND_POSIX_BRL);
+
+       br_lck->num_locks = count;
+       SAFE_FREE(br_lck->lock_data);
+       locks = tp;
+       br_lck->lock_data = tp;
+       br_lck->modified = True;
+
+       /* Send unlock messages to any pending waiters that overlap. */
+
+       for (j=0; j < br_lck->num_locks; j++) {
+               struct lock_struct *pend_lock = &locks[j];
+
+               /* Ignore non-pending locks. */
+               if (!IS_PENDING_LOCK(pend_lock->lock_type)) {
+                       continue;
+               }
+
+               /* We could send specific lock info here... */
+               if (brl_pending_overlap(plock, pend_lock)) {
+                       DEBUG(10,("brl_unlock: sending unlock message to pid %s\n",
+                               procid_str_static(&pend_lock->context.pid )));
+
+                       messaging_send(msg_ctx, pend_lock->context.pid,
+                                      MSG_SMB_UNLOCK, &data_blob_null);
+               }
+       }
+
+       return True;
+}
+
+/*
+ * VFS dispatch stub for the Windows-flavour byte range unlock: forwards
+ * the call to the brl_unlock_windows_fn entry of the handle's function
+ * table and hands back its result.
+ * NOTE(review): VFS_FIND is defined outside this file; presumably it
+ * repositions 'handle' on the module implementing the hook - confirm.
+ */
+bool smb_vfs_call_brl_unlock_windows(struct vfs_handle_struct *handle,
+                                    struct messaging_context *msg_ctx,
+                                    struct byte_range_lock *br_lck,
+                                    const struct lock_struct *plock)
+{
+       bool unlocked;
+
+       VFS_FIND(brl_unlock_windows);
+       unlocked = handle->fns->brl_unlock_windows_fn(handle, msg_ctx,
+                                                     br_lck, plock);
+       return unlocked;
+}
+
+/****************************************************************************
+ Unlock a range of bytes.
+****************************************************************************/
+
+/*
+ * Build an UNLOCK_LOCK request for the given range and context and hand it
+ * to the flavour-specific unlock routine: the VFS hook for Windows locks,
+ * brl_unlock_posix() for everything else. Returns that routine's result.
+ */
+bool brl_unlock(struct messaging_context *msg_ctx,
+               struct byte_range_lock *br_lck,
+               uint64_t smblctx,
+               struct server_id pid,
+               br_off start,
+               br_off size,
+               enum brl_flavour lock_flav)
+{
+       files_struct *fsp = br_lck->fsp;
+       struct lock_struct request;
+
+       /* Who is unlocking. */
+       request.context.smblctx = smblctx;
+       request.context.pid = pid;
+       request.context.tid = fsp->conn->cnum;
+       request.fnum = fsp->fnum;
+
+       /* What is being unlocked. */
+       request.start = start;
+       request.size = size;
+       request.lock_type = UNLOCK_LOCK;
+       request.lock_flav = lock_flav;
+
+       if (lock_flav != WINDOWS_LOCK) {
+               return brl_unlock_posix(msg_ctx, br_lck, &request);
+       }
+
+       return SMB_VFS_BRL_UNLOCK_WINDOWS(fsp->conn, msg_ctx,
+                                         br_lck, &request);
+}
+
+/****************************************************************************
+ Test if we could add a lock if we wanted to.
+ Returns True if the region required is currently unlocked, False if locked.
+****************************************************************************/
+
+/*
+ * Check whether the described lock could be granted right now.
+ * First the in-database entries are scanned with brl_conflict_other();
+ * if none conflicts and POSIX locking is enabled, a Windows-flavour
+ * request is additionally checked against the underlying POSIX locks.
+ * Returns True when nothing stands in the way, False otherwise.
+ */
+bool brl_locktest(struct byte_range_lock *br_lck,
+               uint64_t smblctx,
+               struct server_id pid,
+               br_off start,
+               br_off size,
+               enum brl_type lock_type,
+               enum brl_flavour lock_flav)
+{
+       files_struct *fsp = br_lck->fsp;
+       const struct lock_struct *existing = br_lck->lock_data;
+       struct lock_struct probe;
+       unsigned int idx;
+       bool posix_locked;
+
+       probe.context.smblctx = smblctx;
+       probe.context.pid = pid;
+       probe.context.tid = fsp->conn->cnum;
+       probe.start = start;
+       probe.size = size;
+       probe.fnum = fsp->fnum;
+       probe.lock_type = lock_type;
+       probe.lock_flav = lock_flav;
+
+       /* Any conflicting entry from someone else settles the question. */
+       for (idx = 0; idx < br_lck->num_locks; idx++) {
+               if (brl_conflict_other(&existing[idx], &probe)) {
+                       return False;
+               }
+       }
+
+       /*
+        * No SMB-side conflict. A lock taken by a UNIX or NFS process can
+        * only get in the way of a Windows lock, never of a POSIX one.
+        */
+       if (!lp_posix_locking(fsp->conn->params) ||
+           (lock_flav != WINDOWS_LOCK)) {
+               return True;
+       }
+
+       posix_locked = is_posix_locked(fsp, &start, &size, &lock_type,
+                                      WINDOWS_LOCK);
+
+       DEBUG(10,("brl_locktest: posix start=%.0f len=%.0f %s for %s file %s\n",
+               (double)start, (double)size,
+               posix_locked ? "locked" : "unlocked",
+               fsp_fnum_dbg(fsp), fsp_str_dbg(fsp)));
+
+       /* The answer is the inverse of is_posix_locked. */
+       return !posix_locked;
+}
+
+/****************************************************************************
+ Query for existing locks.
+****************************************************************************/
+
+/*
+ * Look for a lock that conflicts with the one described by the in/out
+ * parameters. On a conflict with a database entry the parameters are
+ * overwritten with that entry's context, range and type and
+ * NT_STATUS_LOCK_NOT_GRANTED is returned. If POSIX locking is enabled the
+ * underlying POSIX locks are consulted as well. NT_STATUS_OK means no
+ * conflict was found.
+ */
+NTSTATUS brl_lockquery(struct byte_range_lock *br_lck,
+               uint64_t *psmblctx,
+               struct server_id pid,
+               br_off *pstart,
+               br_off *psize,
+               enum brl_type *plock_type,
+               enum brl_flavour lock_flav)
+{
+       files_struct *fsp = br_lck->fsp;
+       const struct lock_struct *existing = br_lck->lock_data;
+       struct lock_struct query;
+       unsigned int idx;
+       bool posix_locked;
+
+       query.context.smblctx = *psmblctx;
+       query.context.pid = pid;
+       query.context.tid = fsp->conn->cnum;
+       query.start = *pstart;
+       query.size = *psize;
+       query.fnum = fsp->fnum;
+       query.lock_type = *plock_type;
+       query.lock_flav = lock_flav;
+
+       /* Report the first database entry that conflicts with the query. */
+       for (idx = 0; idx < br_lck->num_locks; idx++) {
+               const struct lock_struct *held = &existing[idx];
+               bool clash;
+
+               /* The conflict rule depends on the flavour of the held lock. */
+               clash = (held->lock_flav == WINDOWS_LOCK) ?
+                       brl_conflict(held, &query) :
+                       brl_conflict_posix(held, &query);
+
+               if (!clash) {
+                       continue;
+               }
+
+               *psmblctx = held->context.smblctx;
+               *pstart = held->start;
+               *psize = held->size;
+               *plock_type = held->lock_type;
+               return NT_STATUS_LOCK_NOT_GRANTED;
+       }
+
+       /*
+        * Nothing held through an SMB daemon; a UNIX or NFS process may
+        * still hold a POSIX lock on the range.
+        */
+       if (!lp_posix_locking(fsp->conn->params)) {
+               return NT_STATUS_OK;
+       }
+
+       posix_locked = is_posix_locked(fsp, pstart, psize, plock_type,
+                                      POSIX_LOCK);
+
+       DEBUG(10,("brl_lockquery: posix start=%.0f len=%.0f %s for %s file %s\n",
+               (double)*pstart, (double)*psize,
+               posix_locked ? "locked" : "unlocked",
+               fsp_fnum_dbg(fsp), fsp_str_dbg(fsp)));
+
+       if (!posix_locked) {
+               return NT_STATUS_OK;
+       }
+
+       /* There is no meaningful lock context to report - use -1. */
+       *psmblctx = 0xFFFFFFFFFFFFFFFFLL;
+       return NT_STATUS_LOCK_NOT_GRANTED;
+}
+
+
+/*
+ * VFS dispatch stub for cancelling a pending Windows-flavour lock:
+ * forwards the call to the brl_cancel_windows_fn entry of the handle's
+ * function table and hands back its result.
+ * NOTE(review): VFS_FIND is defined outside this file; presumably it
+ * repositions 'handle' on the module implementing the hook - confirm.
+ */
+bool smb_vfs_call_brl_cancel_windows(struct vfs_handle_struct *handle,
+                                    struct byte_range_lock *br_lck,
+                                    struct lock_struct *plock,
+                                    struct blocking_lock_record *blr)
+{
+       bool cancelled;
+
+       VFS_FIND(brl_cancel_windows);
+       cancelled = handle->fns->brl_cancel_windows_fn(handle, br_lck,
+                                                      plock, blr);
+       return cancelled;
+}
+
+/****************************************************************************
+ Remove a particular pending lock.
+****************************************************************************/
+/*
+ * Describe the pending lock to cancel and pass it to the flavour-specific
+ * cancel routine: the VFS hook for Windows locks, brl_lock_cancel_default()
+ * otherwise. Returns that routine's result.
+ */
+bool brl_lock_cancel(struct byte_range_lock *br_lck,
+               uint64_t smblctx,
+               struct server_id pid,
+               br_off start,
+               br_off size,
+               enum brl_flavour lock_flav,
+               struct blocking_lock_record *blr)
+{
+       files_struct *fsp = br_lck->fsp;
+       struct lock_struct target;
+
+       /* target.lock_type is deliberately left unset - it doesn't matter. */
+       target.context.smblctx = smblctx;
+       target.context.pid = pid;
+       target.context.tid = fsp->conn->cnum;
+       target.start = start;
+       target.size = size;
+       target.fnum = fsp->fnum;
+       target.lock_flav = lock_flav;
+
+       if (lock_flav != WINDOWS_LOCK) {
+               return brl_lock_cancel_default(br_lck, &target);
+       }
+
+       return SMB_VFS_BRL_CANCEL_WINDOWS(fsp->conn, br_lck, &target, blr);
+}
+
+/*
+ * Default cancel implementation: remove the first pending entry that
+ * matches plock in context, fnum, flavour, start and size.
+ * Returns True and marks br_lck as modified if an entry was removed,
+ * False if no such pending entry exists.
+ */
+bool brl_lock_cancel_default(struct byte_range_lock *br_lck,
+               struct lock_struct *plock)
+{
+       struct lock_struct *locks = br_lck->lock_data;
+       unsigned int idx;
+       unsigned int tail;
+
+       SMB_ASSERT(plock);
+
+       for (idx = 0; idx < br_lck->num_locks; idx++) {
+               struct lock_struct *cand = &locks[idx];
+
+               /* For pending locks we *always* care about the fnum. */
+               if (!brl_same_context(&cand->context, &plock->context)) {
+                       continue;
+               }
+               if (cand->fnum != plock->fnum) {
+                       continue;
+               }
+               if (!IS_PENDING_LOCK(cand->lock_type)) {
+                       continue;
+               }
+               if (cand->lock_flav != plock->lock_flav) {
+                       continue;
+               }
+               if (cand->start != plock->start) {
+                       continue;
+               }
+               if (cand->size != plock->size) {
+                       continue;
+               }
+               break;
+       }
+
+       if (idx == br_lck->num_locks) {
+               /* No matching pending lock. */
+               return False;
+       }
+
+       /* Close the gap by shifting the entries behind the match down. */
+       tail = (br_lck->num_locks - 1) - idx;
+       if (tail > 0) {
+               memmove(&locks[idx], &locks[idx + 1], sizeof(*locks) * tail);
+       }
+
+       br_lck->num_locks -= 1;
+       br_lck->modified = True;
+       return True;
+}
+
+/****************************************************************************
+ Remove all locks that this process holds on an open file.
+
+ The lock array is snapshotted first, because each brl_unlock() call may
+ rewrite or replace br_lck->lock_data while we are still iterating.
+ Only entries matching this connection (tid), this process (pid) and this
+ file handle (fnum) are unlocked.
+ The function returns nothing; the outcome is visible only through the
+ updated br_lck.
+****************************************************************************/
+
+void brl_close_fnum(struct messaging_context *msg_ctx,
+                   struct byte_range_lock *br_lck)
+{
+       files_struct *fsp = br_lck->fsp;
+       uint32_t tid = fsp->conn->cnum;
+       uint64_t fnum = fsp->fnum;
+       unsigned int i;
+       struct lock_struct *locks = br_lck->lock_data;
+       struct server_id pid = messaging_server_id(fsp->conn->sconn->msg_ctx);
+       struct lock_struct *locks_copy = NULL;
+       unsigned int num_locks_copy = br_lck->num_locks;
+
+       /* Copy the current lock array. */
+       if (num_locks_copy != 0) {
+               locks_copy = (struct lock_struct *)talloc_memdup(br_lck, locks,
+                               num_locks_copy * sizeof(struct lock_struct));
+               if (locks_copy == NULL) {
+                       smb_panic("brl_close_fnum: talloc failed");
+               }
+       }
+
+       for (i=0; i < num_locks_copy; i++) {
+               struct lock_struct *lock = &locks_copy[i];
+
+               if (lock->context.tid == tid && serverid_equal(&lock->context.pid, &pid) &&
+                               (lock->fnum == fnum)) {
+                       brl_unlock(msg_ctx,
+                               br_lck,
+                               lock->context.smblctx,
+                               pid,
+                               lock->start,
+                               lock->size,
+                               lock->lock_flav);
+               }
+       }
+
+       /*
+        * The snapshot is a talloc child of br_lck; release it now instead
+        * of keeping it allocated for the rest of br_lck's lifetime.
+        */
+       TALLOC_FREE(locks_copy);
+}
+
+/*
+ * Rewrite every byte range lock entry of a durable file handle so that it
+ * is recorded as disconnected: the owner pid is flagged via
+ * server_id_set_disconnected() and tid/fnum are set to their INVALID
+ * markers.
+ * Returns false if the handle is not durable, the lock record cannot be
+ * fetched, or any entry does not belong to exactly this handle. Returns
+ * true on success, including when the handle holds no locks.
+ */
+bool brl_mark_disconnected(struct files_struct *fsp)
+{
+       uint32_t tid = fsp->conn->cnum;
+       uint64_t smblctx = fsp->op->global->open_persistent_id;
+       uint64_t fnum = fsp->fnum;
+       struct server_id self = messaging_server_id(fsp->conn->sconn->msg_ctx);
+       struct byte_range_lock *br_lck = NULL;
+       unsigned int idx;
+
+       if (!fsp->op->global->durable) {
+               return false;
+       }
+
+       if (fsp->current_lock_count == 0) {
+               return true;
+       }
+
+       br_lck = brl_get_locks(talloc_tos(), fsp);
+       if (br_lck == NULL) {
+               return false;
+       }
+
+       for (idx = 0; idx < br_lck->num_locks; idx++) {
+               struct lock_struct *entry = &br_lck->lock_data[idx];
+               bool ours;
+
+               /*
+                * as this is a durable handle, we only expect locks
+                * of the current file handle!
+                */
+               ours = (entry->context.smblctx == smblctx) &&
+                       (entry->context.tid == tid) &&
+                       serverid_equal(&entry->context.pid, &self) &&
+                       (entry->fnum == fnum);
+
+               if (!ours) {
+                       /* 'modified' is not set by us on this path. */
+                       TALLOC_FREE(br_lck);
+                       return false;
+               }
+
+               server_id_set_disconnected(&entry->context.pid);
+               entry->context.tid = TID_FIELD_INVALID;
+               entry->fnum = FNUM_FIELD_INVALID;
+       }
+
+       br_lck->modified = true;
+       TALLOC_FREE(br_lck);
+       return true;
+}
+
+/*
+ * Counterpart of brl_mark_disconnected(): re-attach the disconnected lock
+ * entries of a durable file handle to the current process, tid and fnum.
+ * Returns false if the handle is not durable, the lock record cannot be
+ * fetched, or any entry is not a disconnected entry of this handle.
+ * On success fsp->current_lock_count is set to the number of entries.
+ */
+bool brl_reconnect_disconnected(struct files_struct *fsp)
+{
+       uint32_t tid = fsp->conn->cnum;
+       uint64_t smblctx = fsp->op->global->open_persistent_id;
+       uint64_t fnum = fsp->fnum;
+       struct server_id self = messaging_server_id(fsp->conn->sconn->msg_ctx);
+       struct byte_range_lock *br_lck = NULL;
+       unsigned int idx;
+
+       if (!fsp->op->global->durable) {
+               return false;
+       }
+
+       /*
+        * we want to validate ourself: with the flag set,
+        * brl_get_locks_internal() skips its own entry validation.
+        */
+       fsp->lockdb_clean = true;
+
+       br_lck = brl_get_locks(talloc_tos(), fsp);
+       if (br_lck == NULL) {
+               return false;
+       }
+
+       if (br_lck->num_locks == 0) {
+               TALLOC_FREE(br_lck);
+               return true;
+       }
+
+       for (idx = 0; idx < br_lck->num_locks; idx++) {
+               struct lock_struct *entry = &br_lck->lock_data[idx];
+               bool reclaimable;
+
+               /*
+                * as this is a durable handle we only expect locks
+                * of the current file handle!
+                */
+               reclaimable = (entry->context.smblctx == smblctx) &&
+                       (entry->context.tid == TID_FIELD_INVALID) &&
+                       server_id_is_disconnected(&entry->context.pid) &&
+                       (entry->fnum == FNUM_FIELD_INVALID);
+
+               if (!reclaimable) {
+                       TALLOC_FREE(br_lck);
+                       return false;
+               }
+
+               entry->context.pid = self;
+               entry->context.tid = tid;
+               entry->fnum = fnum;
+       }
+
+       fsp->current_lock_count = br_lck->num_locks;
+       br_lck->modified = true;
+       TALLOC_FREE(br_lck);
+       return true;
+}
+
+/****************************************************************************
+ Ensure this set of lock entries is valid.
+****************************************************************************/
+/*
+ * Drop every entry whose owning process no longer exists
+ * (serverid_exists() fails for its pid).
+ *
+ * Survivors are compacted to the front of the existing array in a single
+ * pass, keeping their relative order. Whether an entry survives is decided
+ * only by the serverid_exists() result, never by inspecting the entry's
+ * field values, so a live entry whose smblctx or tid happens to be 0 is
+ * kept.
+ *
+ * pnum_entries: in: number of entries in *pplocks; out: survivors.
+ * pplocks:      in/out: the malloc'ed lock array. It is freed and set to
+ *               NULL when no entry survives; otherwise the same buffer is
+ *               kept (it may then be larger than strictly needed).
+ *
+ * Always returns True; the bool return is kept for existing callers.
+ */
+static bool validate_lock_entries(unsigned int *pnum_entries, struct lock_struct **pplocks)
+{
+       unsigned int i;
+       unsigned int num_valid_entries = 0;
+       struct lock_struct *locks = *pplocks;
+
+       for (i = 0; i < *pnum_entries; i++) {
+               if (!serverid_exists(&locks[i].context.pid)) {
+                       /* This process no longer exists - skip the entry. */
+                       continue;
+               }
+
+               /* Slide the survivor down over any gap left behind. */
+               if (i != num_valid_entries) {
+                       locks[num_valid_entries] = locks[i];
+               }
+               num_valid_entries++;
+       }
+
+       if (num_valid_entries == *pnum_entries) {
+               /* Nothing was pruned. */
+               return True;
+       }
+
+       if (num_valid_entries == 0) {
+               SAFE_FREE(*pplocks);
+               *pplocks = NULL;
+       }
+
+       *pnum_entries = num_valid_entries;
+       return True;
+}
+
+struct brl_forall_cb { /* traversal state handed from brl_forall() to brl_traverse_fn() */
+       void (*fn)(struct file_id id, struct server_id pid,
+                  enum brl_type lock_type,
+                  enum brl_flavour lock_flav,
+                  br_off start, br_off size,
+                  void *private_data); /* called once per lock entry; brl_traverse_fn() skips the calls when NULL */
+       void *private_data; /* passed unchanged as the last argument of fn */
+};
+
+/****************************************************************************
   Traverse the whole database with this function, calling traverse_callback
   on each lock.
+
+ The callback is taken from the struct brl_forall_cb passed in 'state' and
+ is skipped when its fn pointer is NULL. Entries whose owning process no
+ longer exists are pruned first and the pruned record is written back (or
+ deleted when nothing is left); the results of those store/delete calls
+ are not checked. Returns 0 to continue, -1 to terminate the traversal.
  ****************************************************************************/
  
-static int traverse_fn(TDB_CONTEXT *ttdb, TDB_DATA kbuf, TDB_DATA dbuf, void *state)
+static int brl_traverse_fn(struct db_record *rec, void *state)
  {
+       struct brl_forall_cb *cb = (struct brl_forall_cb *)state;
         struct lock_struct *locks;
-       struct lock_key *key;
-       int i;
+       struct file_id *key;
+       unsigned int i;
+       unsigned int num_locks = 0;
+       unsigned int orig_num_locks = 0;
+       TDB_DATA dbkey;
+       TDB_DATA value;
+
+       dbkey = dbwrap_record_get_key(rec);
+       value = dbwrap_record_get_value(rec);
+
+       /* In a traverse function we must make a copy of
+          the record value before modifying it. The copy is
+          released with SAFE_FREE on every exit path below. */
+
+       locks = (struct lock_struct *)memdup(value.dptr, value.dsize);
+       if (!locks) {
+               return -1; /* Terminate traversal. */
+       }
  
-       BRLOCK_FN(traverse_callback) = (BRLOCK_FN_CAST())state;
+       key = (struct file_id *)dbkey.dptr;
+       orig_num_locks = num_locks = value.dsize/sizeof(*locks);
  
-       locks = (struct lock_struct *)dbuf.dptr;
-       key = (struct lock_key *)kbuf.dptr;
+       /* Ensure the lock db is clean of entries from invalid processes.
+          validate_lock_entries may replace 'locks' and lower 'num_locks'. */
  
-       for (i=0;i<dbuf.dsize/sizeof(*locks);i++) {
-               traverse_callback(key->device, key->inode,
-                                 locks[i].context.pid,
-                                 locks[i].lock_type,
-                                 locks[i].start,
-                                 locks[i].size);
+       if (!validate_lock_entries(&num_locks, &locks)) {
+               SAFE_FREE(locks);
+               return -1; /* Terminate traversal */
         }
+
+       if (orig_num_locks != num_locks) {
+               if (num_locks) {
+                       TDB_DATA data;
+                       data.dptr = (uint8_t *)locks;
+                       data.dsize = num_locks*sizeof(struct lock_struct);
+                       dbwrap_record_store(rec, data, TDB_REPLACE);
+               } else {
+                       dbwrap_record_delete(rec);
+               }
+       }
+
+       if (cb->fn) {
+               for ( i=0; i<num_locks; i++) {
+                       cb->fn(*key,
+                               locks[i].context.pid,
+                               locks[i].lock_type,
+                               locks[i].lock_flav,
+                               locks[i].start,
+                               locks[i].size,
+                               cb->private_data);
+               }
+       }
+
+       SAFE_FREE(locks);
         return 0;
  }
  
@@ -702,8 +1775,327 @@ static int traverse_fn(TDB_CONTEXT *ttdb, TDB_DATA kbuf, TDB_DATA dbuf, void *st
   Call the specified function on each lock in the database.
  ********************************************************************/
  
-int brl_forall(BRLOCK_FN(fn))
+/*
+ * Run fn over every lock entry in the byte range lock database by
+ * traversing it with brl_traverse_fn().
+ * Returns 0 when the database is not open, -1 when dbwrap_traverse()
+ * reports an error, otherwise the count that dbwrap_traverse() filled in.
+ */
+int brl_forall(void (*fn)(struct file_id id, struct server_id pid,
+                         enum brl_type lock_type,
+                         enum brl_flavour lock_flav,
+                         br_off start, br_off size,
+                         void *private_data),
+              void *private_data)
+{
+       struct brl_forall_cb cb;
+       NTSTATUS status;
+       int count = 0;
+
+       if (brlock_db == NULL) {
+               return 0;
+       }
+
+       cb.fn = fn;
+       cb.private_data = private_data;
+
+       status = dbwrap_traverse(brlock_db, brl_traverse_fn, &cb, &count);
+
+       return NT_STATUS_IS_OK(status) ? count : -1;
+}
+
+/*******************************************************************
+ Store a potentially modified set of byte range lock data back into
+ the database.
+ Unlock the record.
+********************************************************************/
+
+/*
+ * Write pending changes of br_lck back to its database record - deleting
+ * the record when no locks remain - then release the record handle and
+ * leave the object in the read-only, unmodified state.
+ * A failing store or delete is fatal (smb_panic).
+ */
+static void byte_range_lock_flush(struct byte_range_lock *br_lck)
+{
+       /* A read-only object must never carry pending changes. */
+       if (br_lck->read_only) {
+               SMB_ASSERT(!br_lck->modified);
+       }
+
+       if (br_lck->modified) {
+               NTSTATUS status;
+
+               if (br_lck->num_locks != 0) {
+                       TDB_DATA data;
+
+                       data.dptr = (uint8 *)br_lck->lock_data;
+                       data.dsize = br_lck->num_locks * sizeof(struct lock_struct);
+
+                       status = dbwrap_record_store(br_lck->record, data,
+                                                    TDB_REPLACE);
+                       if (!NT_STATUS_IS_OK(status)) {
+                               DEBUG(0, ("store returned %s\n", nt_errstr(status)));
+                               smb_panic("Could not store byte range mode entry");
+                       }
+               } else {
+                       /* No locks - delete this entry. */
+                       status = dbwrap_record_delete(br_lck->record);
+                       if (!NT_STATUS_IS_OK(status)) {
+                               DEBUG(0, ("delete_rec returned %s\n",
+                                         nt_errstr(status)));
+                               smb_panic("Could not delete byte range lock entry");
+                       }
+               }
+       }
+
+       br_lck->read_only = true;
+       br_lck->modified = false;
+
+       TALLOC_FREE(br_lck->record);
+}
+
+/*
+ * talloc destructor for struct byte_range_lock: flush pending changes
+ * (which also releases the record handle), then free the malloc'ed lock
+ * array. Always returns 0.
+ */
+static int byte_range_lock_destructor(struct byte_range_lock *br_lck)
+{
+       /* Flush first: the flush still reads br_lck->lock_data. */
+       byte_range_lock_flush(br_lck);
+
+       SAFE_FREE(br_lck->lock_data);
+
+       return 0;
+}
+
+/*******************************************************************
+ Fetch a set of byte range lock data from the database.
+ Leave the record locked.
+ TALLOC_FREE(brl) will release the lock in the destructor.
+
+ With read_only set the data is read via dbwrap_fetch() and no record
+ handle is kept; otherwise dbwrap_fetch_locked() is used and the record
+ is held until the object is flushed or freed.
+ If fsp->lockdb_clean is not yet set, the record is always fetched
+ read/write so that validate_lock_entries() can prune stale entries; when
+ the caller asked for read_only the result is then flushed right away.
+ The lock array is a malloc'ed copy hung off br_lck->lock_data.
+ Returns NULL if allocation, the fetch, or validation fails.
+********************************************************************/
+
+static struct byte_range_lock *brl_get_locks_internal(TALLOC_CTX *mem_ctx,
+                                       files_struct *fsp, bool read_only)
+{
+       TDB_DATA key, data;
+       struct byte_range_lock *br_lck = talloc(mem_ctx, struct byte_range_lock);
+       bool do_read_only = read_only;
+
+       if (br_lck == NULL) {
+               return NULL;
+       }
+
+       br_lck->fsp = fsp;
+       br_lck->num_locks = 0;
+       br_lck->modified = False;
+       br_lck->key = fsp->file_id;
+
+       key.dptr = (uint8 *)&br_lck->key;
+       key.dsize = sizeof(struct file_id);
+
+       if (!fsp->lockdb_clean) {
+               /* We must be read/write to clean
+                  the dead entries. */
+               do_read_only = false;
+       }
+
+       if (do_read_only) {
+               NTSTATUS status;
+               status = dbwrap_fetch(brlock_db, br_lck, key, &data);
+               if (!NT_STATUS_IS_OK(status)) {
+                       DEBUG(3, ("Could not fetch byte range lock record\n"));
+                       TALLOC_FREE(br_lck);
+                       return NULL;
+               }
+               br_lck->record = NULL;
+       } else {
+               br_lck->record = dbwrap_fetch_locked(brlock_db, br_lck, key);
+
+               if (br_lck->record == NULL) {
+                       DEBUG(3, ("Could not lock byte range lock entry\n"));
+                       TALLOC_FREE(br_lck);
+                       return NULL;
+               }
+
+               data = dbwrap_record_get_value(br_lck->record);
+       }
+
+       br_lck->read_only = do_read_only;
+       br_lck->lock_data = NULL;
+
+       talloc_set_destructor(br_lck, byte_range_lock_destructor); /* every TALLOC_FREE(br_lck) below runs the flush */
+
+       br_lck->num_locks = data.dsize / sizeof(struct lock_struct);
+
+       if (br_lck->num_locks != 0) {
+               br_lck->lock_data = SMB_MALLOC_ARRAY(struct lock_struct,
+                                                    br_lck->num_locks);
+               if (br_lck->lock_data == NULL) {
+                       DEBUG(0, ("malloc failed\n"));
+                       TALLOC_FREE(br_lck);
+                       return NULL;
+               }
+
+               memcpy(br_lck->lock_data, data.dptr, data.dsize);
+       }
+
+       if (!fsp->lockdb_clean) {
+               int orig_num_locks = br_lck->num_locks;
+
+               /* This is the first time we've accessed this. */
+               /* Go through and ensure all entries exist - remove any that don't. */
+               /* Makes the lockdb self cleaning at low cost. */
+
+               if (!validate_lock_entries(&br_lck->num_locks,
+                                          &br_lck->lock_data)) {
+                       SAFE_FREE(br_lck->lock_data);
+                       TALLOC_FREE(br_lck);
+                       return NULL;
+               }
+
+               /* Ensure invalid locks are cleaned up in the destructor. */
+               if (orig_num_locks != br_lck->num_locks) {
+                       br_lck->modified = True;
+               }
+
+               /* Mark the lockdb as "clean" as seen from this open file. */
+               fsp->lockdb_clean = True;
+       }
+
+       if (DEBUGLEVEL >= 10) {
+               unsigned int i;
+               struct lock_struct *locks = br_lck->lock_data;
+               DEBUG(10,("brl_get_locks_internal: %u current locks on file_id %s\n",
+                       br_lck->num_locks,
+                         file_id_string_tos(&fsp->file_id)));
+               for( i = 0; i < br_lck->num_locks; i++) {
+                       print_lock_struct(i, &locks[i]);
+               }
+       }
+
+       if (do_read_only != read_only) {
+               /*
+                * this stores the record and gets rid of
+                * the write lock that is needed for a cleanup
+                *
+                * (the caller asked for read_only, so the flush also
+                * leaves br_lck marked read_only and unmodified)
+                */
+               byte_range_lock_flush(br_lck);
+       }
+
+       return br_lck;
+}
+
+struct byte_range_lock *brl_get_locks(TALLOC_CTX *mem_ctx,
+                                       files_struct *fsp)
  {
-       if (!tdb) return 0;
-       return tdb_traverse(tdb, traverse_fn, (void *)fn);
+       return brl_get_locks_internal(mem_ctx, fsp, False); /* read_only=False: fetched via dbwrap_fetch_locked(), record held until br_lck is freed */
+}
+
+/*
+ * Get a read-only view of the byte range locks of fsp.
+ * When lp_clustering() is set, every call fetches a fresh object on
+ * talloc_tos(). Otherwise the object is cached in fsp->brlock_rec (owned
+ * by fsp) and reused for as long as the database sequence number equals
+ * fsp->brlock_seqnum. Returns NULL if the fetch fails.
+ */
+struct byte_range_lock *brl_get_locks_readonly(files_struct *fsp)
+{
+       struct byte_range_lock *fresh;
+       bool cache_valid;
+
+       if (lp_clustering()) {
+               return brl_get_locks_internal(talloc_tos(), fsp, true);
+       }
+
+       cache_valid = (fsp->brlock_rec != NULL) &&
+               (dbwrap_get_seqnum(brlock_db) == fsp->brlock_seqnum);
+       if (cache_valid) {
+               return fsp->brlock_rec;
+       }
+
+       /* Cache missing or stale: drop it and fetch again. */
+       TALLOC_FREE(fsp->brlock_rec);
+
+       fresh = brl_get_locks_internal(talloc_tos(), fsp, true);
+       if (fresh == NULL) {
+               return NULL;
+       }
+
+       fsp->brlock_seqnum = dbwrap_get_seqnum(brlock_db);
+
+       /* Hand ownership of the new object over to fsp. */
+       fsp->brlock_rec = talloc_move(fsp, &fresh);
+
+       return fsp->brlock_rec;
+}
+
+struct brl_revalidate_state { /* accumulator filled by brl_revalidate_collect() */
+       ssize_t array_size; /* maintained by add_to_large_array(); brl_revalidate() treats -1 as allocation failure */
+       uint32 num_pids; /* number of entries currently stored in pids */
+       struct server_id *pids; /* collected server ids; may contain duplicates */
+};
+
+/*
+ * Collect PIDs of all processes with pending entries
+ */
+
+/*
+ * brl_forall() callback: append the owning pid of every pending lock
+ * entry to the brl_revalidate_state passed in private_data. Entries that
+ * are not pending are ignored.
+ */
+static void brl_revalidate_collect(struct file_id id, struct server_id pid,
+                                  enum brl_type lock_type,
+                                  enum brl_flavour lock_flav,
+                                  br_off start, br_off size,
+                                  void *private_data)
+{
+       struct brl_revalidate_state *state =
+               (struct brl_revalidate_state *)private_data;
+
+       if (IS_PENDING_LOCK(lock_type)) {
+               add_to_large_array(state, sizeof(pid), (void *)&pid,
+                                  &state->pids, &state->num_pids,
+                                  &state->array_size);
+       }
+}
+
+/*
+ * qsort callback to sort the processes
+ */
+
+/*
+ * Three-way comparison of two struct server_id values by their pid
+ * member only: negative if *p1 sorts first, positive if *p2 sorts first,
+ * 0 if the pids are equal.
+ *
+ * The "greater than" branch must compare i1 against i2; comparing i2
+ * with itself can never be true and would make the comparator report
+ * "equal" for every pair where i1->pid > i2->pid.
+ */
+static int compare_procids(const void *p1, const void *p2)
+{
+       const struct server_id *i1 = (const struct server_id *)p1;
+       const struct server_id *i2 = (const struct server_id *)p2;
+
+       if (i1->pid < i2->pid) return -1;
+       if (i1->pid > i2->pid) return 1;
+       return 0;
+}
+
+/*
+ * Send a MSG_SMB_UNLOCK message to all processes with pending byte range
+ * locks so that they retry. Mainly used in the cluster code after a node has
+ * died.
+ *
+ * Done in two steps to avoid double-sends: First we collect all entries in an
+ * array, then qsort that array and only send to non-dupes.
+ */
+
+void brl_revalidate(struct messaging_context *msg_ctx,
+                   void *private_data,
+                   uint32_t msg_type,
+                   struct server_id server_id,
+                   DATA_BLOB *data)
+{
+       struct brl_revalidate_state *state;
+       struct server_id prev;
+       uint32 idx;
+
+       state = talloc_zero(NULL, struct brl_revalidate_state);
+       if (state == NULL) {
+               DEBUG(0, ("talloc failed\n"));
+               return;
+       }
+
+       /* Step 1: collect the pid of every pending lock entry. */
+       brl_forall(brl_revalidate_collect, state);
+
+       if (state->array_size == -1) {
+               DEBUG(0, ("talloc failed\n"));
+       } else if (state->num_pids != 0) {
+               /*
+                * Step 2: sort so that duplicates end up next to each
+                * other, then message each distinct run only once.
+                */
+               TYPESAFE_QSORT(state->pids, state->num_pids, compare_procids);
+
+               ZERO_STRUCT(prev);
+
+               for (idx = 0; idx < state->num_pids; idx++) {
+                       if (!serverid_equal(&prev, &state->pids[idx])) {
+                               /* Not the one we just messaged. */
+                               messaging_send(msg_ctx, state->pids[idx],
+                                              MSG_SMB_UNLOCK,
+                                              &data_blob_null);
+                               prev = state->pids[idx];
+                       }
+               }
+       }
+
+       TALLOC_FREE(state);
  }