94055de2b0750e7fdfb08e9bb3f5e8984c724d38
[kai/samba-autobuild/.git] / source3 / locking / posix.c
1 /* 
2    Unix SMB/CIFS implementation.
3    Locking functions
4    Copyright (C) Jeremy Allison 1992-2000
5    
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2 of the License, or
9    (at your option) any later version.
10    
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15    
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software
18    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20    Revision History:
21
22    POSIX locking support. Jeremy Allison (jeremy@valinux.com), Apr. 2000.
23 */
24
25 #include "includes.h"
26
27 /*
28  * The POSIX locking database handle.
29  */
30
31 static TDB_CONTEXT *posix_lock_tdb; /* records are fetched with a dev/inode key and read as an array of struct posix_lock */
32
33 /*
34  * The pending close database handle.
35  */
36
37 static TDB_CONTEXT *posix_pending_close_tdb; /* records are fetched with a dev/inode key and read as an array of int fds */
38
39 /*
40  * The data in POSIX lock records is an unsorted linear array of these
41  * records.  It is unnecessary to store the count as tdb provides the
42  * size of the record.
43  */
44
45 struct posix_lock { /* stored verbatim (memcpy) in posix_lock_tdb records */
46         int fd; /* copied from fsp->fd when the entry is added */
47         SMB_OFF_T start; /* first byte of the locked range */
48         SMB_OFF_T size; /* length of the locked range */
49         int lock_type; /* compared against F_RDLCK when printed; anything else is reported as WRITE */
50 };
51
52 /*
53  * The data in POSIX pending close records is an unsorted linear array of int
54  * records.  It is unnecessary to store the count as tdb provides the
55  * size of the record.
56  */
57
58 /* The key used in both the POSIX databases. */
59
60 struct posix_lock_key { /* raw bytes of this struct are the tdb key (see locking_key) */
61         SMB_DEV_T device; /* filled from fsp->dev */
62         SMB_INO_T inode; /* filled from fsp->inode */
63 }; 
64
65 /*******************************************************************
66  Form a static locking key for a dev/inode pair.
67 ******************************************************************/
68
69 static TDB_DATA locking_key(SMB_DEV_T dev, SMB_INO_T inode)
70 {
71         static struct posix_lock_key key;
72         TDB_DATA kbuf;
73
74         memset(&key, '\0', sizeof(key));
75         key.device = dev;
76         key.inode = inode;
77         kbuf.dptr = (char *)&key;
78         kbuf.dsize = sizeof(key);
79         return kbuf;
80 }
81
82 /*******************************************************************
83  Convenience function to get a key from an fsp.
84 ******************************************************************/
85
86 static TDB_DATA locking_key_fsp(files_struct *fsp)
87 {
88         return locking_key(fsp->dev, fsp->inode);
89 }
90
91 /****************************************************************************
92  Add an fd to the pending close tdb.
93 ****************************************************************************/
94
95 static BOOL add_fd_to_close_entry(files_struct *fsp)
96 {
97         TDB_DATA kbuf = locking_key_fsp(fsp);
98         TDB_DATA dbuf;
99         char *tp;
100
101         dbuf.dptr = NULL;
102
103         dbuf = tdb_fetch(posix_pending_close_tdb, kbuf);
104
105         tp = Realloc(dbuf.dptr, dbuf.dsize + sizeof(int));
106         if (!tp) {
107                 DEBUG(0,("add_fd_to_close_entry: Realloc fail !\n"));
108                 SAFE_FREE(dbuf.dptr);
109                 return False;
110         } else
111                 dbuf.dptr = tp;
112
113         memcpy(dbuf.dptr + dbuf.dsize, &fsp->fd, sizeof(int));
114         dbuf.dsize += sizeof(int);
115
116         if (tdb_store(posix_pending_close_tdb, kbuf, dbuf, TDB_REPLACE) == -1) {
117                 DEBUG(0,("add_fd_to_close_entry: tdb_store fail !\n"));
118         }
119
120         SAFE_FREE(dbuf.dptr);
121         return True;
122 }
123
124 /****************************************************************************
125  Remove all fd entries for a specific dev/inode pair from the tdb.
126 ****************************************************************************/
127
128 static void delete_close_entries(files_struct *fsp)
129 {
130         TDB_DATA kbuf = locking_key_fsp(fsp);
131
132         if (tdb_delete(posix_pending_close_tdb, kbuf) == -1)
133                 DEBUG(0,("delete_close_entries: tdb_delete fail !\n"));
134 }
135
136 /****************************************************************************
137  Get the array of POSIX pending close records for an open fsp. Caller must
138  free. Returns number of entries.
139 ****************************************************************************/
140
141 static size_t get_posix_pending_close_entries(files_struct *fsp, int **entries)
142 {
143         TDB_DATA kbuf = locking_key_fsp(fsp);
144         TDB_DATA dbuf;
145         size_t count = 0;
146
147         *entries = NULL;
148         dbuf.dptr = NULL;
149
150         dbuf = tdb_fetch(posix_pending_close_tdb, kbuf);
151
152         if (!dbuf.dptr) {
153                 return 0;
154         }
155
156         *entries = (int *)dbuf.dptr;
157         count = (size_t)(dbuf.dsize / sizeof(int));
158
159         return count;
160 }
161
162 /****************************************************************************
163  Get the array of POSIX locks for an fsp. Caller must free. Returns
164  number of entries.
165 ****************************************************************************/
166
167 static size_t get_posix_lock_entries(files_struct *fsp, struct posix_lock **entries)
168 {
169         TDB_DATA kbuf = locking_key_fsp(fsp);
170         TDB_DATA dbuf;
171         size_t count = 0;
172
173         *entries = NULL;
174
175         dbuf.dptr = NULL;
176
177         dbuf = tdb_fetch(posix_lock_tdb, kbuf);
178
179         if (!dbuf.dptr) {
180                 return 0;
181         }
182
183         *entries = (struct posix_lock *)dbuf.dptr;
184         count = (size_t)(dbuf.dsize / sizeof(struct posix_lock));
185
186         return count;
187 }
188
189 /****************************************************************************
190  Deal with pending closes needed by POSIX locking support.
191  Note that posix_locking_close_file() is expected to have been called
192  to delete all locks on this fsp before this function is called.
193 ****************************************************************************/
194
195 int fd_close_posix(struct connection_struct *conn, files_struct *fsp)
196 {
197         int saved_errno = 0;
198         int ret;
199         size_t count, i;
200         struct posix_lock *entries = NULL;
201         int *fd_array = NULL;
202         BOOL locks_on_other_fds = False;
203
204         if (!lp_posix_locking(SNUM(conn))) {
205                 /*
206                  * No POSIX to worry about, just close.
207                  */
208                 ret = conn->vfs_ops.close(fsp,fsp->fd);
209                 fsp->fd = -1;
210                 return ret;
211         }
212
213         /*
214          * Get the number of outstanding POSIX locks on this dev/inode pair.
215          */
216
217         count = get_posix_lock_entries(fsp, &entries);
218
219         /*
220          * Check if there are any outstanding locks belonging to
221          * other fd's. This should never be the case if posix_locking_close_file()
222          * has been called first, but it never hurts to be *sure*.
223          */
224
225         for (i = 0; i < count; i++) {
226                 if (entries[i].fd != fsp->fd) {
227                         locks_on_other_fds = True;
228                         break;
229                 }
230         }
231
232         if (locks_on_other_fds) {
233
234                 /*
235                  * There are outstanding locks on this dev/inode pair on other fds.
236                  * Add our fd to the pending close tdb and set fsp->fd to -1.
237                  */
238
239                 if (!add_fd_to_close_entry(fsp)) {
240                         SAFE_FREE(entries);
241                         return False;
242                 }
243
244                 SAFE_FREE(entries);
245                 fsp->fd = -1;
246                 return 0;
247         }
248
249         SAFE_FREE(entries);
250
251         /*
252          * No outstanding POSIX locks. Get the pending close fd's
253          * from the tdb and close them all.
254          */
255
256         count = get_posix_pending_close_entries(fsp, &fd_array);
257
258         if (count) {
259                 DEBUG(10,("fd_close_posix: doing close on %u fd's.\n", (unsigned int)count ));
260
261                 for(i = 0; i < count; i++) {
262                         if (conn->vfs_ops.close(fsp,fd_array[i]) == -1) {
263                                 saved_errno = errno;
264                         }
265                 }
266
267                 /*
268                  * Delete all fd's stored in the tdb
269                  * for this dev/inode pair.
270                  */
271
272                 delete_close_entries(fsp);
273         }
274
275         SAFE_FREE(fd_array);
276
277         /*
278          * Finally close the fd associated with this fsp.
279          */
280
281         ret = conn->vfs_ops.close(fsp,fsp->fd);
282
283         if (saved_errno != 0) {
284         errno = saved_errno;
285                 ret = -1;
286     } 
287
288         fsp->fd = -1;
289
290         return ret;
291 }
292
293 /****************************************************************************
294  Debugging aid :-).
295 ****************************************************************************/
296
static const char *posix_lock_type_name(int lock_type)
{
	/* Only F_RDLCK is singled out; every other value is reported as WRITE. */
	if (lock_type == F_RDLCK)
		return "READ";

	return "WRITE";
}
301
302 /****************************************************************************
303  Delete a POSIX lock entry by index number. Used if the tdb add succeeds, but
304  then the POSIX fcntl lock fails.
305 ****************************************************************************/
306
307 static BOOL delete_posix_lock_entry_by_index(files_struct *fsp, size_t entry)
308 {
309         TDB_DATA kbuf = locking_key_fsp(fsp);
310         TDB_DATA dbuf;
311         struct posix_lock *locks;
312         size_t count;
313
314         dbuf.dptr = NULL;
315         
316         dbuf = tdb_fetch(posix_lock_tdb, kbuf);
317
318         if (!dbuf.dptr) {
319                 DEBUG(10,("delete_posix_lock_entry_by_index: tdb_fetch failed !\n"));
320                 goto fail;
321         }
322
323         count = (size_t)(dbuf.dsize / sizeof(struct posix_lock));
324         locks = (struct posix_lock *)dbuf.dptr;
325
326         if (count == 1) {
327                 tdb_delete(posix_lock_tdb, kbuf);
328         } else {
329                 if (entry < count-1) {
330                         memmove(&locks[entry], &locks[entry+1], sizeof(*locks)*((count-1) - entry));
331                 }
332                 dbuf.dsize -= sizeof(*locks);
333                 tdb_store(posix_lock_tdb, kbuf, dbuf, TDB_REPLACE);
334         }
335
336         SAFE_FREE(dbuf.dptr);
337
338         return True;
339
340  fail:
341
342         SAFE_FREE(dbuf.dptr);
343         return False;
344 }
345
346 /****************************************************************************
347  Add an entry into the POSIX locking tdb. We return the index number of the
348  added lock (used in case we need to delete *exactly* this entry). Returns
349  False on fail, True on success.
350 ****************************************************************************/
351
352 static BOOL add_posix_lock_entry(files_struct *fsp, SMB_OFF_T start, SMB_OFF_T size, int lock_type, size_t *pentry_num)
353 {
354         TDB_DATA kbuf = locking_key_fsp(fsp);
355         TDB_DATA dbuf;
356         struct posix_lock pl;
357         char *tp;
358
359         dbuf.dptr = NULL;
360
361         dbuf = tdb_fetch(posix_lock_tdb, kbuf);
362
363         *pentry_num = (size_t)(dbuf.dsize / sizeof(pl));
364
365         /*
366          * Add new record.
367          */
368
369         pl.fd = fsp->fd;
370         pl.start = start;
371         pl.size = size;
372         pl.lock_type = lock_type;
373
374         tp = Realloc(dbuf.dptr, dbuf.dsize + sizeof(pl));
375         if (!tp) {
376                 DEBUG(0,("add_posix_lock_entry: Realloc fail !\n"));
377                 goto fail;
378         } else
379                 dbuf.dptr = tp;
380
381         memcpy(dbuf.dptr + dbuf.dsize, &pl, sizeof(pl));
382         dbuf.dsize += sizeof(pl);
383
384         if (tdb_store(posix_lock_tdb, kbuf, dbuf, TDB_REPLACE) == -1) {
385                 DEBUG(0,("add_posix_lock: Failed to add lock entry on file %s\n", fsp->fsp_name));
386                 goto fail;
387         }
388
389         SAFE_FREE(dbuf.dptr);
390
391         DEBUG(10,("add_posix_lock: File %s: type = %s: start=%.0f size=%.0f: dev=%.0f inode=%.0f\n",
392                         fsp->fsp_name, posix_lock_type_name(lock_type), (double)start, (double)size,
393                         (double)fsp->dev, (double)fsp->inode ));
394
395         return True;
396
397  fail:
398
399         SAFE_FREE(dbuf.dptr);
400         return False;
401 }
402
403 /****************************************************************************
404  Calculate if locks have any overlap at all.
405 ****************************************************************************/
406
407 static BOOL does_lock_overlap(SMB_OFF_T start1, SMB_OFF_T size1, SMB_OFF_T start2, SMB_OFF_T size2)
408 {
409         if (start1 >= start2 && start1 <= start2 + size2)
410                 return True;
411
412         if (start1 < start2 && start1 + size1 > start2)
413                 return True;
414
415         return False;
416 }
417
418 /****************************************************************************
419  Delete an entry from the POSIX locking tdb. Returns a copy of the entry being
420  deleted and the number of records that are overlapped by this one, or -1 on error.
421 ****************************************************************************/
422
423 static int delete_posix_lock_entry(files_struct *fsp, SMB_OFF_T start, SMB_OFF_T size, struct posix_lock *pl)
424 {
425         TDB_DATA kbuf = locking_key_fsp(fsp);
426         TDB_DATA dbuf;
427         struct posix_lock *locks;
428         size_t i, count;
429         BOOL found = False;
430         int num_overlapping_records = 0;
431
432         dbuf.dptr = NULL;
433         
434         dbuf = tdb_fetch(posix_lock_tdb, kbuf);
435
436         if (!dbuf.dptr) {
437                 DEBUG(10,("delete_posix_lock_entry: tdb_fetch failed !\n"));
438                 goto fail;
439         }
440
441         /* There are existing locks - find a match. */
442         locks = (struct posix_lock *)dbuf.dptr;
443         count = (size_t)(dbuf.dsize / sizeof(*locks));
444
445         /*
446          * Search for and delete the first record that matches the
447          * unlock criteria.
448          */
449
450         for (i=0; i<count; i++) { 
451                 struct posix_lock *entry = &locks[i];
452
453                 if (entry->fd == fsp->fd &&
454                         entry->start == start &&
455                         entry->size == size) {
456
457                         /* Make a copy if requested. */
458                         if (pl)
459                                 *pl = *entry;
460
461                         /* Found it - delete it. */
462                         if (count == 1) {
463                                 tdb_delete(posix_lock_tdb, kbuf);
464                         } else {
465                                 if (i < count-1) {
466                                         memmove(&locks[i], &locks[i+1], sizeof(*locks)*((count-1) - i));
467                                 }
468                                 dbuf.dsize -= sizeof(*locks);
469                                 tdb_store(posix_lock_tdb, kbuf, dbuf, TDB_REPLACE);
470                         }
471                         count--;
472                         found = True;
473                         break;
474                 }
475         }
476
477         if (!found)
478                 goto fail;
479
480         /*
481          * Count the number of entries that are
482          * overlapped by this unlock request.
483          */
484
485         for (i = 0; i < count; i++) {
486                 struct posix_lock *entry = &locks[i];
487
488                 if (fsp->fd == entry->fd &&
489                         does_lock_overlap( start, size, entry->start, entry->size))
490                                 num_overlapping_records++;
491         }
492
493         DEBUG(10,("delete_posix_lock_entry: type = %s: start=%.0f size=%.0f, num_records = %d\n",
494                         posix_lock_type_name(pl->lock_type), (double)pl->start, (double)pl->size,
495                                 (unsigned int)num_overlapping_records ));
496
497         SAFE_FREE(dbuf.dptr);
498
499         return num_overlapping_records;
500
501  fail:
502
503         SAFE_FREE(dbuf.dptr);
504         return -1;
505 }
506
507 /****************************************************************************
508  Utility function to map a lock type correctly depending on the open
509  mode of a file.
510 ****************************************************************************/
511
512 static int map_posix_lock_type( files_struct *fsp, enum brl_type lock_type)
513 {
514         if((lock_type == WRITE_LOCK) && !fsp->can_write) {
515                 /*
516                  * Many UNIX's cannot get a write lock on a file opened read-only.
517                  * Win32 locking semantics allow this.
518                  * Do the best we can and attempt a read-only lock.
519                  */
520                 DEBUG(10,("map_posix_lock_type: Downgrading write lock to read due to read-only file.\n"));
521                 return F_RDLCK;
522         } else if((lock_type == READ_LOCK) && !fsp->can_read) {
523                 /*
524                  * Ditto for read locks on write only files.
525                  */
526                 DEBUG(10,("map_posix_lock_type: Changing read lock to write due to write-only file.\n"));
527                 return F_WRLCK;
528         }
529
530   /*
531    * This return should be the most normal, as we attempt
532    * to always open files read/write.
533    */
534
535   return (lock_type == READ_LOCK) ? F_RDLCK : F_WRLCK;
536 }
537
538 /****************************************************************************
539  Check to see if the given unsigned lock range is within the possible POSIX
540  range. Modifies the given args to be in range if possible, just returns
541  False if not.
542 ****************************************************************************/
543
544 static BOOL posix_lock_in_range(SMB_OFF_T *offset_out, SMB_OFF_T *count_out,
545                                                                 SMB_BIG_UINT u_offset, SMB_BIG_UINT u_count)
546 {
547         SMB_OFF_T offset = (SMB_OFF_T)u_offset;
548         SMB_OFF_T count = (SMB_OFF_T)u_count;
549
550         /*
551          * For the type of system we are, attempt to
552          * find the maximum positive lock offset as an SMB_OFF_T.
553          */
554
555 #if defined(LARGE_SMB_OFF_T) && !defined(HAVE_BROKEN_FCNTL64_LOCKS)
556
557         /*
558          * In this case SMB_OFF_T is 64 bits,
559          * and the underlying system can handle 64 bit signed locks.
560          */
561
562     SMB_OFF_T mask2 = ((SMB_OFF_T)0x4) << (SMB_OFF_T_BITS-4);
563     SMB_OFF_T mask = (mask2<<1);
564     SMB_OFF_T max_positive_lock_offset = ~mask;
565
566 #else /* !LARGE_SMB_OFF_T || HAVE_BROKEN_FCNTL64_LOCKS */
567
568         /*
569          * In this case either SMB_OFF_T is 32 bits,
570          * or the underlying system cannot handle 64 bit signed locks.
571          * All offsets & counts must be 2^31 or less.
572          */
573
574     SMB_OFF_T max_positive_lock_offset = 0x7FFFFFFF;
575
576 #endif /* !LARGE_SMB_OFF_T || HAVE_BROKEN_FCNTL64_LOCKS */
577
578         /*
579          * POSIX locks of length zero mean lock to end-of-file.
580          * Win32 locks of length zero are point probes. Ignore
581          * any Win32 locks of length zero. JRA.
582          */
583
584         if (count == (SMB_OFF_T)0) {
585                 DEBUG(10,("posix_lock_in_range: count = 0, ignoring.\n"));
586                 return False;
587         }
588
589         /*
590          * If the given offset was > max_positive_lock_offset then we cannot map this at all
591          * ignore this lock.
592          */
593
594         if (u_offset & ~((SMB_BIG_UINT)max_positive_lock_offset)) {
595                 DEBUG(10,("posix_lock_in_range: (offset = %.0f) offset > %.0f and we cannot handle this. Ignoring lock.\n",
596                                 (double)u_offset, (double)((SMB_BIG_UINT)max_positive_lock_offset) ));
597                 return False;
598         }
599
600         /*
601          * We must truncate the offset and count to less than max_positive_lock_offset.
602          */
603
604         offset &= max_positive_lock_offset;
605         count &= max_positive_lock_offset;
606
607
608         /*
609          * Deal with a very common case of count of all ones.
610          * (lock entire file).
611          */
612
613         if(count == (SMB_OFF_T)-1)
614                 count = max_positive_lock_offset;
615
616         /*
617          * Truncate count to end at max lock offset.
618          */
619
620         if (offset + count < 0 || offset + count > max_positive_lock_offset)
621                 count = max_positive_lock_offset - offset;
622
623         /*
624          * If we ate all the count, ignore this lock.
625          */
626
627         if (count == 0) {
628                 DEBUG(10,("posix_lock_in_range: Count = 0. Ignoring lock u_offset = %.0f, u_count = %.0f\n",
629                                 (double)u_offset, (double)u_count ));
630                 return False;
631         }
632
633         /*
634          * The mapping was successful.
635          */
636
637         DEBUG(10,("posix_lock_in_range: offset_out = %.0f, count_out = %.0f\n",
638                         (double)offset, (double)count ));
639
640         *offset_out = offset;
641         *count_out = count;
642         
643         return True;
644 }
645
646 /****************************************************************************
647  Actual function that does POSIX locks. Copes with 64 -> 32 bit cruft and
648  broken NFS implementations.
649 ****************************************************************************/
650
651 static BOOL posix_fcntl_lock(files_struct *fsp, int op, SMB_OFF_T offset, SMB_OFF_T count, int type)
652 {
653         int ret;
654         struct connection_struct *conn = fsp->conn;
655
656         DEBUG(8,("posix_fcntl_lock %d %d %.0f %.0f %d\n",fsp->fd,op,(double)offset,(double)count,type));
657
658         ret = conn->vfs_ops.lock(fsp,fsp->fd,op,offset,count,type);
659
660         if (!ret && ((errno == EFBIG) || (errno == ENOLCK) || (errno ==  EINVAL))) {
661
662                 DEBUG(0,("posix_fcntl_lock: WARNING: lock request at offset %.0f, length %.0f returned\n",
663                                         (double)offset,(double)count));
664                 DEBUG(0,("an %s error. This can happen when using 64 bit lock offsets\n", strerror(errno)));
665                 DEBUG(0,("on 32 bit NFS mounted file systems.\n"));
666
667                 /*
668                  * If the offset is > 0x7FFFFFFF then this will cause problems on
669                  * 32 bit NFS mounted filesystems. Just ignore it.
670                  */
671
672                 if (offset & ~((SMB_OFF_T)0x7fffffff)) {
673                         DEBUG(0,("Offset greater than 31 bits. Returning success.\n"));
674                         return True;
675                 }
676
677                 if (count & ~((SMB_OFF_T)0x7fffffff)) {
678                         /* 32 bit NFS file system, retry with smaller offset */
679                         DEBUG(0,("Count greater than 31 bits - retrying with 31 bit truncated length.\n"));
680                         errno = 0;
681                         count &= 0x7fffffff;
682                         ret = conn->vfs_ops.lock(fsp,fsp->fd,op,offset,count,type);
683                 }
684         }
685
686         DEBUG(8,("posix_fcntl_lock: Lock call %s\n", ret ? "successful" : "failed"));
687
688         return ret;
689 }
690
691 /****************************************************************************
692  POSIX function to see if a file region is locked. Returns True if the
693  region is locked, False otherwise.
694 ****************************************************************************/
695
696 BOOL is_posix_locked(files_struct *fsp, SMB_BIG_UINT u_offset, SMB_BIG_UINT u_count, enum brl_type lock_type)
697 {
698         SMB_OFF_T offset;
699         SMB_OFF_T count;
700         int posix_lock_type = map_posix_lock_type(fsp,lock_type);
701
702         DEBUG(10,("is_posix_locked: File %s, offset = %.0f, count = %.0f, type = %s\n",
703                         fsp->fsp_name, (double)u_offset, (double)u_count, posix_lock_type_name(lock_type) ));
704
705         /*
706          * If the requested lock won't fit in the POSIX range, we will
707          * never set it, so presume it is not locked.
708          */
709
710         if(!posix_lock_in_range(&offset, &count, u_offset, u_count))
711                 return False;
712
713         /*
714          * Note that most UNIX's can *test* for a write lock on
715          * a read-only fd, just not *set* a write lock on a read-only
716          * fd. So we don't need to use map_lock_type here.
717          */ 
718
719         return posix_fcntl_lock(fsp,SMB_F_GETLK,offset,count,posix_lock_type);
720 }
721
722 /*
723  * Structure used when splitting a lock range
724  * into a POSIX lock range. Doubly linked list.
725  */
726
727 struct lock_list { /* one candidate range; ranges are trimmed, split or removed by posix_lock_list */
728     struct lock_list *next; /* following node, NULL at the tail (list walks stop on NULL) */
729     struct lock_list *prev; /* preceding node of the doubly linked list */
730     SMB_OFF_T start; /* first byte of this range */
731     SMB_OFF_T size; /* length of this range */
732 };
733
734 /****************************************************************************
735  Create a list of lock ranges that don't overlap a given range. Used in calculating
736  POSIX locks and unlocks. This is a difficult function that requires ASCII art to
737  understand it :-).
738 ****************************************************************************/
739
740 static struct lock_list *posix_lock_list(TALLOC_CTX *ctx, struct lock_list *lhead, files_struct *fsp)
741 {
742         TDB_DATA kbuf = locking_key_fsp(fsp);
743         TDB_DATA dbuf;
744         struct posix_lock *locks;
745         size_t num_locks, i;
746
747         dbuf.dptr = NULL;
748
749         dbuf = tdb_fetch(posix_lock_tdb, kbuf);
750
751         if (!dbuf.dptr)
752                 return lhead;
753         
754         locks = (struct posix_lock *)dbuf.dptr;
755         num_locks = (size_t)(dbuf.dsize / sizeof(*locks));
756
757         /*
758          * Check the current lock list on this dev/inode pair.
759          * Quit if the list is deleted.
760          */
761
762         DEBUG(10,("posix_lock_list: curr: start=%.0f,size=%.0f\n",
763                 (double)lhead->start, (double)lhead->size ));
764
765         for (i=0; i<num_locks && lhead; i++) {
766
767                 struct posix_lock *lock = &locks[i];
768                 struct lock_list *l_curr;
769
770                 /*
771                  * Walk the lock list, checking for overlaps. Note that
772                  * the lock list can expand within this loop if the current
773                  * range being examined needs to be split.
774                  */
775
776                 for (l_curr = lhead; l_curr;) {
777
778                         DEBUG(10,("posix_lock_list: lock: fd=%d: start=%.0f,size=%.0f:type=%s", lock->fd,
779                                 (double)lock->start, (double)lock->size, posix_lock_type_name(lock->lock_type) ));
780
781                         if ( (l_curr->start >= (lock->start + lock->size)) ||
782                                  (lock->start >= (l_curr->start + l_curr->size))) {
783
784                                 /* No overlap with this lock - leave this range alone. */
785 /*********************************************
786                                              +---------+
787                                              | l_curr  |
788                                              +---------+
789                                 +-------+
790                                 | lock  |
791                                 +-------+
792 OR....
793              +---------+
794              |  l_curr |
795              +---------+
796 **********************************************/
797
798                                 DEBUG(10,("no overlap case.\n" ));
799
800                                 l_curr = l_curr->next;
801
802                         } else if ( (l_curr->start >= lock->start) &&
803                                                 (l_curr->start + l_curr->size <= lock->start + lock->size) ) {
804
805                                 /*
806                                  * This unlock is completely overlapped by this existing lock range
807                                  * and thus should have no effect (not be unlocked). Delete it from the list.
808                                  */
809 /*********************************************
810                 +---------+
811                 |  l_curr |
812                 +---------+
813         +---------------------------+
814         |       lock                |
815         +---------------------------+
816 **********************************************/
817                                 /* Save the next pointer */
818                                 struct lock_list *ul_next = l_curr->next;
819
820                                 DEBUG(10,("delete case.\n" ));
821
822                                 DLIST_REMOVE(lhead, l_curr);
823                                 if(lhead == NULL)
824                                         break; /* No more list... */
825
826                                 l_curr = ul_next;
827                                 
828                         } else if ( (l_curr->start >= lock->start) &&
829                                                 (l_curr->start < lock->start + lock->size) &&
830                                                 (l_curr->start + l_curr->size > lock->start + lock->size) ) {
831
832                                 /*
833                                  * This unlock overlaps the existing lock range at the high end.
834                                  * Truncate by moving start to existing range end and reducing size.
835                                  */
836 /*********************************************
837                 +---------------+
838                 |  l_curr       |
839                 +---------------+
840         +---------------+
841         |    lock       |
842         +---------------+
843 BECOMES....
844                         +-------+
845                         | l_curr|
846                         +-------+
847 **********************************************/
848
849                                 l_curr->size = (l_curr->start + l_curr->size) - (lock->start + lock->size);
850                                 l_curr->start = lock->start + lock->size;
851
852                                 DEBUG(10,("truncate high case: start=%.0f,size=%.0f\n",
853                                                                 (double)l_curr->start, (double)l_curr->size ));
854
855                                 l_curr = l_curr->next;
856
857                         } else if ( (l_curr->start < lock->start) &&
858                                                 (l_curr->start + l_curr->size > lock->start) &&
859                                                 (l_curr->start + l_curr->size <= lock->start + lock->size) ) {
860
861                                 /*
862                                  * This unlock overlaps the existing lock range at the low end.
863                                  * Truncate by reducing size.
864                                  */
865 /*********************************************
866    +---------------+
867    |  l_curr       |
868    +---------------+
869            +---------------+
870            |    lock       |
871            +---------------+
872 BECOMES....
873    +-------+
874    | l_curr|
875    +-------+
876 **********************************************/
877
878                                 l_curr->size = lock->start - l_curr->start;
879
880                                 DEBUG(10,("truncate low case: start=%.0f,size=%.0f\n",
881                                                                 (double)l_curr->start, (double)l_curr->size ));
882
883                                 l_curr = l_curr->next;
884                 
885                         } else if ( (l_curr->start < lock->start) &&
886                                                 (l_curr->start + l_curr->size > lock->start + lock->size) ) {
887                                 /*
888                                  * Worst case scenario. Unlock request completely overlaps an existing
889                                  * lock range. Split the request into two, push the new (upper) request
890                                  * into the dlink list, and continue with the entry after ul_new (as we
891                                  * know that ul_new will not overlap with this lock).
892                                  */
893 /*********************************************
894         +---------------------------+
895         |        l_curr             |
896         +---------------------------+
897                 +---------+
898                 | lock    |
899                 +---------+
900 BECOMES.....
901         +-------+         +---------+
902         | l_curr|         | l_new   |
903         +-------+         +---------+
904 **********************************************/
905                                 struct lock_list *l_new = (struct lock_list *)talloc(ctx,
906                                                                                                         sizeof(struct lock_list));
907
908                                 if(l_new == NULL) {
909                                         DEBUG(0,("posix_lock_list: talloc fail.\n"));
910                                         return NULL; /* The talloc_destroy takes care of cleanup. */
911                                 }
912
913                                 ZERO_STRUCTP(l_new);
914                                 l_new->start = lock->start + lock->size;
915                                 l_new->size = l_curr->start + l_curr->size - l_new->start;
916
917                                 /* Truncate the l_curr. */
918                                 l_curr->size = lock->start - l_curr->start;
919
920                                 DEBUG(10,("split case: curr: start=%.0f,size=%.0f \
921 new: start=%.0f,size=%.0f\n", (double)l_curr->start, (double)l_curr->size,
922                                                                 (double)l_new->start, (double)l_new->size ));
923
924                                 /*
925                                  * Add into the dlink list after the l_curr point - NOT at lhead. 
926                                  * Note we can't use DLINK_ADD here as this inserts at the head of the given list.
927                                  */
928
929                                 l_new->prev = l_curr;
930                                 l_new->next = l_curr->next;
931                                 l_curr->next = l_new;
932
933                                 /* And move after the link we added. */
934                                 l_curr = l_new->next;
935
936                         } else {
937
938                                 /*
939                                  * This logic case should never happen. Ensure this is the
940                                  * case by forcing an abort.... Remove in production.
941                                  */
942                                 pstring msg;
943
944                                 slprintf(msg, sizeof(msg)-1, "logic flaw in cases: l_curr: start = %.0f, size = %.0f : \
945 lock: start = %.0f, size = %.0f\n", (double)l_curr->start, (double)l_curr->size, (double)lock->start, (double)lock->size );
946
947                                 smb_panic(msg);
948                         }
949                 } /* end for ( l_curr = lhead; l_curr;) */
950         } /* end for (i=0; i<num_locks && ul_head; i++) */
951
952         SAFE_FREE(dbuf.dptr);
953         
954         return lhead;
955 }
956
957 /****************************************************************************
958  POSIX function to acquire a lock. Returns True if the
959  lock could be granted, False if not.
960 ****************************************************************************/
961
962 BOOL set_posix_lock(files_struct *fsp, SMB_BIG_UINT u_offset, SMB_BIG_UINT u_count, enum brl_type lock_type)
963 {
964         SMB_OFF_T offset;
965         SMB_OFF_T count;
966         BOOL ret = True;
967         size_t entry_num = 0;
968         size_t lock_count;
969         TALLOC_CTX *l_ctx = NULL;
970         struct lock_list *llist = NULL;
971         struct lock_list *ll = NULL;
972         int posix_lock_type = map_posix_lock_type(fsp,lock_type);
973
974         DEBUG(5,("set_posix_lock: File %s, offset = %.0f, count = %.0f, type = %s\n",
975                         fsp->fsp_name, (double)u_offset, (double)u_count, posix_lock_type_name(lock_type) ));
976
977         /*
978          * If the requested lock won't fit in the POSIX range, we will
979          * pretend it was successful.
980          */
981
982         if(!posix_lock_in_range(&offset, &count, u_offset, u_count))
983                 return True;
984
985         /*
986          * Windows is very strange. It allows read locks to be overlayed
987          * (even over a write lock), but leaves the write lock in force until the first
988          * unlock. It also reference counts the locks. This means the following sequence :
989          *
990          * process1                                      process2
991          * ------------------------------------------------------------------------
992          * WRITE LOCK : start = 2, len = 10
993          *                                            READ LOCK: start =0, len = 10 - FAIL
994          * READ LOCK : start = 0, len = 14 
995          *                                            READ LOCK: start =0, len = 10 - FAIL
996          * UNLOCK : start = 2, len = 10
997          *                                            READ LOCK: start =0, len = 10 - OK
998          *
999          * Under POSIX, the same sequence in steps 1 and 2 would not be reference counted, but
1000          * would leave a single read lock over the 0-14 region. In order to
1001          * re-create Windows semantics mapped to POSIX locks, we create multiple TDB
1002          * entries, one for each overlayed lock request. We are guarenteed by the brlock
1003          * semantics that if a write lock is added, then it will be first in the array.
1004          */
1005         
1006         if ((l_ctx = talloc_init()) == NULL) {
1007                 DEBUG(0,("set_posix_lock: unable to init talloc context.\n"));
1008                 return True; /* Not a fatal error. */
1009         }
1010
1011         if ((ll = (struct lock_list *)talloc(l_ctx, sizeof(struct lock_list))) == NULL) {
1012                 DEBUG(0,("set_posix_lock: unable to talloc unlock list.\n"));
1013                 talloc_destroy(l_ctx);
1014                 return True; /* Not a fatal error. */
1015         }
1016
1017         /*
1018          * Create the initial list entry containing the
1019          * lock we want to add.
1020          */
1021
1022         ZERO_STRUCTP(ll);
1023         ll->start = offset;
1024         ll->size = count;
1025
1026         DLIST_ADD(llist, ll);
1027
1028         /*
1029          * The following call calculates if there are any
1030          * overlapping locks held by this process on
1031          * fd's open on the same file and splits this list
1032          * into a list of lock ranges that do not overlap with existing
1033          * POSIX locks.
1034          */
1035
1036         llist = posix_lock_list(l_ctx, llist, fsp);
1037
1038         /*
1039          * Now we have the list of ranges to lock it is safe to add the
1040          * entry into the POSIX lock tdb. We take note of the entry we
1041          * added here in case we have to remove it on POSIX lock fail.
1042          */
1043
1044         if (!add_posix_lock_entry(fsp,offset,count,posix_lock_type,&entry_num)) {
1045                 DEBUG(0,("set_posix_lock: Unable to create posix lock entry !\n"));
1046                 talloc_destroy(l_ctx);
1047                 return False;
1048         }
1049
1050         /*
1051          * Add the POSIX locks on the list of ranges returned.
1052          * As the lock is supposed to be added atomically, we need to
1053          * back out all the locks if any one of these calls fail.
1054          */
1055
1056         for (lock_count = 0, ll = llist; ll; ll = ll->next, lock_count++) {
1057                 offset = ll->start;
1058                 count = ll->size;
1059
1060                 DEBUG(5,("set_posix_lock: Real lock: Type = %s: offset = %.0f, count = %.0f\n",
1061                         posix_lock_type_name(posix_lock_type), (double)offset, (double)count ));
1062
1063                 if (!posix_fcntl_lock(fsp,SMB_F_SETLK,offset,count,posix_lock_type)) {
1064                         DEBUG(5,("set_posix_lock: Lock fail !: Type = %s: offset = %.0f, count = %.0f. Errno = %s\n",
1065                                 posix_lock_type_name(posix_lock_type), (double)offset, (double)count, strerror(errno) ));
1066                         ret = False;
1067                         break;
1068                 }
1069         }
1070
1071         if (!ret) {
1072
1073                 /*
1074                  * Back out all the POSIX locks we have on fail.
1075                  */
1076
1077                 for (ll = llist; lock_count; ll = ll->next, lock_count--) {
1078                         offset = ll->start;
1079                         count = ll->size;
1080
1081                         DEBUG(5,("set_posix_lock: Backing out locks: Type = %s: offset = %.0f, count = %.0f\n",
1082                                 posix_lock_type_name(posix_lock_type), (double)offset, (double)count ));
1083
1084                         posix_fcntl_lock(fsp,SMB_F_SETLK,offset,count,F_UNLCK);
1085                 }
1086
1087                 /*
1088                  * Remove the tdb entry for this lock.
1089                  */
1090
1091                 delete_posix_lock_entry_by_index(fsp,entry_num);
1092         }
1093
1094         talloc_destroy(l_ctx);
1095         return ret;
1096 }
1097
1098 /****************************************************************************
1099  POSIX function to release a lock. Returns True if the
1100  lock could be released, False if not.
1101 ****************************************************************************/
1102
1103 BOOL release_posix_lock(files_struct *fsp, SMB_BIG_UINT u_offset, SMB_BIG_UINT u_count)
1104 {
1105         SMB_OFF_T offset;
1106         SMB_OFF_T count;
1107         BOOL ret = True;
1108         TALLOC_CTX *ul_ctx = NULL;
1109         struct lock_list *ulist = NULL;
1110         struct lock_list *ul = NULL;
1111         struct posix_lock deleted_lock;
1112         int num_overlapped_entries;
1113
1114         DEBUG(5,("release_posix_lock: File %s, offset = %.0f, count = %.0f\n",
1115                 fsp->fsp_name, (double)u_offset, (double)u_count ));
1116
1117         /*
1118          * If the requested lock won't fit in the POSIX range, we will
1119          * pretend it was successful.
1120          */
1121
1122         if(!posix_lock_in_range(&offset, &count, u_offset, u_count))
1123                 return True;
1124
1125         /*
1126          * We treat this as one unlock request for POSIX accounting purposes even
1127          * if it may later be split into multiple smaller POSIX unlock ranges.
1128          * num_overlapped_entries is the number of existing locks that have any
1129          * overlap with this unlock request.
1130          */ 
1131
1132         num_overlapped_entries = delete_posix_lock_entry(fsp, offset, count, &deleted_lock);
1133
1134         if (num_overlapped_entries == -1) {
1135         smb_panic("release_posix_lock: unable find entry to delete !\n");
1136         }
1137
1138         /*
1139          * If num_overlapped_entries is > 0, and the lock_type we just deleted from the tdb was
1140          * a POSIX write lock, then before doing the unlock we need to downgrade
1141          * the POSIX lock to a read lock. This allows any overlapping read locks
1142          * to be atomically maintained.
1143          */
1144
1145         if (num_overlapped_entries > 0 && deleted_lock.lock_type == F_WRLCK) {
1146                 if (!posix_fcntl_lock(fsp,SMB_F_SETLK,offset,count,F_RDLCK)) {
1147                         DEBUG(0,("release_posix_lock: downgrade of lock failed with error %s !\n", strerror(errno) ));
1148                         return False;
1149                 }
1150         }
1151
1152         if ((ul_ctx = talloc_init()) == NULL) {
1153                 DEBUG(0,("release_posix_lock: unable to init talloc context.\n"));
1154                 return True; /* Not a fatal error. */
1155         }
1156
1157         if ((ul = (struct lock_list *)talloc(ul_ctx, sizeof(struct lock_list))) == NULL) {
1158                 DEBUG(0,("release_posix_lock: unable to talloc unlock list.\n"));
1159                 talloc_destroy(ul_ctx);
1160                 return True; /* Not a fatal error. */
1161         }
1162
1163         /*
1164          * Create the initial list entry containing the
1165          * lock we want to remove.
1166          */
1167
1168         ZERO_STRUCTP(ul);
1169         ul->start = offset;
1170         ul->size = count;
1171
1172         DLIST_ADD(ulist, ul);
1173
1174         /*
1175          * The following call calculates if there are any
1176          * overlapping locks held by this process on
1177          * fd's open on the same file and creates a
1178          * list of unlock ranges that will allow
1179          * POSIX lock ranges to remain on the file whilst the
1180          * unlocks are performed.
1181          */
1182
1183         ulist = posix_lock_list(ul_ctx, ulist, fsp);
1184
1185         /*
1186          * Release the POSIX locks on the list of ranges returned.
1187          */
1188
1189         for(; ulist; ulist = ulist->next) {
1190                 offset = ulist->start;
1191                 count = ulist->size;
1192
1193                 DEBUG(5,("release_posix_lock: Real unlock: offset = %.0f, count = %.0f\n",
1194                         (double)offset, (double)count ));
1195
1196                 if (!posix_fcntl_lock(fsp,SMB_F_SETLK,offset,count,F_UNLCK))
1197                         ret = False;
1198         }
1199
1200         talloc_destroy(ul_ctx);
1201
1202         return ret;
1203 }
1204
1205 /****************************************************************************
1206  Remove all lock entries for a specific dev/inode pair from the tdb.
1207 ****************************************************************************/
1208
1209 static void delete_posix_lock_entries(files_struct *fsp)
1210 {
1211         TDB_DATA kbuf = locking_key_fsp(fsp);
1212
1213         if (tdb_delete(posix_lock_tdb, kbuf) == -1)
1214                 DEBUG(0,("delete_close_entries: tdb_delete fail !\n"));
1215 }
1216
1217 /****************************************************************************
1218  Debug function.
1219 ****************************************************************************/
1220
1221 static void dump_entry(struct posix_lock *pl)
1222 {
1223         DEBUG(10,("entry: start=%.0f, size=%.0f, type=%d, fd=%i\n",
1224                 (double)pl->start, (double)pl->size, (int)pl->lock_type, pl->fd ));
1225 }
1226
1227 /****************************************************************************
1228  Remove any locks on this fd. Called from file_close().
1229 ****************************************************************************/
1230
1231 void posix_locking_close_file(files_struct *fsp)
1232 {
1233         struct posix_lock *entries = NULL;
1234         size_t count, i;
1235
1236         /*
1237          * Optimization for the common case where we are the only
1238          * opener of a file. If all fd entries are our own, we don't
1239          * need to explicitly release all the locks via the POSIX functions,
1240          * we can just remove all the entries in the tdb and allow the
1241          * close to remove the real locks.
1242          */
1243
1244         count = get_posix_lock_entries(fsp, &entries);
1245
1246         if (count == 0) {
1247                 DEBUG(10,("posix_locking_close_file: file %s has no outstanding locks.\n", fsp->fsp_name ));
1248                 return;
1249         }
1250
1251         for (i = 0; i < count; i++) {
1252                 if (entries[i].fd != fsp->fd )
1253                         break;
1254
1255                 dump_entry(&entries[i]);
1256         }
1257
1258         if (i == count) {
1259                 /* All locks are ours. */
1260                 DEBUG(10,("posix_locking_close_file: file %s has %u outstanding locks, but all on one fd.\n", 
1261                         fsp->fsp_name, (unsigned int)count ));
1262                 SAFE_FREE(entries);
1263                 delete_posix_lock_entries(fsp);
1264                 return;
1265         }
1266
1267         /*
1268          * Difficult case. We need to delete all our locks, whilst leaving
1269          * all other POSIX locks in place.
1270          */
1271
1272         for (i = 0; i < count; i++) {
1273                 struct posix_lock *pl = &entries[i];
1274                 if (pl->fd == fsp->fd)
1275                         release_posix_lock(fsp, (SMB_BIG_UINT)pl->start, (SMB_BIG_UINT)pl->size );
1276         }
1277         SAFE_FREE(entries);
1278 }
1279
1280 /*******************************************************************
1281  Create the in-memory POSIX lock databases.
1282 ********************************************************************/
1283
1284 BOOL posix_locking_init(int read_only)
1285 {
1286         if (posix_lock_tdb && posix_pending_close_tdb)
1287                 return True;
1288         
1289         if (!posix_lock_tdb)
1290                 posix_lock_tdb = tdb_open_log(NULL, 0, TDB_INTERNAL,
1291                                           read_only?O_RDONLY:(O_RDWR|O_CREAT), 0644);
1292         if (!posix_lock_tdb) {
1293                 DEBUG(0,("Failed to open POSIX byte range locking database.\n"));
1294                 return False;
1295         }
1296         if (!posix_pending_close_tdb)
1297                 posix_pending_close_tdb = tdb_open_log(NULL, 0, TDB_INTERNAL,
1298                                                    read_only?O_RDONLY:(O_RDWR|O_CREAT), 0644);
1299         if (!posix_pending_close_tdb) {
1300                 DEBUG(0,("Failed to open POSIX pending close database.\n"));
1301                 return False;
1302         }
1303
1304         return True;
1305 }
1306
1307 /*******************************************************************
1308  Delete the in-memory POSIX lock databases.
1309 ********************************************************************/
1310
1311 BOOL posix_locking_end(void)
1312 {
1313     if (posix_lock_tdb && tdb_close(posix_lock_tdb) != 0)
1314                 return False;
1315     if (posix_pending_close_tdb && tdb_close(posix_pending_close_tdb) != 0)
1316                 return False;
1317         return True;
1318 }