/* 
   Unix SMB/CIFS implementation.
   byte range locking code
   Updated to handle range splits/merges.

   Copyright (C) Andrew Tridgell 1992-2000
   Copyright (C) Jeremy Allison 1992-2000

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

/* This module implements a tdb based byte range locking service,
   replacing the fcntl() based byte range locking previously
   used. This allows us to provide the same semantics as NT */

#include "includes.h"

#define ZERO_ZERO 0
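
/* Illustrative only (not compiled): a minimal sketch of how a caller might
   drive the byte range locking interface defined in this module. The
   dev/ino/fnum/smbpid/tid values below are made-up placeholders; real
   callers derive them from the open file and the incoming SMB request. */
#if 0
static void brl_usage_sketch(void)
{
        SMB_DEV_T dev = 0;
        SMB_INO_T ino = 0;
        int fnum = 1;
        uint16 smbpid = 100;            /* client supplied pid */
        uint16 tid = 1;                 /* tree connect id */
        pid_t pid = sys_getpid();
        NTSTATUS status;

        /* Try to write-lock 100 bytes at offset 0. */
        status = brl_lock(dev, ino, fnum, smbpid, pid, tid,
                          0, 100, WRITE_LOCK);

        if (NT_STATUS_IS_OK(status)) {
                /* ... do the I/O, then drop the lock again. */
                brl_unlock(dev, ino, fnum, smbpid, pid, tid,
                           0, 100, False, NULL, NULL);
        }

        /* On file close any remaining locks for this fnum are removed. */
        brl_close(dev, ino, pid, tid, fnum);
}
#endif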

/* This contains elements that differentiate locks. The smbpid is a
   client supplied pid, and is essentially the locking context for
   this client */

struct lock_context {
        uint16 smbpid;
        uint16 tid;
        pid_t pid;
};

/* The data in brlock records is an unsorted linear array of these
   records.  It is unnecessary to store the count as tdb provides the
   size of the record */

struct lock_struct {
        struct lock_context context;
        br_off start;
        br_off size;
        int fnum;
        enum brl_type lock_type;
};
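
/* Illustrative only (not compiled): how a fetched brlock record is decoded.
   The record body is just an array of struct lock_struct, so the entry
   count falls out of the tdb record size - the same cast-and-divide that
   brl_lock() and friends perform below. */
#if 0
static void brl_record_layout_sketch(TDB_DATA dbuf)
{
        struct lock_struct *locks = (struct lock_struct *)dbuf.dptr;
        int count = dbuf.dsize / sizeof(*locks);
        int i;

        for (i=0; i<count; i++) {
                DEBUG(10,("lock %d: fnum=%d start=%.0f size=%.0f\n",
                          i, locks[i].fnum,
                          (double)locks[i].start, (double)locks[i].size));
        }
}
#endif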

/* The key used in the brlock database. */

struct lock_key {
        SMB_DEV_T device;
        SMB_INO_T inode;
};

/* The open brlock.tdb database. */

static TDB_CONTEXT *tdb;

/****************************************************************************
 Create a locking key - ensuring zero filled for pad purposes.
****************************************************************************/

static TDB_DATA locking_key(SMB_DEV_T dev, SMB_INO_T inode)
{
        static struct lock_key key;
        TDB_DATA kbuf;

        memset(&key, '\0', sizeof(key));
        key.device = dev;
        key.inode = inode;
        kbuf.dptr = (char *)&key;
        kbuf.dsize = sizeof(key);
        return kbuf;
}

/****************************************************************************
 See if two locking contexts are equal.
****************************************************************************/

static BOOL brl_same_context(struct lock_context *ctx1, 
                             struct lock_context *ctx2)
{
        return (ctx1->pid == ctx2->pid) &&
                (ctx1->smbpid == ctx2->smbpid) &&
                (ctx1->tid == ctx2->tid);
}

/****************************************************************************
 See if lock2 can be added when lock1 is in place.
****************************************************************************/

static BOOL brl_conflict(struct lock_struct *lck1, 
                         struct lock_struct *lck2)
{
        if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK)
                return False;

        if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) {
                return False;
        }

        if (brl_same_context(&lck1->context, &lck2->context) &&
            lck2->lock_type == READ_LOCK && lck1->fnum == lck2->fnum) {
                return False;
        }

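        /* Ranges are half-open intervals [start, start+size). Two ranges
           overlap unless one starts at or beyond the other's end: a 10 byte
           lock at offset 0 (bytes 0-9) does not conflict with a 5 byte lock
           at offset 10 (bytes 10-14), but does conflict with a 10 byte lock
           at offset 5. */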
        if (lck1->start >= (lck2->start + lck2->size) ||
            lck2->start >= (lck1->start + lck1->size)) {
                return False;
        }

        return True;
}

#if ZERO_ZERO
static BOOL brl_conflict1(struct lock_struct *lck1, 
                         struct lock_struct *lck2)
{
        if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK)
                return False;

        if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) {
                return False;
        }

        if (brl_same_context(&lck1->context, &lck2->context) &&
            lck2->lock_type == READ_LOCK && lck1->fnum == lck2->fnum) {
                return False;
        }

        if (lck2->start == 0 && lck2->size == 0 && lck1->size != 0) {
                return True;
        }

        if (lck1->start >= (lck2->start + lck2->size) ||
            lck2->start >= (lck1->start + lck1->size)) {
                return False;
        }

        return True;
}
#endif

/****************************************************************************
 Check to see if this lock conflicts, but ignore our own locks on the
 same fnum only.
****************************************************************************/

static BOOL brl_conflict_other(struct lock_struct *lck1, struct lock_struct *lck2)
{
        if (lck1->lock_type == PENDING_LOCK || lck2->lock_type == PENDING_LOCK)
                return False;

        if (lck1->lock_type == READ_LOCK && lck2->lock_type == READ_LOCK) 
                return False;

        /*
         * Incoming WRITE locks conflict with existing READ locks even
         * if the context is the same. JRA. See LOCKTEST7 in smbtorture.
         */

        if (!(lck2->lock_type == WRITE_LOCK && lck1->lock_type == READ_LOCK)) {
                if (brl_same_context(&lck1->context, &lck2->context) &&
                                        lck1->fnum == lck2->fnum)
                        return False;
        }

        if (lck1->start >= (lck2->start + lck2->size) ||
            lck2->start >= (lck1->start + lck1->size)) return False;

        return True;
}

#if DONT_DO_THIS
        /* doing this traversal could kill solaris machines under high load (tridge) */
        /* delete any dead locks */

/****************************************************************************
 Delete a record if it is for a dead process. If check_self is true, then
 also delete any records belonging to this pid (there shouldn't be any).
****************************************************************************/

static int delete_fn(TDB_CONTEXT *ttdb, TDB_DATA kbuf, TDB_DATA dbuf, void *state)
{
        struct lock_struct *locks;
        int count, i;
        BOOL check_self = *(BOOL *)state;
        pid_t mypid = sys_getpid();

        tdb_chainlock(tdb, kbuf);

        locks = (struct lock_struct *)dbuf.dptr;

        count = dbuf.dsize / sizeof(*locks);
        for (i=0; i<count; i++) {
                struct lock_struct *lock = &locks[i];

                /* If check_self is true we want to remove our own records. */
                if (check_self && (mypid == lock->context.pid)) {

                        DEBUG(0,("brlock : delete_fn. LOGIC ERROR ! Shutting down and a record for my pid (%u) exists !\n",
                                        (unsigned int)lock->context.pid ));

                } else if (process_exists(lock->context.pid)) {

                        DEBUG(10,("brlock : delete_fn. pid %u exists.\n", (unsigned int)lock->context.pid ));
                        continue;
                }

                DEBUG(10,("brlock : delete_fn. Deleting record for process %u\n",
                                (unsigned int)lock->context.pid ));

                if (count > 1 && i < count-1) {
                        memmove(&locks[i], &locks[i+1], 
                                sizeof(*locks)*((count-1) - i));
                }
                count--;
                i--;
        }

        if (count == 0) {
                tdb_delete(tdb, kbuf);
        } else if (count < (dbuf.dsize / sizeof(*locks))) {
                dbuf.dsize = count * sizeof(*locks);
                tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
        }

        tdb_chainunlock(tdb, kbuf);
        return 0;
}
#endif
240 /****************************************************************************
241  Open up the brlock.tdb database.
242 ****************************************************************************/
243
244 void brl_init(int read_only)
245 {
246         if (tdb)
247                 return;
248         tdb = tdb_open_log(lock_path("brlock.tdb"), 0,  TDB_DEFAULT|(read_only?0x0:TDB_CLEAR_IF_FIRST),
249                        read_only?O_RDONLY:(O_RDWR|O_CREAT), 0644);
250         if (!tdb) {
251                 DEBUG(0,("Failed to open byte range locking database\n"));
252                 return;
253         }
254
255 #if DONT_DO_THIS
256         /* doing this traversal could kill solaris machines under high load (tridge) */
257         /* delete any dead locks */
258         if (!read_only) {
259                 BOOL check_self = False;
260                 tdb_traverse(tdb, delete_fn, &check_self);
261         }
262 #endif
263 }
264
265 /****************************************************************************
266  Close down the brlock.tdb database.
267 ****************************************************************************/
268
269 void brl_shutdown(int read_only)
270 {
271         if (!tdb)
272                 return;
273
274 #if DONT_DO_THIS
275         /* doing this traversal could kill solaris machines under high load (tridge) */
276         /* delete any dead locks */
277         if (!read_only) {
278                 BOOL check_self = True;
279                 tdb_traverse(tdb, delete_fn, &check_self);
280         }
281 #endif
282
283         tdb_close(tdb);
284 }
285
286 #if ZERO_ZERO
287 /****************************************************************************
288 compare two locks for sorting
289 ****************************************************************************/
290 static int lock_compare(struct lock_struct *lck1, 
291                          struct lock_struct *lck2)
292 {
293         if (lck1->start != lck2->start) return (lck1->start - lck2->start);
294         if (lck2->size != lck1->size) {
295                 return ((int)lck1->size - (int)lck2->size);
296         }
297         return 0;
298 }
299 #endif
300
301 /****************************************************************************
302  Lock a range of bytes.
303 ****************************************************************************/
304
305 NTSTATUS brl_lock(SMB_DEV_T dev, SMB_INO_T ino, int fnum,
306                   uint16 smbpid, pid_t pid, uint16 tid,
307                   br_off start, br_off size, 
308                   enum brl_type lock_type)
309 {
310         TDB_DATA kbuf, dbuf;
311         int count, i;
312         struct lock_struct lock, *locks;
313         char *tp;
314         NTSTATUS status = NT_STATUS_OK;
315         static int last_failed = -1;
316         static br_off last_failed_start;
317
318         kbuf = locking_key(dev,ino);
319
320         dbuf.dptr = NULL;
321
322 #if !ZERO_ZERO
323         if (start == 0 && size == 0) {
324                 DEBUG(0,("client sent 0/0 lock - please report this\n"));
325         }
326 #endif
327
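        /* tdb_chainlock() serialises all access to this dev/ino record
           across smbd processes; every exit path below must release it
           with tdb_chainunlock() before returning. */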
        tdb_chainlock(tdb, kbuf);
        dbuf = tdb_fetch(tdb, kbuf);

        lock.context.smbpid = smbpid;
        lock.context.pid = pid;
        lock.context.tid = tid;
        lock.start = start;
        lock.size = size;
        lock.fnum = fnum;
        lock.lock_type = lock_type;

        if (dbuf.dptr) {
                /* there are existing locks - make sure they don't conflict */
                locks = (struct lock_struct *)dbuf.dptr;
                count = dbuf.dsize / sizeof(*locks);
                for (i=0; i<count; i++) {
                        if (brl_conflict(&locks[i], &lock)) {
                                status = NT_STATUS_LOCK_NOT_GRANTED;
                                goto fail;
                        }
#if ZERO_ZERO
                        if (lock.start == 0 && lock.size == 0 && 
                            locks[i].size == 0) {
                                break;
                        }
#endif
                }
        }

        /* no conflicts - add it to the list of locks */
        tp = Realloc(dbuf.dptr, dbuf.dsize + sizeof(*locks));
        if (!tp) {
                status = NT_STATUS_NO_MEMORY;
                goto fail;
        } else {
                dbuf.dptr = tp;
        }
        memcpy(dbuf.dptr + dbuf.dsize, &lock, sizeof(lock));
        dbuf.dsize += sizeof(lock);

#if ZERO_ZERO
        /* sort the lock list */
        qsort(dbuf.dptr, dbuf.dsize/sizeof(lock), sizeof(lock), lock_compare);
#endif

        tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);

        SAFE_FREE(dbuf.dptr);
        tdb_chainunlock(tdb, kbuf);
        return NT_STATUS_OK;

 fail:
        /* this is a nasty hack to try to simulate the lock result cache code in w2k.
           It isn't completely accurate as I haven't yet worked out the correct
           semantics (tridge)
        */
        if (last_failed == fnum &&
            last_failed_start == start &&
            NT_STATUS_EQUAL(status, NT_STATUS_LOCK_NOT_GRANTED)) {
                status = NT_STATUS_FILE_LOCK_CONFLICT;
        }
        last_failed = fnum;
        last_failed_start = start;

        SAFE_FREE(dbuf.dptr);
        tdb_chainunlock(tdb, kbuf);
        return status;
}

/****************************************************************************
 Check if an unlock overlaps a pending lock.
****************************************************************************/

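/* For example, an unlock of 50 bytes at offset 100 overlaps a pending lock
   starting at offset 120 (first test below), and an unlock starting at
   offset 130 overlaps a pending lock of 40 bytes at offset 100 (second
   test below). */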
static BOOL brl_pending_overlap(struct lock_struct *lock, struct lock_struct *pend_lock)
{
        if ((lock->start <= pend_lock->start) && (lock->start + lock->size > pend_lock->start))
                return True;
        if ((lock->start >= pend_lock->start) && (lock->start <= pend_lock->start + pend_lock->size))
                return True;
        return False;
}

/****************************************************************************
 Unlock a range of bytes.
****************************************************************************/

BOOL brl_unlock(SMB_DEV_T dev, SMB_INO_T ino, int fnum,
                uint16 smbpid, pid_t pid, uint16 tid,
                br_off start, br_off size,
                BOOL remove_pending_locks_only,
                void (*pre_unlock_fn)(void *),
                void *pre_unlock_data)
{
        TDB_DATA kbuf, dbuf;
        int count, i, j;
        struct lock_struct *locks;
        struct lock_context context;

        kbuf = locking_key(dev,ino);

        dbuf.dptr = NULL;

        tdb_chainlock(tdb, kbuf);
        dbuf = tdb_fetch(tdb, kbuf);

        if (!dbuf.dptr) {
                DEBUG(10,("brl_unlock: tdb_fetch failed !\n"));
                goto fail;
        }

        context.smbpid = smbpid;
        context.pid = pid;
        context.tid = tid;

        /* there are existing locks - find a match */
        locks = (struct lock_struct *)dbuf.dptr;
        count = dbuf.dsize / sizeof(*locks);

#if ZERO_ZERO
        for (i=0; i<count; i++) {
                struct lock_struct *lock = &locks[i];

                if (lock->lock_type == WRITE_LOCK &&
                    brl_same_context(&lock->context, &context) &&
                    lock->fnum == fnum &&
                    lock->start == start &&
                    lock->size == size) {

                        if (pre_unlock_fn)
                                (*pre_unlock_fn)(pre_unlock_data);

                        /* found it - delete it */
                        if (count == 1) {
                                tdb_delete(tdb, kbuf);
                        } else {
                                if (i < count-1) {
                                        memmove(&locks[i], &locks[i+1], 
                                                sizeof(*locks)*((count-1) - i));
                                }
                                dbuf.dsize -= sizeof(*locks);
                                tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
                        }

                        SAFE_FREE(dbuf.dptr);
                        tdb_chainunlock(tdb, kbuf);
                        return True;
                }
        }
#endif

        locks = (struct lock_struct *)dbuf.dptr;
        count = dbuf.dsize / sizeof(*locks);
        for (i=0; i<count; i++) {
                struct lock_struct *lock = &locks[i];

                if (brl_same_context(&lock->context, &context) &&
                                lock->fnum == fnum &&
                                lock->start == start &&
                                lock->size == size) {

                        if (remove_pending_locks_only && lock->lock_type != PENDING_LOCK)
                                continue;

                        if (lock->lock_type != PENDING_LOCK) {

                                /* Do any POSIX unlocks needed. */
                                if (pre_unlock_fn)
                                        (*pre_unlock_fn)(pre_unlock_data);

                                /* Send unlock messages to any pending waiters that overlap. */
                                for (j=0; j<count; j++) {
                                        struct lock_struct *pend_lock = &locks[j];

                                        /* Ignore non-pending locks. */
                                        if (pend_lock->lock_type != PENDING_LOCK)
                                                continue;

                                        /* We could send specific lock info here... */
                                        if (brl_pending_overlap(lock, pend_lock)) {
                                                DEBUG(10,("brl_unlock: sending unlock message to pid %u\n",
                                                                        (unsigned int)pend_lock->context.pid ));

                                                message_send_pid(pend_lock->context.pid,
                                                                MSG_SMB_UNLOCK,
                                                                NULL, 0, True);
                                        }
                                }
                        }

                        /* found it - delete it */
                        if (count == 1) {
                                tdb_delete(tdb, kbuf);
                        } else {
                                if (i < count-1) {
                                        memmove(&locks[i], &locks[i+1], 
                                                sizeof(*locks)*((count-1) - i));
                                }
                                dbuf.dsize -= sizeof(*locks);
                                tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
                        }

                        SAFE_FREE(dbuf.dptr);
                        tdb_chainunlock(tdb, kbuf);
                        return True;
                }
        }

        /* we didn't find it */

 fail:
        SAFE_FREE(dbuf.dptr);
        tdb_chainunlock(tdb, kbuf);
        return False;
}


/****************************************************************************
 Test if we could add a lock if we wanted to.
****************************************************************************/

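/* The check_self argument selects the conflict test used below: when it is
   true, brl_conflict() is used and our own locks count as conflicts; when
   false, brl_conflict_other() ignores our own locks on the same fnum. */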
BOOL brl_locktest(SMB_DEV_T dev, SMB_INO_T ino, int fnum,
                  uint16 smbpid, pid_t pid, uint16 tid,
                  br_off start, br_off size, 
                  enum brl_type lock_type, int check_self)
{
        TDB_DATA kbuf, dbuf;
        int count, i;
        struct lock_struct lock, *locks;

        kbuf = locking_key(dev,ino);

        dbuf.dptr = NULL;

        tdb_chainlock(tdb, kbuf);
        dbuf = tdb_fetch(tdb, kbuf);

        lock.context.smbpid = smbpid;
        lock.context.pid = pid;
        lock.context.tid = tid;
        lock.start = start;
        lock.size = size;
        lock.fnum = fnum;
        lock.lock_type = lock_type;

        if (dbuf.dptr) {
                /* there are existing locks - make sure they don't conflict */
                locks = (struct lock_struct *)dbuf.dptr;
                count = dbuf.dsize / sizeof(*locks);
                for (i=0; i<count; i++) {
                        if (check_self) {
                                if (brl_conflict(&locks[i], &lock))
                                        goto fail;
                        } else {
                                /*
                                 * Our own locks don't conflict.
                                 */
                                if (brl_conflict_other(&locks[i], &lock))
                                        goto fail;
                        }
                }
        }

        /* no conflicts - we could have added it */
        SAFE_FREE(dbuf.dptr);
        tdb_chainunlock(tdb, kbuf);
        return True;

 fail:
        SAFE_FREE(dbuf.dptr);
        tdb_chainunlock(tdb, kbuf);
        return False;
}

/****************************************************************************
 Remove any locks associated with an open file.
****************************************************************************/

void brl_close(SMB_DEV_T dev, SMB_INO_T ino, pid_t pid, int tid, int fnum)
{
        TDB_DATA kbuf, dbuf;
        int count, i, j, dcount=0;
        struct lock_struct *locks;

        kbuf = locking_key(dev,ino);

        dbuf.dptr = NULL;

        tdb_chainlock(tdb, kbuf);
        dbuf = tdb_fetch(tdb, kbuf);

        if (!dbuf.dptr) goto fail;

        /* there are existing locks - remove any for this fnum */
        locks = (struct lock_struct *)dbuf.dptr;
        count = dbuf.dsize / sizeof(*locks);

        for (i=0; i<count; i++) {
                struct lock_struct *lock = &locks[i];

                if (lock->context.tid == tid &&
                    lock->context.pid == pid &&
                    lock->fnum == fnum) {

                        /* Send unlock messages to any pending waiters that overlap. */
                        for (j=0; j<count; j++) {
                                struct lock_struct *pend_lock = &locks[j];

                                /* Ignore our own or non-pending locks. */
                                if (pend_lock->lock_type != PENDING_LOCK)
                                        continue;

                                if (pend_lock->context.tid == tid &&
                                    pend_lock->context.pid == pid &&
                                    pend_lock->fnum == fnum)
                                        continue;

                                /* We could send specific lock info here... */
                                if (brl_pending_overlap(lock, pend_lock))
                                        message_send_pid(pend_lock->context.pid,
                                                        MSG_SMB_UNLOCK,
                                                        NULL, 0, True);
                        }

                        /* found it - delete it */
                        if (count > 1 && i < count-1) {
                                memmove(&locks[i], &locks[i+1], 
                                        sizeof(*locks)*((count-1) - i));
                        }
                        count--;
                        i--;
                        dcount++;
                }
        }

        if (count == 0) {
                tdb_delete(tdb, kbuf);
        } else if (count < (dbuf.dsize / sizeof(*locks))) {
                dbuf.dsize -= dcount * sizeof(*locks);
                tdb_store(tdb, kbuf, dbuf, TDB_REPLACE);
        }

        /* we didn't find it */
 fail:
        SAFE_FREE(dbuf.dptr);
        tdb_chainunlock(tdb, kbuf);
}

/****************************************************************************
 Traverse the whole database with this function, calling traverse_callback
 on each lock.
****************************************************************************/

static int traverse_fn(TDB_CONTEXT *ttdb, TDB_DATA kbuf, TDB_DATA dbuf, void *state)
{
        struct lock_struct *locks;
        struct lock_key *key;
        int i;

        BRLOCK_FN(traverse_callback) = (BRLOCK_FN_CAST())state;

        locks = (struct lock_struct *)dbuf.dptr;
        key = (struct lock_key *)kbuf.dptr;

        for (i=0;i<dbuf.dsize/sizeof(*locks);i++) {
                traverse_callback(key->device, key->inode,
                                  locks[i].context.pid,
                                  locks[i].lock_type,
                                  locks[i].start,
                                  locks[i].size);
        }
        return 0;
}

/*******************************************************************
 Call the specified function on each lock in the database.
********************************************************************/

int brl_forall(BRLOCK_FN(fn))
{
        if (!tdb) return 0;
        return tdb_traverse(tdb, traverse_fn, (void *)fn);
}
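
/* Illustrative only (not compiled): a sketch of a brl_forall() callback.
   The parameter list simply mirrors the call made in traverse_fn() above;
   the real callback type is whatever the BRLOCK_FN macro (declared
   elsewhere) expands to. */
#if 0
static void brl_dump_entry(SMB_DEV_T dev, SMB_INO_T ino, pid_t pid,
                           enum brl_type lock_type, br_off start, br_off size)
{
        DEBUG(10,("brl entry: pid %u type %d start %.0f size %.0f\n",
                  (unsigned int)pid, (int)lock_type,
                  (double)start, (double)size));
}

/* Typical use: brl_forall(brl_dump_entry); */
#endif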