Merge branch 'reiserfs/kill-bkl' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 9 Dec 2009 15:58:15 +0000 (07:58 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 9 Dec 2009 15:58:15 +0000 (07:58 -0800)
* 'reiserfs/kill-bkl' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing: (31 commits)
  kill-the-bkl/reiserfs: turn GFP_ATOMIC flag to GFP_NOFS in reiserfs_get_block()
  kill-the-bkl/reiserfs: drop the fs race watchdog from _get_block_create_0()
  kill-the-bkl/reiserfs: definitely drop the bkl from reiserfs_ioctl()
  kill-the-bkl/reiserfs: always lock the ioctl path
  kill-the-bkl/reiserfs: fix reiserfs lock to cpu_add_remove_lock dependency
  kill-the-bkl/reiserfs: Fix induced mm->mmap_sem to sysfs_mutex dependency
  kill-the-bkl/reiserfs: panic in case of lock imbalance
  kill-the-bkl/reiserfs: fix recursive reiserfs write lock in reiserfs_commit_write()
  kill-the-bkl/reiserfs: fix recursive reiserfs lock in reiserfs_mkdir()
  kill-the-bkl/reiserfs: fix "reiserfs lock" / "inode mutex" lock inversion dependency
  kill-the-bkl/reiserfs: move the concurrent tree accesses checks per superblock
  kill-the-bkl/reiserfs: acquire the inode mutex safely
  kill-the-bkl/reiserfs: unlock only when needed in search_by_key
  kill-the-bkl/reiserfs: use mutex_lock in reiserfs_mutex_lock_safe
  kill-the-bkl/reiserfs: factorize the locking in reiserfs_write_end()
  kill-the-bkl/reiserfs: reduce number of contentions in search_by_key()
  kill-the-bkl/reiserfs: don't hold the write recursively in reiserfs_lookup()
  kill-the-bkl/reiserfs: lock only once on reiserfs_get_block()
  kill-the-bkl/reiserfs: conditionaly release the write lock on fs_changed()
  kill-the-BKL/reiserfs: add reiserfs_cond_resched()
  ...

18 files changed:
fs/reiserfs/Makefile
fs/reiserfs/bitmap.c
fs/reiserfs/dir.c
fs/reiserfs/do_balan.c
fs/reiserfs/file.c
fs/reiserfs/fix_node.c
fs/reiserfs/inode.c
fs/reiserfs/ioctl.c
fs/reiserfs/journal.c
fs/reiserfs/lock.c [new file with mode: 0644]
fs/reiserfs/namei.c
fs/reiserfs/prints.c
fs/reiserfs/resize.c
fs/reiserfs/stree.c
fs/reiserfs/super.c
fs/reiserfs/xattr.c
include/linux/reiserfs_fs.h
include/linux/reiserfs_fs_sb.h

index 7c5ab6330dd6bbe4edbd1f6b62a3a1d5a3f6b645..6a9e30c041dda2a52a44bbe14d9aa664dae14bb7 100644 (file)
@@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
                 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
                 hashes.o tail_conversion.o journal.o resize.o \
-                item_ops.o ioctl.o procfs.o xattr.o
+                item_ops.o ioctl.o procfs.o xattr.o lock.o
 
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
 reiserfs-objs += xattr_user.o xattr_trusted.o
index e716161ab325c8f246b33c11110a60fff351100f..685495707181c7f168ec280e69b72ba3a1bcc230 100644 (file)
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
        else if (bitmap == 0)
                block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
 
+       reiserfs_write_unlock(sb);
        bh = sb_bread(sb, block);
+       reiserfs_write_lock(sb);
        if (bh == NULL)
                reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
                                 "reading failed", __func__, block);
        else {
                if (buffer_locked(bh)) {
                        PROC_INFO_INC(sb, scan_bitmap.wait);
+                       reiserfs_write_unlock(sb);
                        __wait_on_buffer(bh);
+                       reiserfs_write_lock(sb);
                }
                BUG_ON(!buffer_uptodate(bh));
                BUG_ON(atomic_read(&bh->b_count) == 0);
index 6d2668fdc3848eb5b2be29d027c5634c727148f6..c094f58c7448b06d1da88d87ebbbeb4e27dd7872 100644 (file)
@@ -20,7 +20,7 @@ const struct file_operations reiserfs_dir_operations = {
        .read = generic_read_dir,
        .readdir = reiserfs_readdir,
        .fsync = reiserfs_dir_fsync,
-       .ioctl = reiserfs_ioctl,
+       .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
@@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
                                // user space buffer is swapped out. At that time
                                // entry can move to somewhere else
                                memcpy(local_buf, d_name, d_reclen);
+
+                               /*
+                                * Since filldir might sleep, we can release
+                                * the write lock here for other waiters
+                                */
+                               reiserfs_write_unlock(inode->i_sb);
                                if (filldir
                                    (dirent, local_buf, d_reclen, d_off, d_ino,
                                     DT_UNKNOWN) < 0) {
+                                       reiserfs_write_lock(inode->i_sb);
                                        if (local_buf != small_buf) {
                                                kfree(local_buf);
                                        }
                                        goto end;
                                }
+                               reiserfs_write_lock(inode->i_sb);
                                if (local_buf != small_buf) {
                                        kfree(local_buf);
                                }
index 128d3f7c8aa5a56a9ca6ad38487034dbdcaf9b92..60c08044066160e7b2289c265ab33d7ee00c3c96 100644 (file)
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 
-#ifdef CONFIG_REISERFS_CHECK
-
-struct tree_balance *cur_tb = NULL;    /* detects whether more than one
-                                          copy of tb exists as a means
-                                          of checking whether schedule
-                                          is interrupting do_balance */
-#endif
-
 static inline void buffer_info_init_left(struct tree_balance *tb,
                                          struct buffer_info *bi)
 {
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
 {
        int retval = 0;
 
-       if (cur_tb) {
+       if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
                               "occurred based on cur_tb not being null at "
                               "this point in code. do_balance cannot properly "
-                              "handle schedule occurring while it runs.");
+                              "handle concurrent tree accesses on a same "
+                              "mount point.");
        }
 
        /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
             "check");*/
        RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
 #ifdef CONFIG_REISERFS_CHECK
-       cur_tb = tb;
+       REISERFS_SB(tb->tb_sb)->cur_tb = tb;
 #endif
 }
 
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
 #ifdef CONFIG_REISERFS_CHECK
        check_leaf_level(tb);
        check_internal_levels(tb);
-       cur_tb = NULL;
+       REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
 #endif
 
        /* reiserfs_free_block is no longer schedule safe.  So, we need to
index 9f436668b7f816e6b40bad3759fb7f32268344c6..da2dba082e2d4e6ba5191bf938d134ca647d3877 100644 (file)
@@ -284,7 +284,7 @@ static ssize_t reiserfs_file_write(struct file *file,       /* the file we are going t
 const struct file_operations reiserfs_file_operations = {
        .read = do_sync_read,
        .write = reiserfs_file_write,
-       .ioctl = reiserfs_ioctl,
+       .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
index 5e5a4e6fbaf8290d2bf91799d7e116390256ccf0..d2f31330dcae7ecd80a67a0eec638c941822bd2c 100644 (file)
@@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
        return needed_nodes;
 }
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 
 /* Set parameters for balancing.
  * Performs write of results of analysis of balancing into structure tb,
@@ -1022,7 +1019,11 @@ static int get_far_parent(struct tree_balance *tb,
        /* Check whether the common parent is locked. */
 
        if (buffer_locked(*pcom_father)) {
+
+               /* Release the write lock while the buffer is busy */
+               reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(*pcom_father);
+               reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb)) {
                        brelse(*pcom_father);
                        return REPEAT_SEARCH;
@@ -1927,7 +1928,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
                return REPEAT_SEARCH;
 
        if (buffer_locked(bh)) {
+               reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(bh);
+               reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
@@ -1965,7 +1968,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
                                                                       FL[h]);
                son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
+               reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+               reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2008,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                child_position =
                    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
                son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+               reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+               reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2285,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
                                    REPEAT_SEARCH : CARRY_ON;
                        }
 #endif
+                       reiserfs_write_unlock(tb->tb_sb);
                        __wait_on_buffer(locked);
+                       reiserfs_write_lock(tb->tb_sb);
                        if (FILESYSTEM_CHANGED_TB(tb))
                                return REPEAT_SEARCH;
                }
@@ -2349,12 +2358,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
 
        /* if it possible in indirect_to_direct conversion */
        if (buffer_locked(tbS0)) {
+               reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(tbS0);
+               reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
 #ifdef CONFIG_REISERFS_CHECK
-       if (cur_tb) {
+       if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                print_cur_tb("fix_nodes");
                reiserfs_panic(tb->tb_sb, "PAP-8305",
                               "there is pending do_balance");
index a14d6cd9eeda2670251c0ec3377518912cb465a2..3a28e7751b3c714da6e3b2c0b7fe9ad89e82a87b 100644 (file)
@@ -251,7 +251,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        struct cpu_key key;
        struct buffer_head *bh;
        struct item_head *ih, tmp_ih;
-       int fs_gen;
        b_blocknr_t blocknr;
        char *p = NULL;
        int chars;
@@ -265,7 +264,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
                     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
                     3);
 
-      research:
        result = search_for_position_by_key(inode->i_sb, &key, &path);
        if (result != POSITION_FOUND) {
                pathrelse(&path);
@@ -340,7 +338,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        }
        // read file tail into part of page
        offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
-       fs_gen = get_generation(inode->i_sb);
        copy_item_head(&tmp_ih, ih);
 
        /* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +345,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
         ** sure we need to.  But, this means the item might move if
         ** kmap schedules
         */
-       if (!p) {
+       if (!p)
                p = (char *)kmap(bh_result->b_page);
-               if (fs_changed(fs_gen, inode->i_sb)
-                   && item_moved(&tmp_ih, &path)) {
-                       goto research;
-               }
-       }
+
        p += offset;
        memset(p, 0, inode->i_sb->s_blocksize);
        do {
@@ -489,10 +482,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
           disappeared */
        if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
                int err;
-               lock_kernel();
+
+               reiserfs_write_lock(inode->i_sb);
+
                err = reiserfs_commit_for_inode(inode);
                REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-               unlock_kernel();
+
+               reiserfs_write_unlock(inode->i_sb);
+
                if (err < 0)
                        ret = err;
        }
@@ -601,6 +598,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        __le32 *item;
        int done;
        int fs_gen;
+       int lock_depth;
        struct reiserfs_transaction_handle *th = NULL;
        /* space reserved in transaction batch:
           . 3 balancings in direct->indirect conversion
@@ -616,12 +614,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        loff_t new_offset =
            (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
 
-       /* bad.... */
-       reiserfs_write_lock(inode->i_sb);
+       lock_depth = reiserfs_write_lock_once(inode->i_sb);
        version = get_inode_item_key_version(inode);
 
        if (!file_capable(inode, block)) {
-               reiserfs_write_unlock(inode->i_sb);
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return -EFBIG;
        }
 
@@ -633,7 +630,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                /* find number of block-th logical block of the file */
                ret = _get_block_create_0(inode, block, bh_result,
                                          create | GET_BLOCK_READ_DIRECT);
-               reiserfs_write_unlock(inode->i_sb);
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return ret;
        }
        /*
@@ -751,7 +748,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                if (!dangle && th)
                        retval = reiserfs_end_persistent_transaction(th);
 
-               reiserfs_write_unlock(inode->i_sb);
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 
                /* the item was found, so new blocks were not added to the file
                 ** there is no need to make sure the inode is updated with this
@@ -935,7 +932,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (blocks_needed == 1) {
                                un = &unf_single;
                        } else {
-                               un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);      // We need to avoid scheduling.
+                               un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
                                if (!un) {
                                        un = &unf_single;
                                        blocks_needed = 1;
@@ -997,10 +994,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (retval)
                                goto failure;
                }
-               /* inserting indirect pointers for a hole can take a
-                ** long time.  reschedule if needed
+               /*
+                * inserting indirect pointers for a hole can take a
+                * long time.  reschedule if needed and also release the write
+                * lock for others.
                 */
-               cond_resched();
+               if (need_resched()) {
+                       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+                       schedule();
+                       lock_depth = reiserfs_write_lock_once(inode->i_sb);
+               }
 
                retval = search_for_position_by_key(inode->i_sb, &key, &path);
                if (retval == IO_ERROR) {
@@ -1035,7 +1038,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        retval = err;
        }
 
-       reiserfs_write_unlock(inode->i_sb);
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        reiserfs_check_path(&path);
        return retval;
 }
@@ -2072,8 +2075,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
        int error;
        struct buffer_head *bh = NULL;
        int err2;
+       int lock_depth;
 
-       reiserfs_write_lock(inode->i_sb);
+       lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
        if (inode->i_size > 0) {
                error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2146,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
                page_cache_release(page);
        }
 
-       reiserfs_write_unlock(inode->i_sb);
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+
        return 0;
       out:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
        }
-       reiserfs_write_unlock(inode->i_sb);
+
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+
        return error;
 }
 
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
        int ret;
        int old_ref = 0;
 
+       reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+       reiserfs_write_lock(inode->i_sb);
+
        fix_tail_page_for_writing(page);
        if (reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th;
@@ -2664,6 +2674,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th;
        unsigned start;
+       int lock_depth = 0;
+       bool locked = false;
 
        if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
                pos ++;
@@ -2690,9 +2702,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
         ** to do the i_size updates here.
         */
        pos += copied;
+
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-               reiserfs_write_lock(inode->i_sb);
+               lock_depth = reiserfs_write_lock_once(inode->i_sb);
+               locked = true;
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2703,10 +2717,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 
                ret = journal_begin(&myth, inode->i_sb, 1);
-               if (ret) {
-                       reiserfs_write_unlock(inode->i_sb);
+               if (ret)
                        goto journal_error;
-               }
+
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2718,34 +2731,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
+               if (!locked) {
+                       lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                       locked = true;
+               }
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
 
       out:
+       if (locked)
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        unlock_page(page);
        page_cache_release(page);
        return ret == 0 ? copied : ret;
 
       journal_error:
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+       locked = false;
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
        }
-
        goto out;
 }
 
@@ -2758,7 +2773,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th = NULL;
 
+       reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+       reiserfs_write_lock(inode->i_sb);
+
        if (reiserfs_transaction_running(inode->i_sb)) {
                th = current->journal_info;
        }
@@ -2770,7 +2788,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
         */
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-               reiserfs_write_lock(inode->i_sb);
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2781,10 +2798,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 
                ret = journal_begin(&myth, inode->i_sb, 1);
-               if (ret) {
-                       reiserfs_write_unlock(inode->i_sb);
+               if (ret)
                        goto journal_error;
-               }
+
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2796,16 +2812,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
@@ -2815,11 +2828,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 
       journal_error:
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
        }
 
        return ret;
index 0ccc3fdda7bfb7d5d00e59e8b26e74a0331e6d3c..ace77451ceb16d3280a94e4bebc1044e0c6920f3 100644 (file)
 #include <linux/compat.h>
 
 /*
-** reiserfs_ioctl - handler for ioctl for inode
-** supported commands:
-**  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
-**                           and prevent packing file (argument arg has to be non-zero)
-**  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
-**  3) That's all for a while ...
-*/
-int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-                  unsigned long arg)
+ * reiserfs_ioctl - handler for ioctl for inode
+ * supported commands:
+ *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
+ *                           and prevent packing file (argument arg has to be non-zero)
+ *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
+ *  3) That's all for a while ...
+ */
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+       struct inode *inode = filp->f_path.dentry->d_inode;
        unsigned int flags;
        int err = 0;
 
+       reiserfs_write_lock(inode->i_sb);
+
        switch (cmd) {
        case REISERFS_IOC_UNPACK:
                if (S_ISREG(inode->i_mode)) {
                        if (arg)
-                               return reiserfs_unpack(inode, filp);
-                       else
-                               return 0;
+                               err = reiserfs_unpack(inode, filp);
                } else
-                       return -ENOTTY;
-               /* following two cases are taken from fs/ext2/ioctl.c by Remy
-                  Card (card@masi.ibp.fr) */
+                       err = -ENOTTY;
+               break;
+               /*
+                * following two cases are taken from fs/ext2/ioctl.c by Remy
+                * Card (card@masi.ibp.fr)
+                */
        case REISERFS_IOC_GETFLAGS:
-               if (!reiserfs_attrs(inode->i_sb))
-                       return -ENOTTY;
+               if (!reiserfs_attrs(inode->i_sb)) {
+                       err = -ENOTTY;
+                       break;
+               }
 
                flags = REISERFS_I(inode)->i_attrs;
                i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
-               return put_user(flags, (int __user *)arg);
+               err = put_user(flags, (int __user *)arg);
+               break;
        case REISERFS_IOC_SETFLAGS:{
-                       if (!reiserfs_attrs(inode->i_sb))
-                               return -ENOTTY;
+                       if (!reiserfs_attrs(inode->i_sb)) {
+                               err = -ENOTTY;
+                               break;
+                       }
 
                        err = mnt_want_write(filp->f_path.mnt);
                        if (err)
-                               return err;
+                               break;
 
                        if (!is_owner_or_cap(inode)) {
                                err = -EPERM;
@@ -90,16 +98,18 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                        mark_inode_dirty(inode);
 setflags_out:
                        mnt_drop_write(filp->f_path.mnt);
-                       return err;
+                       break;
                }
        case REISERFS_IOC_GETVERSION:
-               return put_user(inode->i_generation, (int __user *)arg);
+               err = put_user(inode->i_generation, (int __user *)arg);
+               break;
        case REISERFS_IOC_SETVERSION:
                if (!is_owner_or_cap(inode))
-                       return -EPERM;
+                       err = -EPERM;
+                       break;
                err = mnt_want_write(filp->f_path.mnt);
                if (err)
-                       return err;
+                       break;
                if (get_user(inode->i_generation, (int __user *)arg)) {
                        err = -EFAULT;
                        goto setversion_out;
@@ -108,19 +118,20 @@ setflags_out:
                mark_inode_dirty(inode);
 setversion_out:
                mnt_drop_write(filp->f_path.mnt);
-               return err;
+               break;
        default:
-               return -ENOTTY;
+               err = -ENOTTY;
        }
+
+       reiserfs_write_unlock(inode->i_sb);
+
+       return err;
 }
 
 #ifdef CONFIG_COMPAT
 long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
 {
-       struct inode *inode = file->f_path.dentry->d_inode;
-       int ret;
-
        /* These are just misnamed, they actually get/put from/to user an int */
        switch (cmd) {
        case REISERFS_IOC32_UNPACK:
@@ -141,10 +152,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
        default:
                return -ENOIOCTLCMD;
        }
-       lock_kernel();
-       ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-       unlock_kernel();
-       return ret;
+
+       return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
 
index 90622200b39c0622e0f159d423c929a036d76257..2f8a7e7b8dabf04b8641e7b4426b02b3bba8e570 100644 (file)
@@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
        clear_buffer_journal_restore_dirty(bh);
 }
 
-/* utility function to force a BUG if it is called without the big
-** kernel lock held.  caller is the string printed just before calling BUG()
-*/
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
-#ifdef CONFIG_SMP
-       if (current->lock_depth < 0) {
-               reiserfs_panic(sb, "journal-1", "%s called without kernel "
-                              "lock held", caller);
-       }
-#else
-       ;
-#endif
-}
-
 /* return a cnode with same dev, block number and size in table, or null if not found */
 static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
                                                                  super_block
@@ -556,7 +541,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
 static inline void lock_journal(struct super_block *sb)
 {
        PROC_INFO_INC(sb, journal.lock_journal);
-       mutex_lock(&SB_JOURNAL(sb)->j_mutex);
+
+       reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
 }
 
 /* unlock the current transaction */
@@ -708,7 +694,9 @@ static void check_barrier_completion(struct super_block *s,
                disable_barrier(s);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
+               reiserfs_write_unlock(s);
                sync_dirty_buffer(bh);
+               reiserfs_write_lock(s);
        }
 }
 
@@ -996,8 +984,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
 {
        DEFINE_WAIT(wait);
        struct reiserfs_journal *j = SB_JOURNAL(s);
-       if (atomic_read(&j->j_async_throttle))
+
+       if (atomic_read(&j->j_async_throttle)) {
+               reiserfs_write_unlock(s);
                congestion_wait(BLK_RW_ASYNC, HZ / 10);
+               reiserfs_write_lock(s);
+       }
+
        return 0;
 }
 
@@ -1043,7 +1036,8 @@ static int flush_commit_list(struct super_block *s,
        }
 
        /* make sure nobody is trying to flush this one at the same time */
-       mutex_lock(&jl->j_commit_mutex);
+       reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
+
        if (!journal_list_still_alive(s, trans_id)) {
                mutex_unlock(&jl->j_commit_mutex);
                goto put_jl;
@@ -1061,12 +1055,17 @@ static int flush_commit_list(struct super_block *s,
 
        if (!list_empty(&jl->j_bh_list)) {
                int ret;
-               unlock_kernel();
+
+               /*
+                * We might sleep in numerous places inside
+                * write_ordered_buffers. Relax the write lock.
+                */
+               reiserfs_write_unlock(s);
                ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                            journal, jl, &jl->j_bh_list);
                if (ret < 0 && retval == 0)
                        retval = ret;
-               lock_kernel();
+               reiserfs_write_lock(s);
        }
        BUG_ON(!list_empty(&jl->j_bh_list));
        /*
@@ -1085,8 +1084,11 @@ static int flush_commit_list(struct super_block *s,
                    SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
                if (tbh) {
-                       if (buffer_dirty(tbh))
-                           ll_rw_block(WRITE, 1, &tbh) ;
+                       if (buffer_dirty(tbh)) {
+                           reiserfs_write_unlock(s);
+                           ll_rw_block(WRITE, 1, &tbh);
+                           reiserfs_write_lock(s);
+                       }
                        put_bh(tbh) ;
                }
        }
@@ -1114,12 +1116,19 @@ static int flush_commit_list(struct super_block *s,
                bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
                    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
+
+               reiserfs_write_unlock(s);
                wait_on_buffer(tbh);
+               reiserfs_write_lock(s);
                // since we're using ll_rw_blk above, it might have skipped over
                // a locked buffer.  Double check here
                //
-               if (buffer_dirty(tbh))  /* redundant, sync_dirty_buffer() checks */
+               /* redundant, sync_dirty_buffer() checks */
+               if (buffer_dirty(tbh)) {
+                       reiserfs_write_unlock(s);
                        sync_dirty_buffer(tbh);
+                       reiserfs_write_lock(s);
+               }
                if (unlikely(!buffer_uptodate(tbh))) {
 #ifdef CONFIG_REISERFS_CHECK
                        reiserfs_warning(s, "journal-601",
@@ -1143,10 +1152,15 @@ static int flush_commit_list(struct super_block *s,
                        if (buffer_dirty(jl->j_commit_bh))
                                BUG();
                        mark_buffer_dirty(jl->j_commit_bh) ;
+                       reiserfs_write_unlock(s);
                        sync_dirty_buffer(jl->j_commit_bh) ;
+                       reiserfs_write_lock(s);
                }
-       } else
+       } else {
+               reiserfs_write_unlock(s);
                wait_on_buffer(jl->j_commit_bh);
+               reiserfs_write_lock(s);
+       }
 
        check_barrier_completion(s, jl->j_commit_bh);
 
@@ -1286,7 +1300,9 @@ static int _update_journal_header_block(struct super_block *sb,
 
        if (trans_id >= journal->j_last_flush_trans_id) {
                if (buffer_locked((journal->j_header_bh))) {
+                       reiserfs_write_unlock(sb);
                        wait_on_buffer((journal->j_header_bh));
+                       reiserfs_write_lock(sb);
                        if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
                                reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1328,16 @@ static int _update_journal_header_block(struct super_block *sb,
                                disable_barrier(sb);
                                goto sync;
                        }
+                       reiserfs_write_unlock(sb);
                        wait_on_buffer(journal->j_header_bh);
+                       reiserfs_write_lock(sb);
                        check_barrier_completion(sb, journal->j_header_bh);
                } else {
                      sync:
                        set_buffer_dirty(journal->j_header_bh);
+                       reiserfs_write_unlock(sb);
                        sync_dirty_buffer(journal->j_header_bh);
+                       reiserfs_write_lock(sb);
                }
                if (!buffer_uptodate(journal->j_header_bh)) {
                        reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
 
        /* if flushall == 0, the lock is already held */
        if (flushall) {
-               mutex_lock(&journal->j_flush_mutex);
+               reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        } else if (mutex_trylock(&journal->j_flush_mutex)) {
                BUG();
        }
@@ -1553,7 +1573,11 @@ static int flush_journal_list(struct super_block *s,
                                        reiserfs_panic(s, "journal-1011",
                                                       "cn->bh is NULL");
                                }
+
+                               reiserfs_write_unlock(s);
                                wait_on_buffer(cn->bh);
+                               reiserfs_write_lock(s);
+
                                if (!cn->bh) {
                                        reiserfs_panic(s, "journal-1012",
                                                       "cn->bh is NULL");
@@ -1769,7 +1793,7 @@ static int kupdate_transactions(struct super_block *s,
        struct reiserfs_journal *journal = SB_JOURNAL(s);
        chunk.nr = 0;
 
-       mutex_lock(&journal->j_flush_mutex);
+       reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        if (!journal_list_still_alive(s, orig_trans_id)) {
                goto done;
        }
@@ -1973,11 +1997,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
        reiserfs_mounted_fs_count--;
        /* wait for all commits to finish */
        cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
+
+       /*
+        * We must release the write lock here because
+        * the workqueue job (flush_async_commit) needs this lock
+        */
+       reiserfs_write_unlock(sb);
        flush_workqueue(commit_wq);
+
        if (!reiserfs_mounted_fs_count) {
                destroy_workqueue(commit_wq);
                commit_wq = NULL;
        }
+       reiserfs_write_lock(sb);
 
        free_journal_ram(sb);
 
@@ -2243,7 +2275,11 @@ static int journal_read_transaction(struct super_block *sb,
        /* read in the log blocks, memcpy to the corresponding real block */
        ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
        for (i = 0; i < get_desc_trans_len(desc); i++) {
+
+               reiserfs_write_unlock(sb);
                wait_on_buffer(log_blocks[i]);
+               reiserfs_write_lock(sb);
+
                if (!buffer_uptodate(log_blocks[i])) {
                        reiserfs_warning(sb, "journal-1212",
                                         "REPLAY FAILURE fsck required! "
@@ -2765,11 +2801,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
                goto free_and_return;
        }
 
+       /*
+        * We need to unlock here to avoid creating the following
+        * dependency:
+        * reiserfs_lock -> sysfs_mutex
+        * Because the reiserfs mmap path creates the following dependency:
+        * mm->mmap -> reiserfs_lock, hence we have
+        * mm->mmap -> reiserfs_lock ->sysfs_mutex
+        * This would ends up in a circular dependency with sysfs readdir path
+        * which does sysfs_mutex -> mm->mmap_sem
+        * This is fine because the reiserfs lock is useless in mount path,
+        * at least until we call journal_begin. We keep it for paranoid
+        * reasons.
+        */
+       reiserfs_write_unlock(sb);
        if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+               reiserfs_write_lock(sb);
                reiserfs_warning(sb, "sh-462",
                                 "unable to initialize jornal device");
                goto free_and_return;
        }
+       reiserfs_write_lock(sb);
 
        rs = SB_DISK_SUPER_BLOCK(sb);
 
@@ -2881,8 +2933,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        }
 
        reiserfs_mounted_fs_count++;
-       if (reiserfs_mounted_fs_count <= 1)
+       if (reiserfs_mounted_fs_count <= 1) {
+               reiserfs_write_unlock(sb);
                commit_wq = create_workqueue("reiserfs");
+               reiserfs_write_lock(sb);
+       }
 
        INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
        journal->j_work_sb = sb;
@@ -2964,8 +3019,11 @@ static void queue_log_writer(struct super_block *s)
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&journal->j_join_wait, &wait);
        set_current_state(TASK_UNINTERRUPTIBLE);
-       if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
+       if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+               reiserfs_write_unlock(s);
                schedule();
+               reiserfs_write_lock(s);
+       }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&journal->j_join_wait, &wait);
 }
@@ -2982,7 +3040,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        unsigned long bcount = journal->j_bcount;
        while (1) {
+               reiserfs_write_unlock(sb);
                schedule_timeout_uninterruptible(1);
+               reiserfs_write_lock(sb);
                journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
                while ((atomic_read(&journal->j_wcount) > 0 ||
                        atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3093,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 
        if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
                unlock_journal(sb);
+               reiserfs_write_unlock(sb);
                reiserfs_wait_on_write_block(sb);
+               reiserfs_write_lock(sb);
                PROC_INFO_INC(sb, journal.journal_relock_writers);
                goto relock;
        }
@@ -3506,14 +3568,14 @@ static void flush_async_commits(struct work_struct *work)
        struct reiserfs_journal_list *jl;
        struct list_head *entry;
 
-       lock_kernel();
+       reiserfs_write_lock(sb);
        if (!list_empty(&journal->j_journal_list)) {
                /* last entry is the youngest, commit it and you get everything */
                entry = journal->j_journal_list.prev;
                jl = JOURNAL_LIST_ENTRY(entry);
                flush_commit_list(sb, jl, 1);
        }
-       unlock_kernel();
+       reiserfs_write_unlock(sb);
 }
 
 /*
@@ -4041,7 +4103,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * the new transaction is fully setup, and we've already flushed the
         * ordered bh list
         */
-       mutex_lock(&jl->j_commit_mutex);
+       reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
 
        /* save the transaction id in case we need to commit it later */
        commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4218,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
                next = cn->next;
                free_cnode(sb, cn);
                cn = next;
+               reiserfs_write_unlock(sb);
                cond_resched();
+               reiserfs_write_lock(sb);
        }
 
        /* we are done  with both the c_bh and d_bh, but
@@ -4203,10 +4267,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * is lost.
         */
        if (!list_empty(&jl->j_tail_bh_list)) {
-               unlock_kernel();
+               reiserfs_write_unlock(sb);
                write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                      journal, jl, &jl->j_tail_bh_list);
-               lock_kernel();
+               reiserfs_write_lock(sb);
        }
        BUG_ON(!list_empty(&jl->j_tail_bh_list));
        mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644 (file)
index 0000000..ee2cfc0
--- /dev/null
@@ -0,0 +1,88 @@
+#include <linux/reiserfs_fs.h>
+#include <linux/mutex.h>
+
+/*
+ * The previous reiserfs locking scheme was heavily based on
+ * the tricky properties of the Bkl:
+ *
+ * - it was acquired recursively by a same task
+ * - the performances relied on the release-while-schedule() property
+ *
+ * Now that we replace it by a mutex, we still want to keep the same
+ * recursive property to avoid big changes in the code structure.
+ * We use our own lock_owner here because the owner field on a mutex
+ * is only available in SMP or mutex debugging, also we only need this field
+ * for this mutex, no need for a system wide mutex facility.
+ *
+ * Also this lock is often released before a call that could block because
+ * reiserfs performances were partialy based on the release while schedule()
+ * property of the Bkl.
+ */
+void reiserfs_write_lock(struct super_block *s)
+{
+       struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+       if (sb_i->lock_owner != current) {
+               mutex_lock(&sb_i->lock);
+               sb_i->lock_owner = current;
+       }
+
+       /* No need to protect it, only the current task touches it */
+       sb_i->lock_depth++;
+}
+
+void reiserfs_write_unlock(struct super_block *s)
+{
+       struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+       /*
+        * Are we unlocking without even holding the lock?
+        * Such a situation must raise a BUG() if we don't want
+        * to corrupt the data.
+        */
+       BUG_ON(sb_i->lock_owner != current);
+
+       if (--sb_i->lock_depth == -1) {
+               sb_i->lock_owner = NULL;
+               mutex_unlock(&sb_i->lock);
+       }
+}
+
+/*
+ * If we already own the lock, just exit and don't increase the depth.
+ * Useful when we don't want to lock more than once.
+ *
+ * We always return the lock_depth we had before calling
+ * this function.
+ */
+int reiserfs_write_lock_once(struct super_block *s)
+{
+       struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+       if (sb_i->lock_owner != current) {
+               mutex_lock(&sb_i->lock);
+               sb_i->lock_owner = current;
+               return sb_i->lock_depth++;
+       }
+
+       return sb_i->lock_depth;
+}
+
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
+{
+       if (lock_depth == -1)
+               reiserfs_write_unlock(s);
+}
+
+/*
+ * Utility function to force a BUG if it is called without the superblock
+ * write lock held.  caller is the string printed just before calling BUG()
+ */
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
+{
+       struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+
+       if (sb_i->lock_depth < 0)
+               reiserfs_panic(sb, "%s called without kernel lock held %d",
+                              caller);
+}
index 271579128634242b64c2b96f97c2f86dc0e3435f..e296ff72a6ccb2bc2e99cfbb97472c7da39f3253 100644 (file)
@@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                                      struct nameidata *nd)
 {
        int retval;
+       int lock_depth;
        struct inode *inode = NULL;
        struct reiserfs_dir_entry de;
        INITIALIZE_PATH(path_to_entry);
@@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
        if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
                return ERR_PTR(-ENAMETOOLONG);
 
-       reiserfs_write_lock(dir->i_sb);
+       /*
+        * Might be called with or without the write lock, must be careful
+        * to not recursively hold it in case we want to release the lock
+        * before rescheduling.
+        */
+       lock_depth = reiserfs_write_lock_once(dir->i_sb);
+
        de.de_gen_number_bit_string = NULL;
        retval =
            reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                inode = reiserfs_iget(dir->i_sb,
                                      (struct cpu_key *)&(de.de_dir_id));
                if (!inode || IS_ERR(inode)) {
-                       reiserfs_write_unlock(dir->i_sb);
+                       reiserfs_write_unlock_once(dir->i_sb, lock_depth);
                        return ERR_PTR(-EACCES);
                }
 
@@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                if (IS_PRIVATE(dir))
                        inode->i_flags |= S_PRIVATE;
        }
-       reiserfs_write_unlock(dir->i_sb);
+       reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        if (retval == IO_ERROR) {
                return ERR_PTR(-EIO);
        }
@@ -725,6 +732,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct inode *inode;
        struct reiserfs_transaction_handle th;
        struct reiserfs_security_handle security;
+       int lock_depth;
        /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 3 +
@@ -748,7 +756,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                return retval;
        }
        jbegin_count += retval;
-       reiserfs_write_lock(dir->i_sb);
+       lock_depth = reiserfs_write_lock_once(dir->i_sb);
 
        retval = journal_begin(&th, dir->i_sb, jbegin_count);
        if (retval) {
@@ -798,8 +806,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
-      out_failed:
-       reiserfs_write_unlock(dir->i_sb);
+out_failed:
+       reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        return retval;
 }
 
index 536eacaeb71005a935a95c88dd7afba67535a4ac..adbc6f538515e15f7d7982d97ab1d3fb30728551 100644 (file)
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
 
    .  */
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
-
 void __reiserfs_panic(struct super_block *sb, const char *id,
                      const char *function, const char *fmt, ...)
 {
index 18b315d3d104ea0cebfbb5a1991356038dc8c4fd..b3a94d20f0fcd37b044596369687a7d1cebfa2f5 100644 (file)
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
+                       reiserfs_write_unlock(s);
                        sync_dirty_buffer(bh);
+                       reiserfs_write_lock(s);
                        // update bitmap_info stuff
                        bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
                        brelse(bh);
index d036ee5b1c81a8bd43d8f8836fbd78b68c462acb..5fa7118f04e117c079a14b7da3811450f6d72573 100644 (file)
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key,       /* Key to search for. */
        return ITEM_NOT_FOUND;
 }
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 
 /* Minimal possible key. It is never in the tree. */
 const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
 
 #define SEARCH_BY_KEY_READA 16
 
-/* The function is NOT SCHEDULE-SAFE! */
-static void search_by_key_reada(struct super_block *s,
+/*
+ * The function is NOT SCHEDULE-SAFE!
+ * It might unlock the write lock if we needed to wait for a block
+ * to be read. Note that in this case it won't recover the lock to avoid
+ * high contention resulting from too much lock requests, especially
+ * the caller (search_by_key) will perform other schedule-unsafe
+ * operations just after calling this function.
+ *
+ * @return true if we have unlocked
+ */
+static bool search_by_key_reada(struct super_block *s,
                                struct buffer_head **bh,
                                b_blocknr_t *b, int num)
 {
        int i, j;
+       bool unlocked = false;
 
        for (i = 0; i < num; i++) {
                bh[i] = sb_getblk(s, b[i]);
        }
+       /*
+        * We are going to read some blocks on which we
+        * have a reference. It's safe, though we might be
+        * reading blocks concurrently changed if we release
+        * the lock. But it's still fine because we check later
+        * if the tree changed
+        */
        for (j = 0; j < i; j++) {
                /*
                 * note, this needs attention if we are getting rid of the BKL
                 * you have to make sure the prepared bit isn't set on this buffer
                 */
-               if (!buffer_uptodate(bh[j]))
+               if (!buffer_uptodate(bh[j])) {
+                       if (!unlocked) {
+                               reiserfs_write_unlock(s);
+                               unlocked = true;
+                       }
                        ll_rw_block(READA, 1, bh + j);
+               }
                brelse(bh[j]);
        }
+       return unlocked;
 }
 
 /**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,      /* Key to s
                   have a pointer to it. */
                if ((bh = last_element->pe_buffer =
                     sb_getblk(sb, block_number))) {
+                       bool unlocked = false;
+
                        if (!buffer_uptodate(bh) && reada_count > 1)
-                               search_by_key_reada(sb, reada_bh,
+                               /* may unlock the write lock */
+                               unlocked = search_by_key_reada(sb, reada_bh,
                                                    reada_blocks, reada_count);
+                       /*
+                        * If we haven't already unlocked the write lock,
+                        * then we need to do that here before reading
+                        * the current block
+                        */
+                       if (!buffer_uptodate(bh) && !unlocked) {
+                               reiserfs_write_unlock(sb);
+                               unlocked = true;
+                       }
                        ll_rw_block(READ, 1, &bh);
                        wait_on_buffer(bh);
+
+                       if (unlocked)
+                               reiserfs_write_lock(sb);
                        if (!buffer_uptodate(bh))
                                goto io_error;
                } else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,        /* Key to s
                       !key_in_buffer(search_path, key, sb),
                       "PAP-5130: key is not in the buffer");
 #ifdef CONFIG_REISERFS_CHECK
-               if (cur_tb) {
+               if (REISERFS_SB(sb)->cur_tb) {
                        print_cur_tb("5140");
                        reiserfs_panic(sb, "PAP-5140",
                                       "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
                        reiserfs_free_block(th, inode, block, 1);
                    }
 
+                   reiserfs_write_unlock(sb);
                    cond_resched();
+                   reiserfs_write_lock(sb);
 
                    if (item_moved (&s_ih, path))  {
                        need_re_search = 1;
index f0ad05f380223736cd9a8a121c59d6f1c7ecd7e3..339b0baf2af6ed0a721c3aa3b1f4941f633ec59a 100644 (file)
@@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s)
        struct reiserfs_transaction_handle th;
        th.t_trans_id = 0;
 
-       lock_kernel();
+       reiserfs_write_lock(s);
 
        if (s->s_dirt)
                reiserfs_write_super(s);
@@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s)
 
        reiserfs_proc_info_done(s);
 
+       reiserfs_write_unlock(s);
+       mutex_destroy(&REISERFS_SB(s)->lock);
        kfree(s->s_fs_info);
        s->s_fs_info = NULL;
-
-       unlock_kernel();
 }
 
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +554,28 @@ static void reiserfs_dirty_inode(struct inode *inode)
        struct reiserfs_transaction_handle th;
 
        int err = 0;
+       int lock_depth;
+
        if (inode->i_sb->s_flags & MS_RDONLY) {
                reiserfs_warning(inode->i_sb, "clm-6006",
                                 "writing inode %lu on readonly FS",
                                 inode->i_ino);
                return;
        }
-       reiserfs_write_lock(inode->i_sb);
+       lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
        /* this is really only used for atime updates, so they don't have
         ** to be included in O_SYNC or fsync
         */
        err = journal_begin(&th, inode->i_sb, 1);
-       if (err) {
-               reiserfs_write_unlock(inode->i_sb);
-               return;
-       }
+       if (err)
+               goto out;
+
        reiserfs_update_sd(&th, inode);
        journal_end(&th, inode->i_sb, 1);
-       reiserfs_write_unlock(inode->i_sb);
+
+out:
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 
 #ifdef CONFIG_QUOTA
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
        int i;
+#endif
+
+       reiserfs_write_lock(s);
 
+#ifdef CONFIG_QUOTA
        memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
-       lock_kernel();
        rs = SB_DISK_SUPER_BLOCK(s);
 
        if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
        replace_mount_options(s, new_opts);
-       unlock_kernel();
+       reiserfs_write_unlock(s);
        return 0;
 
 out_err:
        kfree(new_opts);
-       unlock_kernel();
+       reiserfs_write_unlock(s);
        return err;
 }
 
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
        ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+       reiserfs_write_unlock(s);
        wait_on_buffer(SB_BUFFER_WITH_SB(s));
+       reiserfs_write_lock(s);
        if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
                reiserfs_warning(s, "reiserfs-2504", "error reading the super");
                return 1;
@@ -1613,7 +1621,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
        if (!sbi) {
                errval = -ENOMEM;
-               goto error;
+               goto error_alloc;
        }
        s->s_fs_info = sbi;
        /* Set default values for options: non-aggressive tails, RO on errors */
@@ -1627,6 +1635,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        /* setup default block allocator options */
        reiserfs_init_alloc_options(s);
 
+       mutex_init(&REISERFS_SB(s)->lock);
+       REISERFS_SB(s)->lock_depth = -1;
+
+       /*
+        * This function is called with the bkl, which also was the old
+        * locking used here.
+        * do_journal_begin() will soon check if we hold the lock (ie: was the
+        * bkl). This is likely because do_journal_begin() has several another
+        * callers because at this time, it doesn't seem to be necessary to
+        * protect against anything.
+        * Anyway, let's be conservative and lock for now.
+        */
+       reiserfs_write_lock(s);
+
        jdev_name = NULL;
        if (reiserfs_parse_options
            (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,9 +1874,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        init_waitqueue_head(&(sbi->s_wait));
        spin_lock_init(&sbi->bitmap_lock);
 
+       reiserfs_write_unlock(s);
+
        return (0);
 
 error:
+       reiserfs_write_unlock(s);
+error_alloc:
        if (jinit_done) {       /* kill the commit thread, free journal ram */
                journal_release_error(NULL, s);
        }
index 6925b835a43b6f2f94e8a4217180b2e8d1735ab7..58aa8e75f7f5a8de1608dd7df0d6e4cf707d7a54 100644 (file)
@@ -975,7 +975,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
        int err = 0;
 
        /* If we don't have the privroot located yet - go find it */
-       mutex_lock(&s->s_root->d_inode->i_mutex);
+       reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
        dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
@@ -1004,14 +1004,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
                goto error;
 
        if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
-               mutex_lock(&s->s_root->d_inode->i_mutex);
+               reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
                err = create_privroot(REISERFS_SB(s)->priv_root);
                mutex_unlock(&s->s_root->d_inode->i_mutex);
        }
 
        if (privroot->d_inode) {
                s->s_xattr = reiserfs_xattr_handlers;
-               mutex_lock(&privroot->d_inode->i_mutex);
+               reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
                if (!REISERFS_SB(s)->xattr_root) {
                        struct dentry *dentry;
                        dentry = lookup_one_len(XAROOT_NAME, privroot,
index dd31e7bae35cd606943ca4f52bc585c42a9a65f7..a05b4a20768d9026c939f1244ec2897dbdf4ab74 100644 (file)
 #define REISERFS_IOC32_GETVERSION      FS_IOC32_GETVERSION
 #define REISERFS_IOC32_SETVERSION      FS_IOC32_SETVERSION
 
-/* Locking primitives */
-/* Right now we are still falling back to (un)lock_kernel, but eventually that
-   would evolve into real per-fs locks */
-#define reiserfs_write_lock( sb ) lock_kernel()
-#define reiserfs_write_unlock( sb ) unlock_kernel()
+/*
+ * Locking primitives. The write lock is a per superblock
+ * special mutex that has properties close to the Big Kernel Lock
+ * which was used in the previous locking scheme.
+ */
+void reiserfs_write_lock(struct super_block *s);
+void reiserfs_write_unlock(struct super_block *s);
+int reiserfs_write_lock_once(struct super_block *s);
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth);
+
+/*
+ * Several mutexes depend on the write lock.
+ * However sometimes we want to relax the write lock while we hold
+ * these mutexes, according to the release/reacquire on schedule()
+ * properties of the Bkl that were used.
+ * Reiserfs performances and locking were based on this scheme.
+ * Now that the write lock is a mutex and not the bkl anymore, doing so
+ * may result in a deadlock:
+ *
+ * A acquire write_lock
+ * A acquire j_commit_mutex
+ * A release write_lock and wait for something
+ * B acquire write_lock
+ * B can't acquire j_commit_mutex and sleep
+ * A can't acquire write lock anymore
+ * deadlock
+ *
+ * What we do here is avoiding such deadlock by playing the same game
+ * than the Bkl: if we can't acquire a mutex that depends on the write lock,
+ * we release the write lock, wait a bit and then retry.
+ *
+ * The mutexes concerned by this hack are:
+ * - The commit mutex of a journal list
+ * - The flush mutex
+ * - The journal lock
+ * - The inode mutex
+ */
+static inline void reiserfs_mutex_lock_safe(struct mutex *m,
+                              struct super_block *s)
+{
+       reiserfs_write_unlock(s);
+       mutex_lock(m);
+       reiserfs_write_lock(s);
+}
+
+/*
+ * When we schedule, we usually want to also release the write lock,
+ * according to the previous bkl based locking scheme of reiserfs.
+ */
+static inline void reiserfs_cond_resched(struct super_block *s)
+{
+       if (need_resched()) {
+               reiserfs_write_unlock(s);
+               schedule();
+               reiserfs_write_lock(s);
+       }
+}
 
 struct fid;
 
@@ -1329,7 +1381,11 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
 #define get_generation(s) atomic_read (&fs_generation(s))
 #define FILESYSTEM_CHANGED_TB(tb)  (get_generation((tb)->tb_sb) != (tb)->fs_gen)
 #define __fs_changed(gen,s) (gen != get_generation (s))
-#define fs_changed(gen,s) ({cond_resched(); __fs_changed(gen, s);})
+#define fs_changed(gen,s)              \
+({                                     \
+       reiserfs_cond_resched(s);       \
+       __fs_changed(gen, s);           \
+})
 
 /***************************************************************************/
 /*                  FIXATE NODES                                           */
@@ -2258,8 +2314,7 @@ __u32 r5_hash(const signed char *msg, int len);
 #define SPARE_SPACE 500
 
 /* prototypes from ioctl.c */
-int reiserfs_ioctl(struct inode *inode, struct file *filp,
-                  unsigned int cmd, unsigned long arg);
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long reiserfs_compat_ioctl(struct file *filp,
                   unsigned int cmd, unsigned long arg);
 int reiserfs_unpack(struct inode *inode, struct file *filp);
index dab68bbed6757d950afa3987135b5e34cea405ae..52c83b6a758a3133121a9394a955719acee10478 100644 (file)
@@ -7,6 +7,8 @@
 #ifdef __KERNEL__
 #include <linux/workqueue.h>
 #include <linux/rwsem.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
 #endif
 
 typedef enum {
@@ -355,6 +357,13 @@ struct reiserfs_sb_info {
        struct reiserfs_journal *s_journal;     /* pointer to journal information */
        unsigned short s_mount_state;   /* reiserfs state (valid, invalid) */
 
+       /* Serialize writers access, replace the old bkl */
+       struct mutex lock;
+       /* Owner of the lock (can be recursive) */
+       struct task_struct *lock_owner;
+       /* Depth of the lock, start from -1 like the bkl */
+       int lock_depth;
+
        /* Comment? -Hans */
        void (*end_io_handler) (struct buffer_head *, int);
        hashf_t s_hash_function;        /* pointer to function which is used
@@ -408,6 +417,17 @@ struct reiserfs_sb_info {
        char *s_qf_names[MAXQUOTAS];
        int s_jquota_fmt;
 #endif
+#ifdef CONFIG_REISERFS_CHECK
+
+       struct tree_balance *cur_tb;    /*
+                                        * Detects whether more than one
+                                        * copy of tb exists per superblock
+                                        * as a means of checking whether
+                                        * do_balance is executing concurrently
+                                        * against another tree reader/writer
+                                        * on a same mount point.
+                                        */
+#endif
 };
 
 /* Definitions of reiserfs on-disk properties: */