Merge tag 'for-5.12-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
[sfrench/cifs-2.6.git] / fs / btrfs / disk-io.c
index 07a2b4f69b10e31f0f3119828b2fd339d497aa2c..41b718cfea406fc3f302b5a784a04ca1e2a5447e 100644 (file)
@@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
                return 0;
 
        found_start = btrfs_header_bytenr(eb);
+
+       if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+               WARN_ON(found_start != 0);
+               return 0;
+       }
+
        /*
         * Please do not consolidate these warnings into a single if.
         * It is useful to know what went wrong.
@@ -591,6 +597,59 @@ out:
        return ret;
 }
 
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+                                  int mirror)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+       struct extent_buffer *eb;
+       bool reads_done;
+       int ret = 0;
+
+       /*
+        * Validate the metadata block covering [start, end] of @page after a
+        * subpage read: look up the extent buffer that starts at @start,
+        * record @mirror in eb->read_mirror, run validate_extent_buffer() on
+        * it and mark it uptodate.
+        *
+        * Returns the (non-negative) result of validate_extent_buffer() on
+        * success, -EIO if EXTENT_BUFFER_READ_ERR is set on the eb, or the
+        * negative value returned by validate_extent_buffer(). On both error
+        * returns the eb has its uptodate state cleared.
+        *
+        * We don't allow bio merge for subpage metadata read, so we should
+        * only get one eb for each endio hook. That is why the range is
+        * asserted to be exactly one nodesize long.
+        */
+       ASSERT(end == start + fs_info->nodesize - 1);
+       ASSERT(PagePrivate(page));
+
+       eb = find_extent_buffer(fs_info, start);
+       /*
+        * When we are reading one tree block, eb must have been inserted into
+        * the radix tree. If not, something is wrong.
+        *
+        * NOTE(review): the free_extent_buffer() calls on both exit paths
+        * presumably drop the reference taken by this lookup - confirm
+        * against find_extent_buffer().
+        */
+       ASSERT(eb);
+
+       reads_done = atomic_dec_and_test(&eb->io_pages);
+       /*
+        * Subpage read must finish in page read, i.e. the decrement above is
+        * expected to bring eb->io_pages down to zero.
+        */
+       ASSERT(reads_done);
+
+       eb->read_mirror = mirror;
+       if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+               ret = -EIO;
+               goto err;
+       }
+       ret = validate_extent_buffer(eb);
+       if (ret < 0)
+               goto err;
+
+       if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+               btree_readahead_hook(eb, ret);
+
+       set_extent_buffer_uptodate(eb);
+
+       free_extent_buffer(eb);
+       return ret;
+err:
+       /*
+        * end_bio_extent_readpage decrements io_pages in case of error,
+        * make sure it has something to decrement. This re-adds the count
+        * that was dropped by atomic_dec_and_test() above.
+        */
+       atomic_inc(&eb->io_pages);
+       clear_extent_buffer_uptodate(eb);
+       free_extent_buffer(eb);
+       return ret;
+}
+
 int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
                                   struct page *page, u64 start, u64 end,
                                   int mirror)
@@ -600,6 +659,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
        int reads_done;
 
        ASSERT(page->private);
+
+       if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+               return validate_subpage_buffer(page, start, end, mirror);
+
        eb = (struct extent_buffer *)page->private;
 
        /*
@@ -646,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio)
        fs_info = end_io_wq->info;
        end_io_wq->status = bio->bi_status;
 
-       if (bio_op(bio) == REQ_OP_WRITE) {
+       if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
                        wq = fs_info->endio_meta_write_workers;
                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
@@ -808,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
 static int check_async_write(struct btrfs_fs_info *fs_info,
                             struct btrfs_inode *bi)
 {
+       if (btrfs_is_zoned(fs_info))
+               return 0;
        if (atomic_read(&bi->sync_writers))
                return 0;
        if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
@@ -822,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
        int async = check_async_write(fs_info, BTRFS_I(inode));
        blk_status_t ret;
 
-       if (bio_op(bio) != REQ_OP_WRITE) {
+       if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
@@ -1016,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        root->orphan_cleanup_state = 0;
 
        root->last_trans = 0;
-       root->highest_objectid = 0;
+       root->free_objectid = 0;
        root->nr_delalloc_inodes = 0;
        root->nr_ordered_extents = 0;
        root->inode_tree = RB_ROOT;
@@ -1189,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
 {
        struct btrfs_root *root;
-       struct extent_buffer *leaf;
 
        root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
        if (!root)
@@ -1199,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
        root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
 
+       return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root)
+{
+       struct extent_buffer *leaf;
+
        /*
         * DON'T set SHAREABLE bit for log trees.
         *
@@ -1211,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 
        leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
                        NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
-       if (IS_ERR(leaf)) {
-               btrfs_put_root(root);
-               return ERR_CAST(leaf);
-       }
+       if (IS_ERR(leaf))
+               return PTR_ERR(leaf);
 
        root->node = leaf;
 
        btrfs_mark_buffer_dirty(root->node);
        btrfs_tree_unlock(root->node);
-       return root;
+
+       return 0;
 }
 
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -1231,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
        log_root = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log_root))
                return PTR_ERR(log_root);
+
+       if (!btrfs_is_zoned(fs_info)) {
+               int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+               if (ret) {
+                       btrfs_put_root(log_root);
+                       return ret;
+               }
+       }
+
        WARN_ON(fs_info->log_root_tree);
        fs_info->log_root_tree = log_root;
        return 0;
@@ -1242,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *log_root;
        struct btrfs_inode_item *inode_item;
+       int ret;
 
        log_root = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log_root))
                return PTR_ERR(log_root);
 
+       ret = btrfs_alloc_log_tree_node(trans, log_root);
+       if (ret) {
+               btrfs_put_root(log_root);
+               return ret;
+       }
+
        log_root->last_trans = trans->transid;
        log_root->root_key.offset = root->root_key.objectid;
 
@@ -1367,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
        }
 
        mutex_lock(&root->objectid_mutex);
-       ret = btrfs_find_highest_objectid(root,
-                                       &root->highest_objectid);
+       ret = btrfs_init_root_free_objectid(root);
        if (ret) {
                mutex_unlock(&root->objectid_mutex);
                goto fail;
        }
 
-       ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+       ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
 
        mutex_unlock(&root->objectid_mutex);
 
@@ -1470,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 {
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
        percpu_counter_destroy(&fs_info->delalloc_bytes);
-       percpu_counter_destroy(&fs_info->dio_bytes);
+       percpu_counter_destroy(&fs_info->ordered_bytes);
        percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
        btrfs_free_csum_hash(fs_info);
        btrfs_free_stripe_hash_table(fs_info);
@@ -2427,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info,
                btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
                ret = -EINVAL;
        }
-       /* Only PAGE SIZE is supported yet */
-       if (sectorsize != PAGE_SIZE) {
+
+       /*
+        * For 4K page size, we only support 4K sector size.
+        * For 64K page size, we support read-write for 64K sector size, and
+        * read-only for 4K sector size.
+        */
+       if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
+           (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
+                                    sectorsize != SZ_64K))) {
                btrfs_err(fs_info,
-                       "sectorsize %llu not supported yet, only support %lu",
+                       "sectorsize %llu not yet supported for page size %lu",
                        sectorsize, PAGE_SIZE);
                ret = -EINVAL;
        }
+
        if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
            nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
                btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
@@ -2646,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
                 * No need to hold btrfs_root::objectid_mutex since the fs
                 * hasn't been fully initialised and we are the only user
                 */
-               ret = btrfs_find_highest_objectid(tree_root,
-                                               &tree_root->highest_objectid);
+               ret = btrfs_init_root_free_objectid(tree_root);
                if (ret < 0) {
                        handle_error = true;
                        continue;
                }
 
-               ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+               ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
 
                ret = btrfs_read_roots(fs_info);
                if (ret < 0) {
@@ -2695,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
        spin_lock_init(&fs_info->super_lock);
        spin_lock_init(&fs_info->buffer_lock);
        spin_lock_init(&fs_info->unused_bgs_lock);
+       spin_lock_init(&fs_info->treelog_bg_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->unused_bg_unpin_mutex);
        mutex_init(&fs_info->delete_unused_bgs_mutex);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
+       mutex_init(&fs_info->zoned_meta_io_lock);
        seqlock_init(&fs_info->profiles_lock);
 
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2804,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
        sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
        sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
 
-       ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+       ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
        if (ret)
                return ret;
 
@@ -3140,8 +3236,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
        if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
                btrfs_info(fs_info, "has skinny extents");
 
-       fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED);
-
        /*
         * flag our filesystem as having big metadata blocks if
         * they are bigger than the page size
@@ -3194,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
                goto fail_alloc;
        }
 
+       /* 4K sector size on 64K page size is only supported read-only */
+       if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
+               if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+                       btrfs_err(fs_info,
+       "subpage sectorsize %u only supported read-only for page size %lu",
+                               sectorsize, PAGE_SIZE);
+                       err = -EINVAL;
+                       goto fail_alloc;
+               }
+       }
+
        ret = btrfs_init_workqueues(fs_info, fs_devices);
        if (ret) {
                err = ret;
@@ -3260,6 +3365,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
        if (ret)
                goto fail_tree_roots;
 
+       /*
+        * Get zone type information of zoned block devices. This will also
+        * handle emulation of a zoned filesystem if a regular device has the
+        * zoned incompat feature flag set.
+        */
+       ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+       if (ret) {
+               btrfs_err(fs_info,
+                         "zoned: failed to read device zone info: %d",
+                         ret);
+               goto fail_block_groups;
+       }
+
        /*
         * If we have a uuid root and we're not being told to rescan we need to
         * check the generation here so we can set the
@@ -4114,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 
        cancel_work_sync(&fs_info->async_reclaim_work);
        cancel_work_sync(&fs_info->async_data_reclaim_work);
+       cancel_work_sync(&fs_info->preempt_reclaim_work);
 
        /* Cancel or finish ongoing discard work */
        btrfs_discard_cleanup(fs_info);
@@ -4166,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
                       percpu_counter_sum(&fs_info->delalloc_bytes));
        }
 
-       if (percpu_counter_sum(&fs_info->dio_bytes))
+       if (percpu_counter_sum(&fs_info->ordered_bytes))
                btrfs_info(fs_info, "at unmount dio bytes count %lld",
-                          percpu_counter_sum(&fs_info->dio_bytes));
+                          percpu_counter_sum(&fs_info->ordered_bytes));
 
        btrfs_sysfs_remove_mounted(fs_info);
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -4689,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                     EXTENT_DIRTY);
        btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
 
+       btrfs_free_redirty_list(cur_trans);
+
        cur_trans->state =TRANS_STATE_COMPLETED;
        wake_up(&cur_trans->commit_wait);
 }
@@ -4746,7 +4867,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
        return 0;
 }
 
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
 {
        struct btrfs_path *path;
        int ret;
@@ -4770,10 +4891,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
                slot = path->slots[0] - 1;
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);
-               *objectid = max_t(u64, found_key.objectid,
-                                 BTRFS_FIRST_FREE_OBJECTID - 1);
+               root->free_objectid = max_t(u64, found_key.objectid + 1,
+                                           BTRFS_FIRST_FREE_OBJECTID);
        } else {
-               *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+               root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
        }
        ret = 0;
 error:
@@ -4781,12 +4902,12 @@ error:
        return ret;
 }
 
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
 {
        int ret;
        mutex_lock(&root->objectid_mutex);
 
-       if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+       if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
                btrfs_warn(root->fs_info,
                           "the objectid of root %llu reaches its highest value",
                           root->root_key.objectid);
@@ -4794,7 +4915,7 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
                goto out;
        }
 
-       *objectid = ++root->highest_objectid;
+       *objectid = root->free_objectid++;
        ret = 0;
 out:
        mutex_unlock(&root->objectid_mutex);