Merge tag 'for-5.12-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

[sfrench/cifs-2.6.git] / fs / btrfs / disk-io.c
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 07a2b4f69b10e31f0f3119828b2fd339d497aa2c..41b718cfea406fc3f302b5a784a04ca1e2a5447e 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
                 return 0;
  
         found_start = btrfs_header_bytenr(eb);
+
+       if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+               WARN_ON(found_start != 0);
+               return 0;
+       }
+
         /*
          * Please do not consolidate these warnings into a single if.
          * It is useful to know what went wrong.
@@ -591,6 +597,59 @@ out:
         return ret;
  }
  
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+                                  int mirror)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+       struct extent_buffer *eb;
+       bool reads_done;
+       int ret = 0;
+
+       /*
+        * Validate one subpage metadata eb from the read endio hook. Bio merge
+        * is not allowed here, so [start, end] must span exactly one nodesize.
+        */
+       ASSERT(end == start + fs_info->nodesize - 1);
+       ASSERT(PagePrivate(page));
+
+       eb = find_extent_buffer(fs_info, start);
+       /*
+        * A tree block under read must already be in the radix tree, so this
+        * lookup must succeed; both exit paths call free_extent_buffer() on it.
+        */
+       ASSERT(eb);
+
+       reads_done = atomic_dec_and_test(&eb->io_pages);
+       /* Subpage read must finish in one page read: io_pages must hit zero */
+       ASSERT(reads_done);
+
+       eb->read_mirror = mirror;
+       if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+               ret = -EIO;
+               goto err;
+       }
+       ret = validate_extent_buffer(eb);
+       if (ret < 0)
+               goto err;
+
+       if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+               btree_readahead_hook(eb, ret);
+
+       set_extent_buffer_uptodate(eb);
+
+       free_extent_buffer(eb);
+       return ret;
+err:
+       /*
+        * end_bio_extent_readpage decrements io_pages in case of error, so
+        * restore the count dropped above to give it something to decrement.
+        */
+       atomic_inc(&eb->io_pages);
+       clear_extent_buffer_uptodate(eb);
+       free_extent_buffer(eb);
+       return ret;
+}
+
  int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
                                    struct page *page, u64 start, u64 end,
                                    int mirror)
@@ -600,6 +659,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
         int reads_done;
  
         ASSERT(page->private);
+
+       if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+               return validate_subpage_buffer(page, start, end, mirror);
+
         eb = (struct extent_buffer *)page->private;
  
         /*
@@ -646,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio)
         fs_info = end_io_wq->info;
         end_io_wq->status = bio->bi_status;
  
-       if (bio_op(bio) == REQ_OP_WRITE) {
+       if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
                 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
                         wq = fs_info->endio_meta_write_workers;
                 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
@@ -808,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
  static int check_async_write(struct btrfs_fs_info *fs_info,
                              struct btrfs_inode *bi)
  {
+       if (btrfs_is_zoned(fs_info))
+               return 0;
         if (atomic_read(&bi->sync_writers))
                 return 0;
         if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
@@ -822,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
         int async = check_async_write(fs_info, BTRFS_I(inode));
         blk_status_t ret;
  
-       if (bio_op(bio) != REQ_OP_WRITE) {
+       if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
                 /*
                  * called for a read, do the setup so that checksum validation
                  * can happen in the async kernel threads
@@ -1016,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
         root->orphan_cleanup_state = 0;
  
         root->last_trans = 0;
-       root->highest_objectid = 0;
+       root->free_objectid = 0;
         root->nr_delalloc_inodes = 0;
         root->nr_ordered_extents = 0;
         root->inode_tree = RB_ROOT;
@@ -1189,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                          struct btrfs_fs_info *fs_info)
  {
         struct btrfs_root *root;
-       struct extent_buffer *leaf;
  
         root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
         if (!root)
@@ -1199,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
         root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
  
+       return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root)
+{
+       struct extent_buffer *leaf;
+
         /*
          * DON'T set SHAREABLE bit for log trees.
          *
@@ -1211,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
  
         leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
                         NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
-       if (IS_ERR(leaf)) {
-               btrfs_put_root(root);
-               return ERR_CAST(leaf);
-       }
+       if (IS_ERR(leaf))
+               return PTR_ERR(leaf);
  
         root->node = leaf;
  
         btrfs_mark_buffer_dirty(root->node);
         btrfs_tree_unlock(root->node);
-       return root;
+
+       return 0;
  }
  
  int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -1231,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
         log_root = alloc_log_tree(trans, fs_info);
         if (IS_ERR(log_root))
                 return PTR_ERR(log_root);
+
+       if (!btrfs_is_zoned(fs_info)) {
+               int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+               if (ret) {
+                       btrfs_put_root(log_root);
+                       return ret;
+               }
+       }
+
         WARN_ON(fs_info->log_root_tree);
         fs_info->log_root_tree = log_root;
         return 0;
@@ -1242,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
         struct btrfs_fs_info *fs_info = root->fs_info;
         struct btrfs_root *log_root;
         struct btrfs_inode_item *inode_item;
+       int ret;
  
         log_root = alloc_log_tree(trans, fs_info);
         if (IS_ERR(log_root))
                 return PTR_ERR(log_root);
  
+       ret = btrfs_alloc_log_tree_node(trans, log_root);
+       if (ret) {
+               btrfs_put_root(log_root);
+               return ret;
+       }
+
         log_root->last_trans = trans->transid;
         log_root->root_key.offset = root->root_key.objectid;
  
@@ -1367,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
         }
  
         mutex_lock(&root->objectid_mutex);
-       ret = btrfs_find_highest_objectid(root,
-                                       &root->highest_objectid);
+       ret = btrfs_init_root_free_objectid(root);
         if (ret) {
                 mutex_unlock(&root->objectid_mutex);
                 goto fail;
         }
  
-       ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+       ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
  
         mutex_unlock(&root->objectid_mutex);
  
@@ -1470,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
  {
         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
         percpu_counter_destroy(&fs_info->delalloc_bytes);
-       percpu_counter_destroy(&fs_info->dio_bytes);
+       percpu_counter_destroy(&fs_info->ordered_bytes);
         percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
         btrfs_free_csum_hash(fs_info);
         btrfs_free_stripe_hash_table(fs_info);
@@ -2427,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info,
                 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
                 ret = -EINVAL;
         }
-       /* Only PAGE SIZE is supported yet */
-       if (sectorsize != PAGE_SIZE) {
+
+       /*
+        * For 4K page size, we only support 4K sector size.
+        * For 64K page size, we support read-write for 64K sector size, and
+        * read-only for 4K sector size.
+        */
+       if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
+           (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
+                                    sectorsize != SZ_64K))) {
                 btrfs_err(fs_info,
-                       "sectorsize %llu not supported yet, only support %lu",
+                       "sectorsize %llu not yet supported for page size %lu",
                         sectorsize, PAGE_SIZE);
                 ret = -EINVAL;
         }
+
         if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
             nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
                 btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
@@ -2646,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
                  * No need to hold btrfs_root::objectid_mutex since the fs
                  * hasn't been fully initialised and we are the only user
                  */
-               ret = btrfs_find_highest_objectid(tree_root,
-                                               &tree_root->highest_objectid);
+               ret = btrfs_init_root_free_objectid(tree_root);
                 if (ret < 0) {
                         handle_error = true;
                         continue;
                 }
  
-               ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+               ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
  
                 ret = btrfs_read_roots(fs_info);
                 if (ret < 0) {
@@ -2695,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
         spin_lock_init(&fs_info->super_lock);
         spin_lock_init(&fs_info->buffer_lock);
         spin_lock_init(&fs_info->unused_bgs_lock);
+       spin_lock_init(&fs_info->treelog_bg_lock);
         rwlock_init(&fs_info->tree_mod_log_lock);
         mutex_init(&fs_info->unused_bg_unpin_mutex);
         mutex_init(&fs_info->delete_unused_bgs_mutex);
         mutex_init(&fs_info->reloc_mutex);
         mutex_init(&fs_info->delalloc_root_mutex);
+       mutex_init(&fs_info->zoned_meta_io_lock);
         seqlock_init(&fs_info->profiles_lock);
  
         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2804,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
         sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
         sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
  
-       ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+       ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
         if (ret)
                 return ret;
  
@@ -3140,8 +3236,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
         if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
                 btrfs_info(fs_info, "has skinny extents");
  
-       fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED);
-
         /*
          * flag our filesystem as having big metadata blocks if
          * they are bigger than the page size
@@ -3194,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
                 goto fail_alloc;
         }
  
+       /* For 4K sector size support, it's only read-only */
+       if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
+               if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+                       btrfs_err(fs_info,
+       "subpage sectorsize %u only supported read-only for page size %lu",
+                               sectorsize, PAGE_SIZE);
+                       err = -EINVAL;
+                       goto fail_alloc;
+               }
+       }
+
         ret = btrfs_init_workqueues(fs_info, fs_devices);
         if (ret) {
                 err = ret;
@@ -3260,6 +3365,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
         if (ret)
                 goto fail_tree_roots;
  
+       /*
+        * Get zone type information of zoned block devices. This will also
+        * handle emulation of a zoned filesystem if a regular device has the
+        * zoned incompat feature flag set.
+        */
+       ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+       if (ret) {
+               btrfs_err(fs_info,
+                         "zoned: failed to read device zone info: %d",
+                         ret);
+               goto fail_block_groups;
+       }
+
         /*
          * If we have a uuid root and we're not being told to rescan we need to
          * check the generation here so we can set the
@@ -4114,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
  
         cancel_work_sync(&fs_info->async_reclaim_work);
         cancel_work_sync(&fs_info->async_data_reclaim_work);
+       cancel_work_sync(&fs_info->preempt_reclaim_work);
  
         /* Cancel or finish ongoing discard work */
         btrfs_discard_cleanup(fs_info);
@@ -4166,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
                        percpu_counter_sum(&fs_info->delalloc_bytes));
         }
  
-       if (percpu_counter_sum(&fs_info->dio_bytes))
+       if (percpu_counter_sum(&fs_info->ordered_bytes))
                 btrfs_info(fs_info, "at unmount dio bytes count %lld",
-                          percpu_counter_sum(&fs_info->dio_bytes));
+                          percpu_counter_sum(&fs_info->ordered_bytes));
  
         btrfs_sysfs_remove_mounted(fs_info);
         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -4689,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                      EXTENT_DIRTY);
         btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
  
+       btrfs_free_redirty_list(cur_trans);
+
         cur_trans->state =TRANS_STATE_COMPLETED;
         wake_up(&cur_trans->commit_wait);
  }
@@ -4746,7 +4867,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
         return 0;
  }
  
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
  {
         struct btrfs_path *path;
         int ret;
@@ -4770,10 +4891,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
                 slot = path->slots[0] - 1;
                 l = path->nodes[0];
                 btrfs_item_key_to_cpu(l, &found_key, slot);
-               *objectid = max_t(u64, found_key.objectid,
-                                 BTRFS_FIRST_FREE_OBJECTID - 1);
+               root->free_objectid = max_t(u64, found_key.objectid + 1,
+                                           BTRFS_FIRST_FREE_OBJECTID);
         } else {
-               *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+               root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
         }
         ret = 0;
  error:
@@ -4781,12 +4902,12 @@ error:
         return ret;
  }
  
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
  {
         int ret;
         mutex_lock(&root->objectid_mutex);
  
-       if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+       if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
                 btrfs_warn(root->fs_info,
                            "the objectid of root %llu reaches its highest value",
                            root->root_key.objectid);
@@ -4794,7 +4915,7 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
                 goto out;
         }
  
-       *objectid = ++root->highest_objectid;
+       *objectid = root->free_objectid++;
         ret = 0;
  out:
         mutex_unlock(&root->objectid_mutex);